From cbb8c45eb9c049a561baa47eda2a770b948c04f2 Mon Sep 17 00:00:00 2001 From: Michael Currin Date: Wed, 23 Oct 2019 11:27:16 +0200 Subject: [PATCH 01/14] feat: Add WIP work --- docs/browser_onetab_extraction.md | 9 ++++----- url_manager/extract_onetab_storage.py | 16 +++++++++++----- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/docs/browser_onetab_extraction.md b/docs/browser_onetab_extraction.md index 919369b..2f412f5 100644 --- a/docs/browser_onetab_extraction.md +++ b/docs/browser_onetab_extraction.md @@ -23,12 +23,11 @@ The approach below parses the `storage.json` and gets the value of 'state' field 1. Open Firefox. 2. Go to the [about:profiles](about:profiles) page. This will show you your Firefox users. 3. Choose the profile you want, look at the paths and copy the username from one. e.g. `abcd1234.default` -4. Follow the commands below to enter the browser and username and save the output. An example is shown below. +4. Follow the commands below to enter the browser and username and save the output. An example is shown below. Set your username as the second argument. ```bash - $ # Use the full path to the raw directory and then provide a suitable name for the file. - $ OUTPUT=~/PATH/TO/REPO/url_manager/var/lib/raw/onetab_firefox_abc_personal.json - $ # Set your username as the second argument. - $ ./extract_onetab_storage.py Firefox abcd1234.default > $OUTPUT + # TODO: Make this part of script + # See also debug + $ ./extract_onetab_storage.py Firefox abcd1234.default > var/lib/raw/onetab_firefox_abc_personal.json ``` 5. Go back to step 3 and repeat for other profiles as desired. diff --git a/url_manager/extract_onetab_storage.py b/url_manager/extract_onetab_storage.py index 7de9623..f186303 100755 --- a/url_manager/extract_onetab_storage.py +++ b/url_manager/extract_onetab_storage.py @@ -3,7 +3,9 @@ OneTab storage extractor. 
Parse OneTab data stored in Firefox's JSON file or Chrome's LevelDB -storage, then pretty print the OneTab user data as JSON. +storage, then pretty print the OneTab user data as JSON. Data is cleaned as +part of parsing but no restructuring is done. + See the docs/browsers_onetab_extraction.md file for instructions. """ @@ -74,10 +76,13 @@ def parse_leveldb_bytes(data_bytes): # Convert double backlash to single. This handles cases like '\\"' => '\"'. data_str = raw_str.replace('\\\\', '\\') - # Edgecase handled by inspection on a title about union and intersection, - data_str = data_str.replace('(*")', '(&)') - data_str = data_str.replace('()")', '(|)') - + # Edgecase - a title with union and intersection. + # Repeat 3? why doesn't it pick up later? + # TODO TEST MORE. chrome and FF + data_str = data_str.replace('\xe2\x88\xaa', '!!!') + # TODO TEST MORE. chrome and FF + data_str = data_str.replace('()")', '(∩)') + # http://www.personal.psu.edu/ejp10/blogs/gotunicode/2007/09/inserting-the-union-and-inters-1.html # Unescape single quote. data_str = data_str.replace(r"\'", r"'") @@ -101,6 +106,7 @@ def parse_leveldb_bytes(data_bytes): try: return json.loads(data_str) except json.JSONDecodeError as e: + # Run everytime? print(f"{type(e).__name__}: {str(e)}") var_dir = conf.get('text_files', 'debug') From 7e8b4e531693fd144f103a30cd09baf381e0f729 Mon Sep 17 00:00:00 2001 From: Michael Currin Date: Sun, 2 Feb 2020 10:53:28 +0200 Subject: [PATCH 02/14] docs: Add comment --- docs/browser_onetab_extraction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/browser_onetab_extraction.md b/docs/browser_onetab_extraction.md index 2f412f5..9775b69 100644 --- a/docs/browser_onetab_extraction.md +++ b/docs/browser_onetab_extraction.md @@ -25,7 +25,7 @@ The approach below parses the `storage.json` and gets the value of 'state' field 3. Choose the profile you want, look at the paths and copy the username from one. e.g. `abcd1234.default` 4. 
Follow the commands below to enter the browser and username and save the output. An example is shown below. Set your username as the second argument. ```bash - # TODO: Make this part of script + # TODO: Make this part of script and config # See also debug $ ./extract_onetab_storage.py Firefox abcd1234.default > var/lib/raw/onetab_firefox_abc_personal.json ``` From 51a5bc7c492717e1ab449fe9db652c961f7b7bfe Mon Sep 17 00:00:00 2001 From: Michael Currin Date: Sun, 2 Feb 2020 10:56:40 +0200 Subject: [PATCH 03/14] docs: Update URL --- docs/browser_onetab_extraction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/browser_onetab_extraction.md b/docs/browser_onetab_extraction.md index 9775b69..ca9863a 100644 --- a/docs/browser_onetab_extraction.md +++ b/docs/browser_onetab_extraction.md @@ -21,7 +21,7 @@ Find the location of OneTab data for your Firefox user accounts and make it avai The approach below parses the `storage.json` and gets the value of 'state' field inside it. 1. Open Firefox. -2. Go to the [about:profiles](about:profiles) page. This will show you your Firefox users. +2. Go to the _about:profiles_ page. This will show you your Firefox users. 3. Choose the profile you want, look at the paths and copy the username from one. e.g. `abcd1234.default` 4. Follow the commands below to enter the browser and username and save the output. An example is shown below. Set your username as the second argument. 
```bash From d0c908b46ae91b9c929a8c813ae11e59d9d2aab4 Mon Sep 17 00:00:00 2001 From: Michael Currin Date: Sun, 2 Feb 2020 10:58:21 +0200 Subject: [PATCH 04/14] docs: Update doc codeblock --- docs/browser_onetab_extraction.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/browser_onetab_extraction.md b/docs/browser_onetab_extraction.md index ca9863a..73439f7 100644 --- a/docs/browser_onetab_extraction.md +++ b/docs/browser_onetab_extraction.md @@ -82,6 +82,8 @@ This approach was initially created as a manual step which can be ignored if the > path/to/repo/url_manager/var/lib/raw/chrome_onetab_mycompany_personal.json $ # You can view the file if you want. $ view path/to/repo/url_manager/var/lib/raw/chrome_onetab__mycompany_personal.json + ``` + ```json { "tabGroups": [ { @@ -92,9 +94,9 @@ This approach was initially created as a manual step which can be ignored if the "title": "...", "url": "https://..." }, - ... + // ... ] - ... + // ... } ] } From 8a58eddad54c1872d491da4441b38edb82cefced Mon Sep 17 00:00:00 2001 From: Michael Currin Date: Sun, 2 Feb 2020 11:00:30 +0200 Subject: [PATCH 05/14] docs: Update doc codeblock --- docs/browser_onetab_extraction.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/browser_onetab_extraction.md b/docs/browser_onetab_extraction.md index 73439f7..817b46d 100644 --- a/docs/browser_onetab_extraction.md +++ b/docs/browser_onetab_extraction.md @@ -25,6 +25,8 @@ The approach below parses the `storage.json` and gets the value of 'state' field 3. Choose the profile you want, look at the paths and copy the username from one. e.g. `abcd1234.default` 4. Follow the commands below to enter the browser and username and save the output. An example is shown below. Set your username as the second argument. 
```bash + $ cd url_manager + $ source venv/bin/activate # TODO: Make this part of script and config # See also debug $ ./extract_onetab_storage.py Firefox abcd1234.default > var/lib/raw/onetab_firefox_abc_personal.json From 394f378f5b535077346e1dab24d8942b1e232d9c Mon Sep 17 00:00:00 2001 From: Michael Currin Date: Sun, 2 Feb 2020 11:03:17 +0200 Subject: [PATCH 06/14] docs: Add comment --- url_manager/extract_onetab_storage.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/url_manager/extract_onetab_storage.py b/url_manager/extract_onetab_storage.py index f186303..364cf09 100755 --- a/url_manager/extract_onetab_storage.py +++ b/url_manager/extract_onetab_storage.py @@ -25,6 +25,8 @@ # Path to OneTab data within a directory for a browser user. This # is for both Linux and Mac. +# FIXME: This does not work after March 2019. There is a storage.js.migrated +# file and the new one is somewhere else. FIREFOX_ONETAB = "browser-extension-data/extension@one-tab.com/storage.js" CHROME_ONETAB = "Local Storage/leveldb" From 4b6ca75b9a1a8a7abe76c3ff9d1b224b70779732 Mon Sep 17 00:00:00 2001 From: Michael Currin Date: Sun, 2 Feb 2020 17:22:59 +0200 Subject: [PATCH 07/14] docs: Add comment --- docs/browser_onetab_extraction.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/browser_onetab_extraction.md b/docs/browser_onetab_extraction.md index 817b46d..b6101fd 100644 --- a/docs/browser_onetab_extraction.md +++ b/docs/browser_onetab_extraction.md @@ -29,10 +29,13 @@ The approach below parses the `storage.json` and gets the value of 'state' field $ source venv/bin/activate # TODO: Make this part of script and config # See also debug + # NB. This no longer works due to the Firefox OneTab migration. $ ./extract_onetab_storage.py Firefox abcd1234.default > var/lib/raw/onetab_firefox_abc_personal.json ``` 5. Go back to step 3 and repeat for other profiles as desired. 
+Resources: +- [Profiles - Where Firefox stores your bookmarks, passwords and other user data](https://support.mozilla.org/en-US/kb/profiles-where-firefox-stores-user-data) ## Chrome From b6f256dfb4aa3a5fe08537a40ddde4bdfd7259e2 Mon Sep 17 00:00:00 2001 From: Michael Currin <18750745+MichaelCurrin@users.noreply.github.com> Date: Thu, 17 Jun 2021 11:43:40 +0200 Subject: [PATCH 08/14] Update docs/browser_onetab_extraction.md --- docs/browser_onetab_extraction.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/browser_onetab_extraction.md b/docs/browser_onetab_extraction.md index b6101fd..282f570 100644 --- a/docs/browser_onetab_extraction.md +++ b/docs/browser_onetab_extraction.md @@ -35,6 +35,7 @@ The approach below parses the `storage.json` and gets the value of 'state' field 5. Go back to step 3 and repeat for other profiles as desired. Resources: + - [Profiles - Where Firefox stores your bookmarks, passwords and other user data](https://support.mozilla.org/en-US/kb/profiles-where-firefox-stores-user-data) ## Chrome From 6b67bfa460b1dc2f028144aea445599b86cf1ab0 Mon Sep 17 00:00:00 2001 From: Michael Currin <18750745+MichaelCurrin@users.noreply.github.com> Date: Thu, 17 Jun 2021 11:45:00 +0200 Subject: [PATCH 09/14] Update docs/browser_onetab_extraction.md --- docs/browser_onetab_extraction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/browser_onetab_extraction.md b/docs/browser_onetab_extraction.md index 282f570..4e8195e 100644 --- a/docs/browser_onetab_extraction.md +++ b/docs/browser_onetab_extraction.md @@ -29,7 +29,7 @@ The approach below parses the `storage.json` and gets the value of 'state' field $ source venv/bin/activate # TODO: Make this part of script and config # See also debug - # NB. This no longer works due to the Firefox OneTab migration. + # FIXME. Note this no longer works due to the Firefox OneTab migration. 
$ ./extract_onetab_storage.py Firefox abcd1234.default > var/lib/raw/onetab_firefox_abc_personal.json ``` 5. Go back to step 3 and repeat for other profiles as desired. From e6cf50209350bf64fd8107c2d8d4e46ed8524f7a Mon Sep 17 00:00:00 2001 From: Michael Currin <18750745+MichaelCurrin@users.noreply.github.com> Date: Thu, 17 Jun 2021 11:49:41 +0200 Subject: [PATCH 10/14] Apply suggestions from code review --- docs/browser_onetab_extraction.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/browser_onetab_extraction.md b/docs/browser_onetab_extraction.md index 4e8195e..14d859f 100644 --- a/docs/browser_onetab_extraction.md +++ b/docs/browser_onetab_extraction.md @@ -27,8 +27,6 @@ The approach below parses the `storage.json` and gets the value of 'state' field ```bash $ cd url_manager $ source venv/bin/activate - # TODO: Make this part of script and config - # See also debug # FIXME. Note this no longer works due to the Firefox OneTab migration. $ ./extract_onetab_storage.py Firefox abcd1234.default > var/lib/raw/onetab_firefox_abc_personal.json ``` From 5dbb17585d3a4ad323d231c94b707ce1381edf25 Mon Sep 17 00:00:00 2001 From: Michael Currin <18750745+MichaelCurrin@users.noreply.github.com> Date: Thu, 17 Jun 2021 11:52:58 +0200 Subject: [PATCH 11/14] Apply suggestions from code review --- docs/browser_onetab_extraction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/browser_onetab_extraction.md b/docs/browser_onetab_extraction.md index 14d859f..774b1ef 100644 --- a/docs/browser_onetab_extraction.md +++ b/docs/browser_onetab_extraction.md @@ -87,7 +87,7 @@ This approach was initially created as a manual step which can be ignored if the $ # You can view the file if you want. 
$ view path/to/repo/url_manager/var/lib/raw/chrome_onetab__mycompany_personal.json ``` - ```json + ```json5 { "tabGroups": [ { From 3d9861a7a46f07a59ec3111e7ad9d9469da81306 Mon Sep 17 00:00:00 2001 From: Michael Currin <18750745+MichaelCurrin@users.noreply.github.com> Date: Thu, 17 Jun 2021 11:59:13 +0200 Subject: [PATCH 12/14] Update browser_onetab_extraction.md --- docs/browser_onetab_extraction.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/browser_onetab_extraction.md b/docs/browser_onetab_extraction.md index 774b1ef..879831f 100644 --- a/docs/browser_onetab_extraction.md +++ b/docs/browser_onetab_extraction.md @@ -14,6 +14,13 @@ https://anothersite.com | Another title in a new section but with no section hea However, the result is not in a JSON structure and also omits metadata like custom titles and times. Therefore this project's own data export process is preferred. +## Note + +The data storage formats change over time, there are binary characters to handle, and special characters can break the parsing. You might therefore be better off parsing the saved HTML page using Node or Python, or using the plain text output if the headings aren't important. + +An approach based on the frontend output is also much easier to reproduce across Chrome and Firefox with one script. + + ## Firefox Find the location of OneTab data for your Firefox user accounts and make it available in the project. Note that this has only been tested for Firefox and not Firefox Quantum. @@ -21,7 +28,7 @@ Find the location of OneTab data for your Firefox user accounts and make it avai The approach below parses the `storage.json` and gets the value of 'state' field inside it. 1. Open Firefox. -2. Go to the _about:profiles_ page. This will show you your Firefox users. +2. Go to the `about:profiles` page. This will show you your Firefox users. 3. Choose the profile you want, look at the paths and copy the username from one. e.g. `abcd1234.default` 4.
Follow the commands below to enter the browser and username and save the output. An example is shown below. Set your username as the second argument. ```bash @@ -36,6 +43,7 @@ Resources: - [Profiles - Where Firefox stores your bookmarks, passwords and other user data](https://support.mozilla.org/en-US/kb/profiles-where-firefox-stores-user-data) + ## Chrome This section is applicable for both Chrome and Chromium browsers. The two may both exist on the same system and both may be imported into the URL Manager application. @@ -54,7 +62,7 @@ The approach below reads the OneTab extension data from Chrome's LevelDB storage $ # Use the full path to the raw directory and then provide a suitable name for the file. $ OUTPUT=~/PATH/TO/REPO/url_manager/var/lib/raw/onetab_chrome_abc_personal.json $ # Set your desired browser and display name as arguments. For example: - $ ./extract_onetab_storage.py Chrome 'Profile 1' > $OUTPUT + $ ./extract_onetab_storage.py Chrome 'Profile 1' > "$OUTPUT" ``` 4. Go back to step 2 and repeat for other browser and profile pairs as desired. From 192a1f69814a455fb9ebec2041fb67d889ff7d67 Mon Sep 17 00:00:00 2001 From: Michael Currin Date: Thu, 17 Jun 2021 12:17:59 +0200 Subject: [PATCH 13/14] style: update extract_onetab_storage.py --- url_manager/extract_onetab_storage.py | 53 ++++++++++++++------------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/url_manager/extract_onetab_storage.py b/url_manager/extract_onetab_storage.py index 364cf09..fad1c9d 100755 --- a/url_manager/extract_onetab_storage.py +++ b/url_manager/extract_onetab_storage.py @@ -11,8 +11,8 @@ """ import argparse import json -import re import os +import re import sys import plyvel @@ -20,7 +20,6 @@ from lib import BROWSER_PROFILE_DIRS from lib.config import AppConf - conf = AppConf() # Path to OneTab data within a directory for a browser user. 
This @@ -32,7 +31,9 @@ # The plyvel docs recommend referencing LevelDB keys in the # binary form (which is how they are stored). -LEVELDB_ONETAB_KEY = b'_chrome-extension://chphlpgkkbolifaimnlloiipkdnihall\x00\x01state' +LEVELDB_ONETAB_KEY = ( + b"_chrome-extension://chphlpgkkbolifaimnlloiipkdnihall\x00\x01state" +) def parse_leveldb_bytes(data_bytes): @@ -76,14 +77,14 @@ def parse_leveldb_bytes(data_bytes): raw_str = str(data_bytes)[2:-1] # Convert double backlash to single. This handles cases like '\\"' => '\"'. - data_str = raw_str.replace('\\\\', '\\') + data_str = raw_str.replace("\\\\", "\\") # Edgecase - a title with union and intersection. # Repeat 3? why doesn't it pick up later? # TODO TEST MORE. chrome and FF - data_str = data_str.replace('\xe2\x88\xaa', '!!!') + data_str = data_str.replace("\xe2\x88\xaa", "!!!") # TODO TEST MORE. chrome and FF - data_str = data_str.replace('()")', '(∩)') + data_str = data_str.replace('()")', "(∩)") # http://www.personal.psu.edu/ejp10/blogs/gotunicode/2007/09/inserting-the-union-and-inters-1.html # Unescape single quote. data_str = data_str.replace(r"\'", r"'") @@ -98,7 +99,7 @@ def parse_leveldb_bytes(data_bytes): # so in this limited solution we replace that double quote only and not # the double quotes which are functional, since there is no other way for # now. - data_str = data_str.replace(' " ', ' ⍰ ') + data_str = data_str.replace(' " ', " ⍰ ") # Remove any characters which still look like bytes. data_str = re.sub(r"\\x\w\w", "⍰", data_str) @@ -111,16 +112,18 @@ def parse_leveldb_bytes(data_bytes): # Run everytime? 
print(f"{type(e).__name__}: {str(e)}") - var_dir = conf.get('text_files', 'debug') - raw_path = os.path.join(var_dir, 'leveldb_onetab_raw.json') - cleaned_path = os.path.join(var_dir, 'leveldb_onetab_cleaned.json') - with open(raw_path, 'w') as f_out: + var_dir = conf.get("text_files", "debug") + raw_path = os.path.join(var_dir, "leveldb_onetab_raw.json") + cleaned_path = os.path.join(var_dir, "leveldb_onetab_cleaned.json") + with open(raw_path, "w") as f_out: f_out.writelines(raw_str) - with open(cleaned_path, 'w') as f_out: + with open(cleaned_path, "w") as f_out: f_out.writelines(data_str) print(f"Wrote raw data to: {raw_path}") - print(f"Wrote cleaned data containing JSON formatting error to:" - f" {cleaned_path}") + print( + f"Wrote cleaned data containing JSON formatting error to:" + f" {cleaned_path}" + ) sys.exit(1) @@ -161,9 +164,12 @@ def read_storage(browser, username): username. """ browser_profile_dir = BROWSER_PROFILE_DIRS[browser] - is_chrome_like = (browser.startswith('Chrom')) - in_path = os.path.join(browser_profile_dir, username, - CHROME_ONETAB if is_chrome_like else FIREFOX_ONETAB) + is_chrome_like = browser.startswith("Chrom") + in_path = os.path.join( + browser_profile_dir, + username, + CHROME_ONETAB if is_chrome_like else FIREFOX_ONETAB, + ) if is_chrome_like: db = plyvel.DB(in_path) @@ -173,7 +179,7 @@ def read_storage(browser, username): with open(in_path) as f_in: raw_data = json.load(f_in) # The value within the JSON is a plain string and also needs parsing. - data = json.loads(raw_data['state']) + data = json.loads(raw_data["state"]) return data @@ -184,14 +190,11 @@ def main(): """ parser = argparse.ArgumentParser("OneTab storage extractor") + parser.add_argument("BROWSER", choices=sorted(BROWSER_PROFILE_DIRS.keys())) parser.add_argument( - 'BROWSER', - choices=sorted(BROWSER_PROFILE_DIRS.keys()) - ) - parser.add_argument( - 'USERNAME', + "USERNAME", help="You browser account username. e.g. 
'Default' or 'Profile 1' for" - " Chrome or 'abcdef.default' for Firefox. See browser_onetab_extraction.md in docs." + " Chrome or 'abcdef.default' for Firefox. See browser_onetab_extraction.md in docs.", ) args = parser.parse_args() @@ -200,5 +203,5 @@ def main(): print(json.dumps(data, indent=4)) -if __name__ == '__main__': +if __name__ == "__main__": main() From e13f629c8597548727690d14e9712d980f63f895 Mon Sep 17 00:00:00 2001 From: Michael Currin Date: Thu, 17 Jun 2021 12:23:02 +0200 Subject: [PATCH 14/14] update extract_onetab_storage.py --- url_manager/extract_onetab_storage.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/url_manager/extract_onetab_storage.py b/url_manager/extract_onetab_storage.py index 2504c12..4b0226c 100755 --- a/url_manager/extract_onetab_storage.py +++ b/url_manager/extract_onetab_storage.py @@ -82,11 +82,14 @@ def parse_leveldb_bytes(data_bytes): data_str = raw_str.replace("\\\\", "\\") ### + + # TESTING + # Repeat 3? why doesn't it pick up later? # TODO TEST MORE. chrome and FF data_str = data_str.replace("\xe2\x88\xaa", "!!!") # TODO TEST MORE. chrome and FF - data_str = data_str.replace('()")', "(∩)") + data_str = data_str.replace('()")', "(@@@)") # http://www.personal.psu.edu/ejp10/blogs/gotunicode/2007/09/inserting-the-union-and-inters-1.html ###