diff --git a/url_manager/extract_onetab_storage.py b/url_manager/extract_onetab_storage.py index ab05742..4b0226c 100755 --- a/url_manager/extract_onetab_storage.py +++ b/url_manager/extract_onetab_storage.py @@ -3,7 +3,9 @@ OneTab storage extractor. Parse OneTab data stored in Firefox's JSON file or Chrome's LevelDB -storage, then pretty print the OneTab user data as JSON. +storage, then pretty print the OneTab user data as JSON. Data is cleaned as +part of parsing but no restructuring is done. + See the docs/browsers_onetab_extraction.md file for instructions. """ @@ -22,6 +24,8 @@ # Path to OneTab data within a directory for a browser user. This # is for both Linux and Mac. +# FIXME: This does not work after March 2019. There is a storage.js.migrated +# file and the new one is somewhere else. FIREFOX_ONETAB = "browser-extension-data/extension@one-tab.com/storage.js" CHROME_ONETAB = "Local Storage/leveldb" @@ -77,7 +81,21 @@ def parse_leveldb_bytes(data_bytes): # Convert double backlash to single. This handles cases like '\\"' => '\"'. data_str = raw_str.replace("\\\\", "\\") - # Edgecase handled by inspection on a title about union and intersection, + ### + + # TESTING + + # Repeat 3? Why doesn't it pick up later? + # TODO TEST MORE. Chrome and FF + data_str = data_str.replace("\xe2\x88\xaa", "!!!") + # TODO TEST MORE. Chrome and FF + data_str = data_str.replace('()")', "(@@@)") + # http://www.personal.psu.edu/ejp10/blogs/gotunicode/2007/09/inserting-the-union-and-inters-1.html + + ### + + # Edge case handled by inspection on a title about union and intersection. + data_str = data_str.replace('(*")', "(&)") data_str = data_str.replace('()")', "(|)") @@ -104,6 +122,7 @@ def parse_leveldb_bytes(data_bytes): try: return json.loads(data_str) except json.JSONDecodeError as e: + # Run every time? print(f"{type(e).__name__}: {str(e)}") var_dir = conf.get("text_files", "debug")