diff --git a/kindle2notion/exporting.py b/kindle2notion/exporting.py index 25bee93..930af65 100644 --- a/kindle2notion/exporting.py +++ b/kindle2notion/exporting.py @@ -16,11 +16,11 @@ def export_to_notion( - all_books: Dict, - enable_highlight_date: bool, - enable_book_cover: bool, - notion_api_auth_token: str, - notion_database_id: str, + all_books: Dict, + enable_highlight_date: bool, + enable_book_cover: bool, + notion_api_auth_token: str, + notion_database_id: str, ) -> None: print("Initiating transfer...\n") @@ -48,7 +48,7 @@ def export_to_notion( def _prepare_aggregated_text_for_one_book( - clippings: List, enable_highlight_date: bool + clippings: List, enable_highlight_date: bool ) -> Tuple[str, str]: # TODO: Special case for books with len(clippings) >= 100 characters. Character limit in a Paragraph block in Notion is 100 formatted_clippings = [] @@ -77,17 +77,17 @@ def _prepare_aggregated_text_for_one_book( def _add_book_to_notion( - title: str, - author: str, - clippings_count: int, - formatted_clippings: list, - last_date: str, - notion_api_auth_token: str, - notion_database_id: str, - enable_book_cover: bool, + title: str, + author: str, + clippings_count: int, + formatted_clippings: list, + last_date_string: str, + notion_api_auth_token: str, + notion_database_id: str, + enable_book_cover: bool, ): notion = notional.connect(auth=notion_api_auth_token) - last_date = datetime.strptime(last_date, "%A, %d %B %Y %I:%M:%S %p") + last_date = __get_last_date_from_string(last_date_string) # Condition variables title_exists = False @@ -174,6 +174,15 @@ def _add_book_to_notion( return message +def __get_last_date_from_string(last_date_string: str) -> datetime: + if not last_date_string: + return datetime.now() + try: + return datetime.strptime(last_date_string, "%A, %d %B %Y %I:%M:%S %p") + except ValueError: + # Datetime format is not English, retrying with non AM-PM format + return datetime.strptime(last_date_string, "%A, %d %B %Y %H:%M:%S") + # def _create_rich_text_object(text): # if "Note: " in text: # # Bold text diff --git a/kindle2notion/languages/__init__.py b/kindle2notion/languages/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/kindle2notion/languages/enums.py b/kindle2notion/languages/enums.py new file mode 100644 index 0000000..f99737c --- /dev/null +++ b/kindle2notion/languages/enums.py @@ -0,0 +1,38 @@ +from enum import Enum + + +class Locale(Enum): + # Enum containing languages + ENGLISH = "en" + SPANISH = "es" + + def __str__(self): + return self.value + + +class Word(Enum): + # For each word, we have to handle different languages + NOTE = { + Locale.ENGLISH: "note", + Locale.SPANISH: "nota" + } + LOCATION = { + Locale.ENGLISH: "location", + Locale.SPANISH: "posición", + } + PAGE = { + Locale.ENGLISH: "page", + Locale.SPANISH: "página", + } + DATE_ADDED = { + Locale.ENGLISH: "added on", + Locale.SPANISH: "añadido el", + } + # Date formats also depend on language + DATE_FORMAT = { + Locale.ENGLISH: "%A, %d %B %Y %I:%M:%S %p", + Locale.SPANISH: "%A, %d %B %Y %H:%M:%S", + } + + def __str__(self, language=Locale.ENGLISH): + return self.value[language] diff --git a/kindle2notion/languages/word_detector.py b/kindle2notion/languages/word_detector.py new file mode 100644 index 0000000..2b38126 --- /dev/null +++ b/kindle2notion/languages/word_detector.py @@ -0,0 +1,24 @@ +from typing import List + +from kindle2notion.languages.enums import Word, Locale + + +class WordDetector: + + def __init__(self, languages: List[Locale]): + self.languages = languages + self.language_words = {lang: set() for lang in languages} + + for word in Word: + for lang in word.value: + self.language_words[lang].add(word.value[lang]) + + def detect(self, text): + scores = {lang: 0 for lang in self.languages} + for lang, words in self.language_words.items(): + scores[lang] = sum([len(word) for word in words if self.has_word(text, word)]) + return max(scores, key=scores.get) + + def has_word(self, text, word): + return word.lower() in text.lower() + diff --git a/kindle2notion/parsing.py b/kindle2notion/parsing.py index dea4435..12a850d 100644 --- a/kindle2notion/parsing.py +++ b/kindle2notion/parsing.py @@ -1,8 +1,12 @@ +from datetime import datetime from re import findall from typing import Dict, List, Tuple from dateparser import parse +from kindle2notion.languages.word_detector import WordDetector +from kindle2notion.languages.enums import Locale, Word + BOOKS_WO_AUTHORS = [] ACADEMIC_TITLES = [ @@ -80,6 +84,8 @@ DELIMITERS = ["; ", " & ", " and "] +WORD_DETECTOR = WordDetector([language for language in Locale]) + def parse_raw_clippings_text(raw_clippings_text: str) -> Dict: raw_clippings_list = raw_clippings_text.split("==========") @@ -126,26 +132,34 @@ def _parse_page_location_date_and_note( second_line_as_list = second_line.strip().split(" | ") page = location = date = "" is_note = False - for element in second_line_as_list: element = element.lower() - if "note" in element: + language: Locale = WORD_DETECTOR.detect(element) + if Word.NOTE.value[language] in element: is_note = True - if "page" in element: - page = element[element.find("page") :].replace("page", "").strip() - if "location" in element: - location = ( - element[element.find("location") :].replace("location", "").strip() - ) - if "added on" in element: - date = parse( - element[element.find("added on") :].replace("added on", "").strip() + if is_word_in_element(element, language, Word.PAGE): + page = _parse_word_from_element(element, language, Word.PAGE) + if is_word_in_element(element, language, Word.LOCATION): + location = _parse_word_from_element(element, language, Word.LOCATION) + if is_word_in_element(element, language, Word.DATE_ADDED): + date_string = _parse_word_from_element(element, language, Word.DATE_ADDED) + date_parsed: datetime = parse( + date_string, languages=[language.value for language in Locale] ) - date = date.strftime("%A, %d %B %Y %I:%M:%S %p") + date = date_parsed.strftime(Word.DATE_FORMAT.value[language]) return page, location, date, is_note +def is_word_in_element(element: str, language: Locale, word: Word): + return word.value[language] in element + + +def _parse_word_from_element(element: str, language: Locale, word: Word): + word_value_in_language = word.value[language] + return element[element.find(word_value_in_language):].replace(word_value_in_language, "").strip() + + def _add_parsed_items_to_all_books_dict( all_books: Dict, title: str, diff --git a/setup.py b/setup.py index 22a4545..d511524 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setup( name="kindle2notion", - version="1.0.1", + version="1.0.2", author="Jeffrey Jacob", author_email="jeffreysamjacob@gmail.com", description="Export all the clippings from your Kindle device to a database in Notion.", diff --git a/tests/test_exporting.py b/tests/test_exporting.py index 6b4a995..c190dc5 100644 --- a/tests/test_exporting.py +++ b/tests/test_exporting.py @@ -71,3 +71,43 @@ def test_prepare_aggregated_text_for_one_book_should_return_the_aggregated_text_ print(actual) # Then assert expected == actual + + +def test_when_date_is_not_ampm_format_then_aggregated_text_should_return_appropiate_date(): + # Given + highlights = [ + ( + "This is an example highlight.", + "1", + "100", + "jueves, 24 de agosto de 2023 7:28:38", + False, + ), + ( + "This is a second example highlight.", + "2", + "200", + "viernes, 25 de agosto de 2023 7:28:38", + True, + ), + ] + + expected = ( + [ + "This is an example highlight.\n* Page: 1, Location: 100, Date Added: jueves, 24 de agosto de 2023 7:28:38\n\n", + "> NOTE: \nThis is a second example highlight.\n* Page: 2, Location: 200, Date Added: viernes, 25 de agosto de 2023 7:28:38\n\n", + ], + "viernes, 25 de agosto de 2023 7:28:38", + ) + + # When + actual = _prepare_aggregated_text_for_one_book( + highlights, enable_highlight_date=True + ) + print(actual) + # Then + assert expected == actual + + +def test_when_date_is_not_ampm_format_then_aggregated_text_should_not_give_valueerror(): + pass \ No newline at end of file diff --git a/tests/test_parsing.py b/tests/test_parsing.py index 26e0be0..536d3ec 100644 --- a/tests/test_parsing.py +++ b/tests/test_parsing.py @@ -333,3 +333,20 @@ def test_add_parsed_items_to_books_dict_should_add_the_parsed_items_when_the_boo # Then assert expected == actual + +def test_parse_date_when_format_does_not_include_am_pm(): + # Given + raw_clipping_list = [ + "Relativity (Einstein, Albert)", + "- La subrayado en la posición 558-560 | Añadido el viernes, 25 de agosto de 2023 7:28:38", + "", + "This is a test highlight.", + False, + ] + expected = ("3", "", "Friday, 30 April 2021 12:31:29 AM", False) + + # When + actual = _parse_page_location_date_and_note(raw_clipping_list) + + # Then + assert expected == actual