diff --git a/README.md b/README.md index 9bbb0ca..15dbfcf 100644 --- a/README.md +++ b/README.md @@ -240,6 +240,8 @@ If you run into any issues, consult the logs or reach out on the repository's [I --- # Changelog +- v0.7615 - Parsing improvements + - Improved text formatting & escaping in complex markdown vs. html cases - v0.7614 - Better stock market data fetching from Yahoo Finance - Changes made to `src/api_get_stock_prices_yfinance.py` - => More accurate ticker symbol searches, fallbacks, multi-day data etc. diff --git a/config/config.ini b/config/config.ini index df0e838..0a29864 100644 --- a/config/config.ini +++ b/config/config.ini @@ -125,7 +125,8 @@ Enabled = True # The preferred, more capable model to use by default (e.g., gpt-4o, gpt-4.5-preview). # This model will be used until its daily token limit (PremiumTokenLimit) is reached. -PremiumModel = gpt-4o +# PremiumModel = gpt-4o +PremiumModel = gpt-4.1 # The cheaper model to switch to when the PremiumTokenLimit is reached (e.g., gpt-4o-mini). # This model has its own daily token limit (MiniTokenLimit). 
diff --git a/src/main.py b/src/main.py index 8047c4f..c4a43ab 100755 --- a/src/main.py +++ b/src/main.py @@ -6,7 +6,7 @@ # https://github.com/FlyingFathead/TelegramBot-OpenAI-API # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # version of this program -version_number = "0.7614" +version_number = "0.7615" # Add the project root directory to Python's path import sys diff --git a/src/modules.py b/src/modules.py index 33a0606..4d53271 100644 --- a/src/modules.py +++ b/src/modules.py @@ -106,47 +106,113 @@ def preserve_html_and_escape_text(text): escaped_text += html.escape(text[last_end:]) return escaped_text -# markdown to html parsing (v0.737.2) +# v0.7615 def markdown_to_html(text): - try: - # Handle the code blocks with optional language specification first - def replace_codeblock(match): - codeblock = match.group(2) # Get the actual code inside the block - language = match.group(1) # Get the language identifier - escaped_code = html.escape(codeblock.strip()) - if language: - return f'
{escaped_code}
' - else: - return f'
{escaped_code}
' - - # Replace code blocks with
 tags
-        text = re.sub(r'```(\w+)?\n([\s\S]*?)```', replace_codeblock, text)
-
-        # Now handle Markdown links and convert them to HTML
-        def replace_markdown_link(match):
-            link_text = match.group(1)  # The text to display
-            url = match.group(2)  # The URL
-            return f'{html.escape(link_text)}'
-
-        # Replace Markdown links [text](url) with HTML  tags
-        text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', replace_markdown_link, text)
-
-        # Handle inline code and other markdown elements
-        text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
-        text = re.sub(r'\*(.*?)\*', r'\1', text)
-        text = re.sub(r'_(.*?)_', r'\1', text)
-        text = re.sub(r'`([^`]*)`', r'\1', text)
-        text = re.sub(r'######\s*(.*)', r'➤ \1', text)
-        text = re.sub(r'#####\s*(.*)', r'➤ \1', text)
-        text = re.sub(r'####\s*(.*)', r'➤ \1', text)
-        text = re.sub(r'###\s*(.*)', r'➤ \1', text)
-        text = re.sub(r'##\s*(.*)', r'➤ \1', text)
-        text = re.sub(r'#\s*(.*)', r'➤ \1', text)
-
-        return text
+    """
+    Convert a simple subset of Markdown to HTML,
+    ensuring that code blocks are extracted first so they
+    don't get accidentally transformed by heading/bold/italic rules.
+    """
+    # 1) Extract code blocks into placeholders
+    code_blocks = []
+
+    def extract_codeblock(match):
+        language = match.group(1) or ""   # i.e. "python"
+        code_body = match.group(2)       # the code text
+        code_blocks.append((language, code_body))
+        placeholder_index = len(code_blocks) - 1
+        # Return a placeholder token like [CODEBLOCK_0]
+        return f"[CODEBLOCK_{placeholder_index}]"
+
+    # Regex: triple backticks with optional language
+    # [\s\S] matches any character, newlines included (no re.DOTALL flag needed)
+    text = re.sub(
+        r'```(\w+)?\n([\s\S]*?)```',
+        extract_codeblock,
+        text
+    )
+
+    # 2) Now do the normal Markdown parsing on whatever’s left (outside code blocks)
+
+    # Headings: match only at the start of a line (^ with re.MULTILINE)
+    text = re.sub(r'^(######)\s+(.*)', r'➤ \2', text, flags=re.MULTILINE)
+    text = re.sub(r'^(#####)\s+(.*)', r'➤ \2', text, flags=re.MULTILINE)
+    text = re.sub(r'^(####)\s+(.*)', r'➤ \2', text, flags=re.MULTILINE)
+    text = re.sub(r'^(###)\s+(.*)', r'➤ \2', text, flags=re.MULTILINE)
+    text = re.sub(r'^(##)\s+(.*)',  r'➤ \2', text, flags=re.MULTILINE)
+    text = re.sub(r'^#\s+(.*)',     r'➤ \1', text, flags=re.MULTILINE)
+
+    # Links of the form [text](url)
+    def replace_markdown_link(m):
+        link_text = m.group(1)
+        url = m.group(2)
+        # Escape any HTML entities in the URL or text
+        return f'{html.escape(link_text)}'
+    text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', replace_markdown_link, text)
+
+    # Bold
+    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
+
+    # Italics: handle both `*text*` and `_text_`
+    text = re.sub(r'\*(.*?)\*', r'\1', text)
+    text = re.sub(r'_(.*?)_',  r'\1', text)
+
+    # Inline code with single backticks
+    text = re.sub(r'`([^`]*)`', r'\1', text)
+
+    # 3) Re‐insert the code blocks
+    for i, (language, code_body) in enumerate(code_blocks):
+        escaped_code = html.escape(code_body.strip())
+        if language:
+            block_html = f'
{escaped_code}
' + else: + block_html = f'
{escaped_code}
' + # Replace [CODEBLOCK_i] with the final
 block
+        text = text.replace(f"[CODEBLOCK_{i}]", block_html, 1)
 
-    except Exception as e:
-        return str(e)
+    return text
+
+# # markdown to html parsing (v0.737.2)
+# def markdown_to_html(text):
+#     try:
+#         # Handle the code blocks with optional language specification first
+#         def replace_codeblock(match):
+#             codeblock = match.group(2)  # Get the actual code inside the block
+#             language = match.group(1)  # Get the language identifier
+#             escaped_code = html.escape(codeblock.strip())
+#             if language:
+#                 return f'
{escaped_code}
' +# else: +# return f'
{escaped_code}
' + +# # Replace code blocks with
 tags
+#         text = re.sub(r'```(\w+)?\n([\s\S]*?)```', replace_codeblock, text)
+
+#         # Now handle Markdown links and convert them to HTML
+#         def replace_markdown_link(match):
+#             link_text = match.group(1)  # The text to display
+#             url = match.group(2)  # The URL
+#             return f'{html.escape(link_text)}'
+
+#         # Replace Markdown links [text](url) with HTML  tags
+#         text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', replace_markdown_link, text)
+
+#         # Handle inline code and other markdown elements
+#         text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
+#         text = re.sub(r'\*(.*?)\*', r'\1', text)
+#         text = re.sub(r'_(.*?)_', r'\1', text)
+#         text = re.sub(r'`([^`]*)`', r'\1', text)
+#         text = re.sub(r'######\s*(.*)', r'➤ \1', text)
+#         text = re.sub(r'#####\s*(.*)', r'➤ \1', text)
+#         text = re.sub(r'####\s*(.*)', r'➤ \1', text)
+#         text = re.sub(r'###\s*(.*)', r'➤ \1', text)
+#         text = re.sub(r'##\s*(.*)', r'➤ \1', text)
+#         text = re.sub(r'#\s*(.*)', r'➤ \1', text)
+
+#         return text
+
+#     except Exception as e:
+#         return str(e)
 
 # Check and update the global rate limit.
 def check_global_rate_limit(max_requests_per_minute, global_request_count, rate_limit_reset_time):