Azure-Samples · Copilot · Jul 28, 2025 · Jul 28, 2025 · Jul 28, 2025 · Jul 28, 2025
diff --git a/.gitignore b/.gitignore
@@ -111,6 +111,7 @@ celerybeat.pid
 # Environments
 .env
 .venv
+.venv_*
 .evalenv
 env/
 venv/

diff --git a/app/backend/prepdocslib/textsplitter.py b/app/backend/prepdocslib/textsplitter.py
@@ -76,7 +76,15 @@ def split_pages(self, pages: list[Page]) -> Generator[SplitPage, None, None]:
 CJK_SENTENCE_ENDINGS = ["。", "！", "？", "‼", "⁇", "⁈", "⁉"]
 
 # NB: text-embedding-3-XX is the same BPE as text-embedding-ada-002
-bpe = tiktoken.encoding_for_model(ENCODING_MODEL)
+_bpe = None
+
+
+def get_encoding():
+    """Return the tiktoken encoding for ENCODING_MODEL, loading it on the first call and caching it in _bpe."""
+    global _bpe
+    if _bpe is None:
+        _bpe = tiktoken.encoding_for_model(ENCODING_MODEL)
+    return _bpe
 
 DEFAULT_OVERLAP_PERCENT = 10  # See semantic search article for 10% overlap performance
 DEFAULT_SECTION_LENGTH = 1000  # Roughly 400-500 tokens for English
@@ -99,7 +107,7 @@ def split_page_by_max_tokens(self, page_num: int, text: str) -> Generator[SplitP
         """
         Recursively splits page by maximum number of tokens to better handle languages with higher token/word ratios.
         """
-        tokens = bpe.encode(text)
+        tokens = get_encoding().encode(text)
         if len(tokens) <= self.max_tokens_per_section:
             # Section is already within max tokens, return
             yield SplitPage(page_num=page_num, text=text)
-Original file line number
+Diff line change
@@ Expand Up / @@ -111,6 +111,7 @@ celerybeat.pid @@
     # Environments
     .env
     .venv
+    .venv_*
     .evalenv
     env/
     venv/
@@ Expand Down @@