8 changes: 4 additions & 4 deletions backend/app.py
@@ -61,10 +61,10 @@ async def query_documents(request: QueryRequest):
session_id = request.session_id
if not session_id:
session_id = rag_system.session_manager.create_session()

# Process query using RAG system
answer, sources = rag_system.query(request.query, session_id)

return QueryResponse(
answer=answer,
sources=sources,
@@ -113,7 +113,7 @@ async def get_response(self, path: str, scope):
response.headers["Pragma"] = "no-cache"
response.headers["Expires"] = "0"
return response


# Serve static files for the frontend
app.mount("/", StaticFiles(directory="../frontend", html=True), name="static")
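A minimal sketch of exercising the query handler above from a client; the "/api/query" route and port 8000 are assumptions, since the @app.post decorator falls outside this hunk:

import requests

resp = requests.post(
    "http://localhost:8000/api/query",  # assumed route; decorator not shown in the diff
    json={"query": "What does Lesson 0 cover?"},
)
print(resp.json()["answer"])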
6 changes: 3 additions & 3 deletions backend/config.py
@@ -11,16 +11,16 @@ class Config:
# Anthropic API settings
ANTHROPIC_API_KEY: str = os.getenv("ANTHROPIC_API_KEY", "")
ANTHROPIC_MODEL: str = "claude-sonnet-4-20250514"

# Embedding model settings
EMBEDDING_MODEL: str = "all-MiniLM-L6-v2"

# Document processing settings
CHUNK_SIZE: int = 800 # Size of text chunks for vector storage
CHUNK_OVERLAP: int = 100 # Characters to overlap between chunks
MAX_RESULTS: int = 5 # Maximum search results to return
MAX_HISTORY: int = 2 # Number of conversation messages to remember

# Database paths
CHROMA_PATH: str = "./chroma_db" # ChromaDB storage location

84 changes: 42 additions & 42 deletions backend/document_processor.py
@@ -5,11 +5,11 @@

class DocumentProcessor:
"""Processes course documents and extracts structured information"""

def __init__(self, chunk_size: int, chunk_overlap: int):
self.chunk_size = chunk_size
self.chunk_overlap = chunk_overlap

def read_file(self, file_path: str) -> str:
"""Read content from file with UTF-8 encoding"""
try:
@@ -19,56 +19,56 @@ def read_file(self, file_path: str) -> str:
# If strict UTF-8 decoding fails, retry while ignoring undecodable bytes
with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
return file.read()



def chunk_text(self, text: str) -> List[str]:
"""Split text into sentence-based chunks with overlap using config settings"""

# Clean up the text
text = re.sub(r'\s+', ' ', text.strip()) # Normalize whitespace

# Better sentence splitting that handles abbreviations
# This regex looks for periods followed by whitespace and capital letters
# but ignores common abbreviations
sentence_endings = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\!|\?)\s+(?=[A-Z])')
sentences = sentence_endings.split(text)
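# Hypothetical example: "Dr. Smith teaches NLP. We begin." splits into
# ["Dr. Smith teaches NLP.", "We begin."]; the "Dr." abbreviation is
# protected by the (?<![A-Z][a-z]\.) lookbehind, so no split occurs there.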

# Clean sentences
sentences = [s.strip() for s in sentences if s.strip()]

chunks = []
i = 0

while i < len(sentences):
current_chunk = []
current_size = 0

# Build chunk starting from sentence i
for j in range(i, len(sentences)):
sentence = sentences[j]

# Account for the joining space between sentences
space_size = 1 if current_chunk else 0
total_addition = len(sentence) + space_size

# Check if adding this sentence would exceed chunk size
if current_size + total_addition > self.chunk_size and current_chunk:
break

current_chunk.append(sentence)
current_size += total_addition

# Add chunk if we have content
if current_chunk:
chunks.append(' '.join(current_chunk))

# Calculate overlap for next chunk
if hasattr(self, 'chunk_overlap') and self.chunk_overlap > 0:
# Find how many sentences to overlap
overlap_size = 0
overlap_sentences = 0

# Count backwards from end of current chunk
for k in range(len(current_chunk) - 1, -1, -1):
sentence_len = len(current_chunk[k]) + (1 if k < len(current_chunk) - 1 else 0)
@@ -77,7 +77,7 @@ def chunk_text(self, text: str) -> List[str]:
overlap_sentences += 1
else:
break

# Move start position considering overlap
next_start = i + len(current_chunk) - overlap_sentences
i = max(next_start, i + 1) # Ensure we make progress
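# Hypothetical walk-through: with chunk_overlap=100, if the chunk's last
# two sentences total at most 100 characters (joining spaces included) and
# a third would not fit, overlap_sentences ends up as 2 and the next chunk
# restarts two sentences back; max(..., i + 1) still guarantees progress.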
@@ -87,13 +87,13 @@ def chunk_text(self, text: str) -> List[str]:
else:
# No sentences fit; move to the next one
i += 1

return chunks





def process_course_document(self, file_path: str) -> Tuple[Course, List[CourseChunk]]:
"""
Process a course document with expected format:
@@ -104,67 +104,67 @@ def process_course_document(self, file_path: str) -> Tuple[Course, List[CourseChunk]]:
"""
content = self.read_file(file_path)
filename = os.path.basename(file_path)

lines = content.strip().split('\n')

# Extract course metadata from first three lines
course_title = filename # Default fallback
course_link = None
instructor_name = "Unknown"

# Parse course title from first line
if len(lines) >= 1 and lines[0].strip():
title_match = re.match(r'^Course Title:\s*(.+)$', lines[0].strip(), re.IGNORECASE)
if title_match:
course_title = title_match.group(1).strip()
else:
course_title = lines[0].strip()

# Parse remaining lines for course metadata
for i in range(1, min(len(lines), 4)): # Scan lines 2-4 for link/instructor metadata
line = lines[i].strip()
if not line:
continue

# Try to match course link
link_match = re.match(r'^Course Link:\s*(.+)$', line, re.IGNORECASE)
if link_match:
course_link = link_match.group(1).strip()
continue

# Try to match instructor
instructor_match = re.match(r'^Course Instructor:\s*(.+)$', line, re.IGNORECASE)
if instructor_match:
instructor_name = instructor_match.group(1).strip()
continue
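# Hypothetical document head in the expected format (all values invented):
#
#   Course Title: Introduction to RAG
#   Course Link: https://example.com/courses/rag
#   Course Instructor: Jane Doe
#
#   Lesson 0: Welcome
#   Lesson Link: https://example.com/courses/rag/lesson-0
#   ...lesson text...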

# Create course object with title as ID
course = Course(
title=course_title,
course_link=course_link,
instructor=instructor_name if instructor_name != "Unknown" else None
)

# Process lessons and create chunks
course_chunks = []
current_lesson = None
lesson_title = None
lesson_link = None
lesson_content = []
chunk_counter = 0

# Start processing from line 4 (after metadata)
start_index = 3
if len(lines) > 3 and not lines[3].strip():
start_index = 4 # Skip empty line after instructor

i = start_index
while i < len(lines):
line = lines[i]

# Check for lesson markers (e.g., "Lesson 0: Introduction")
lesson_match = re.match(r'^Lesson\s+(\d+):\s*(.+)$', line.strip(), re.IGNORECASE)

if lesson_match:
# Process previous lesson if it exists
if current_lesson is not None and lesson_content:
@@ -177,7 +177,7 @@ def process_course_document(self, file_path: str) -> Tuple[Course, List[CourseChunk]]:
lesson_link=lesson_link
)
course.lessons.append(lesson)

# Create chunks for this lesson
chunks = self.chunk_text(lesson_text)
for idx, chunk in enumerate(chunks):
@@ -186,7 +186,7 @@ def process_course_document(self, file_path: str) -> Tuple[Course, List[CourseChunk]]:
chunk_with_context = f"Lesson {current_lesson} content: {chunk}"
else:
chunk_with_context = chunk

course_chunk = CourseChunk(
content=chunk_with_context,
course_title=course.title,
@@ -195,27 +195,27 @@ def process_course_document(self, file_path: str) -> Tuple[Course, List[CourseChunk]]:
)
course_chunks.append(course_chunk)
chunk_counter += 1

# Start new lesson
current_lesson = int(lesson_match.group(1))
lesson_title = lesson_match.group(2).strip()
lesson_link = None

# Check if next line is a lesson link
if i + 1 < len(lines):
next_line = lines[i + 1].strip()
link_match = re.match(r'^Lesson Link:\s*(.+)$', next_line, re.IGNORECASE)
if link_match:
lesson_link = link_match.group(1).strip()
i += 1 # Skip the link line so it's not added to content

lesson_content = []
else:
# Add line to current lesson content
lesson_content.append(line)

i += 1

# Process the last lesson
if current_lesson is not None and lesson_content:
lesson_text = '\n'.join(lesson_content).strip()
@@ -226,13 +226,13 @@ def process_course_document(self, file_path: str) -> Tuple[Course, List[CourseChunk]]:
lesson_link=lesson_link
)
course.lessons.append(lesson)

chunks = self.chunk_text(lesson_text)
for idx, chunk in enumerate(chunks):
# For every chunk of the last lesson, prefix the course title and lesson context

chunk_with_context = f"Course {course_title} Lesson {current_lesson} content: {chunk}"

course_chunk = CourseChunk(
content=chunk_with_context,
course_title=course.title,
@@ -241,7 +241,7 @@ def process_course_document(self, file_path: str) -> Tuple[Course, List[CourseChunk]]:
)
course_chunks.append(course_chunk)
chunk_counter += 1

# If no lessons found, treat entire content as one document
if not course_chunks and len(lines) > 2:
remaining_content = '\n'.join(lines[start_index:]).strip()
@@ -255,5 +255,5 @@ def process_course_document(self, file_path: str) -> Tuple[Course, List[CourseChunk]]:
)
course_chunks.append(course_chunk)
chunk_counter += 1

return course, course_chunks
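For reference, a minimal end-to-end sketch of this method; the document path is hypothetical:

processor = DocumentProcessor(chunk_size=800, chunk_overlap=100)
course, chunks = processor.process_course_document("docs/intro_to_rag.txt")
print(course.title, len(course.lessons), len(chunks))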