Commit a6a91a8

feat: Enhance document conversion routes with page number support and improved error handling
- Add `include_page_numbers` parameter to all document conversion and chunking endpoints
- Implement comprehensive error handling in route methods
- Update route methods to pass `include_page_numbers` to service methods
- Improve logging and error reporting for conversion and chunking processes
- Extend chunking methods to extract and include page number metadata
- Refactor route methods to use consistent error handling and logging patterns
1 parent 54ad284 commit a6a91a8

4 files changed (+413 −168 lines)

document_converter/route.py (143 additions, 64 deletions)
```diff
@@ -47,20 +47,32 @@ async def convert_single_document(
         le=4,
         description="Scale factor for image resolution (1-4)"
     ),
+    include_page_numbers: bool = Query(
+        False,
+        description="Whether to include page numbers in the markdown"
+    ),
 ):
-    file_bytes = await document.read()
-    if not is_file_format_supported(file_bytes, document.filename):
+    try:
+        # Read the file content
+        file_content = await document.read()
+
+        # Convert the document
+        result = document_converter_service.convert_document(
+            document=(document.filename, BytesIO(file_content)),
+            extract_tables=extract_tables_as_images,
+            image_resolution_scale=image_resolution_scale,
+            include_page_numbers=include_page_numbers,
+        )
+
+        # Return the result
+        return result
+    except Exception as e:
+        logging.error(f"Error in convert_single_document: {str(e)}")
         raise HTTPException(
-            status_code=status.HTTP_400_BAD_REQUEST,
-            detail=f"Unsupported file format: {document.filename}"
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error converting document: {str(e)}"
         )
 
-    return document_converter_service.convert_document(
-        (document.filename, BytesIO(file_bytes)),
-        extract_tables=extract_tables_as_images,
-        image_resolution_scale=image_resolution_scale,
-    )
-
 
 @router.post(
     '/documents/batch-convert',
```
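
For context, a client call against the updated synchronous endpoint might look like the sketch below. The `/documents/convert` path, host, and port are assumptions (the route decorator sits above this hunk and is not shown in the diff); the query parameter names match the signature above.

```python
import requests

# Hypothetical request to the updated endpoint; path, host, and port are assumed.
with open("report.pdf", "rb") as f:
    response = requests.post(
        "http://localhost:8000/documents/convert",
        params={
            "extract_tables_as_images": False,
            "image_resolution_scale": 2,
            "include_page_numbers": True,  # new in this commit; defaults to False here
        },
        files={"document": ("report.pdf", f, "application/pdf")},
    )
response.raise_for_status()
print(response.json())
```

Note that this endpoint defaults `include_page_numbers` to `False`, while the batch and job endpoints below default it to `True`.
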
```diff
@@ -86,22 +98,34 @@ async def convert_multiple_documents(
         le=4,
         description="Scale factor for image resolution (1-4)"
     ),
+    include_page_numbers: bool = Query(
+        True,
+        description="Whether to include page numbers in the markdown"
+    ),
 ):
-    doc_streams = []
-    for document in documents:
-        file_bytes = await document.read()
-        if not is_file_format_supported(file_bytes, document.filename):
-            raise HTTPException(
-                status_code=status.HTTP_400_BAD_REQUEST,
-                detail=f"Unsupported file format: {document.filename}"
-            )
-        doc_streams.append((document.filename, BytesIO(file_bytes)))
-
-    return document_converter_service.convert_documents(
-        doc_streams,
-        extract_tables=extract_tables_as_images,
-        image_resolution_scale=image_resolution_scale,
-    )
+    try:
+        # Read all files and prepare for batch conversion
+        document_data = []
+        for document in documents:
+            file_content = await document.read()
+            document_data.append((document.filename, BytesIO(file_content)))
+
+        # Convert all documents
+        results = document_converter_service.convert_documents(
+            documents=document_data,
+            extract_tables=extract_tables_as_images,
+            image_resolution_scale=image_resolution_scale,
+            include_page_numbers=include_page_numbers,
+        )
+
+        # Return the results
+        return results
+    except Exception as e:
+        logging.error(f"Error in convert_multiple_documents: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error converting documents: {str(e)}"
+        )
 
 
 # Asynchronous conversion jobs endpoints
```
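
The batch endpoint accepts multiple uploads under the same `documents` field. A sketch of a client call (the `/documents/batch-convert` path comes from the decorator shown above; host and port are assumed):

```python
import requests

# Hypothetical batch request; only the route path is confirmed by the diff.
files = [
    ("documents", ("a.pdf", open("a.pdf", "rb"), "application/pdf")),
    ("documents", ("b.pdf", open("b.pdf", "rb"), "application/pdf")),
]
response = requests.post(
    "http://localhost:8000/documents/batch-convert",
    params={"include_page_numbers": True},  # defaults to True on this endpoint
    files=files,
)
print(response.json())
```
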
```diff
@@ -128,24 +152,36 @@ async def create_single_document_conversion_job(
         le=4,
         description="Scale factor for image resolution (1-4)"
     ),
+    include_page_numbers: bool = Query(
+        True,
+        description="Whether to include page numbers in the markdown"
+    ),
 ):
-    file_bytes = await document.read()
-    if not is_file_format_supported(file_bytes, document.filename):
-        raise HTTPException(
-            status_code=status.HTTP_400_BAD_REQUEST,
-            detail=f"Unsupported file format: {document.filename}"
+    try:
+        # Read the file content
+        file_content = await document.read()
+
+        # Import the task function
+        from worker.tasks import convert_document_task
+
+        # Queue the conversion task
+        task = convert_document_task.delay(
+            document=(document.filename, file_content),
+            extract_tables=extract_tables_as_images,
+            image_resolution_scale=image_resolution_scale,
+            include_page_numbers=include_page_numbers,
         )
 
-    task = convert_document_task.delay(
-        (document.filename, file_bytes),
-        extract_tables=extract_tables_as_images,
-        image_resolution_scale=image_resolution_scale,
-    )
-
-    return ConversionJobResult(
-        job_id=task.id,
-        status="IN_PROGRESS"
-    )
+        return ConversionJobResult(
+            job_id=task.id,
+            status="IN_PROGRESS"
+        )
+    except Exception as e:
+        logging.error(f"Error in create_single_document_conversion_job: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error creating conversion job: {str(e)}"
+        )
 
 
 @router.get(
```
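
The route now calls `convert_document_task.delay(...)` with keyword arguments, so the Celery task in `worker.tasks` must accept the new `include_page_numbers` keyword. A minimal sketch of a compatible task signature (the body and the service import path are illustrative assumptions, not the repository's actual code):

```python
from io import BytesIO

from celery import shared_task

from document_converter.service import document_converter_service  # assumed import path


@shared_task
def convert_document_task(document, extract_tables=False,
                          image_resolution_scale=4, include_page_numbers=True):
    # The route passes document=(filename, raw_bytes), so unpack and rewrap
    # the bytes in a stream before handing off to the service layer.
    filename, file_content = document
    return document_converter_service.convert_document(
        document=(filename, BytesIO(file_content)),
        extract_tables=extract_tables,
        image_resolution_scale=image_resolution_scale,
        include_page_numbers=include_page_numbers,
    )
```

Raw bytes rather than a `BytesIO` stream are sent through `.delay()` because Celery task arguments must be serializable.
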
```diff
@@ -159,9 +195,19 @@ async def create_single_document_conversion_job(
     },
     description="Get the status and result of a single document conversion job",
 )
-async def get_conversion_job_status(job_id: str):
+async def get_conversion_job_status(
+    job_id: str,
+    include_page_numbers: bool = Query(
+        True,
+        description="Whether to include page numbers in the markdown"
+    ),
+):
     try:
-        result = document_converter_service.get_single_document_task_result(job_id)
+        # Attempt to get the job status and result
+        result = document_converter_service.get_single_document_task_result(
+            job_id=job_id,
+            include_page_numbers=include_page_numbers,
+        )
 
         # Return 202 Accepted if job is still in progress
         if result.status in ["IN_PROGRESS"]:
```
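
Because the status route returns 202 Accepted while the job is `IN_PROGRESS`, a client would typically poll until a terminal status arrives. A sketch (the `/conversion-jobs/{job_id}` path, host, and port are assumptions; only the 202 behavior is shown in the diff):

```python
import time

import requests

job_id = "..."  # as returned by the job-creation endpoint

# Hypothetical polling loop; the status path, host, and port are assumed.
while True:
    response = requests.get(
        f"http://localhost:8000/conversion-jobs/{job_id}",
        params={"include_page_numbers": True},
    )
    if response.status_code != 202:  # 202 means the job is still IN_PROGRESS
        break
    time.sleep(2)

print(response.status_code, response.json())
```
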
```diff
@@ -212,27 +258,39 @@ async def create_batch_conversion_job(
         le=4,
         description="Scale factor for image resolution (1-4)"
     ),
+    include_page_numbers: bool = Query(
+        True,
+        description="Whether to include page numbers in the markdown"
+    ),
 ):
-    doc_data = []
-    for document in documents:
-        file_bytes = await document.read()
-        if not is_file_format_supported(file_bytes, document.filename):
-            raise HTTPException(
-                status_code=status.HTTP_400_BAD_REQUEST,
-                detail=f"Unsupported file format: {document.filename}"
-            )
-        doc_data.append((document.filename, file_bytes))
-
-    task = convert_documents_task.delay(
-        doc_data,
-        extract_tables=extract_tables_as_images,
-        image_resolution_scale=image_resolution_scale,
-    )
+    try:
+        # Read all files and prepare for batch conversion
+        document_data = []
+        for document in documents:
+            file_content = await document.read()
+            document_data.append((document.filename, file_content))
+
+        # Import the task function
+        from worker.tasks import convert_documents_task
+
+        # Queue the batch conversion task
+        task = convert_documents_task.delay(
+            documents=document_data,
+            extract_tables=extract_tables_as_images,
+            image_resolution_scale=image_resolution_scale,
+            include_page_numbers=include_page_numbers,
+        )
 
-    return BatchConversionJobResult(
-        job_id=task.id,
-        status="IN_PROGRESS"
-    )
+        return BatchConversionJobResult(
+            job_id=task.id,
+            status="IN_PROGRESS"
+        )
+    except Exception as e:
+        logging.error(f"Error in create_batch_conversion_job: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error creating batch conversion job: {str(e)}"
+        )
 
 
 @router.get(
```
```diff
@@ -246,9 +304,19 @@ async def create_batch_conversion_job(
     },
     description="Get the status and results of a batch conversion job",
 )
-async def get_batch_conversion_job_status(job_id: str):
+async def get_batch_conversion_job_status(
+    job_id: str,
+    include_page_numbers: bool = Query(
+        True,
+        description="Whether to include page numbers in the markdown"
+    ),
+):
     try:
-        result = document_converter_service.get_batch_conversion_task_result(job_id)
+        # Attempt to get the batch job status and results
+        result = document_converter_service.get_batch_conversion_task_result(
+            job_id=job_id,
+            include_page_numbers=include_page_numbers,
+        )
 
         # Return 202 Accepted if the batch job or any sub-job is still in progress
         if result.status in ["IN_PROGRESS"] or any(
```
```diff
@@ -341,13 +409,18 @@ async def chunk_document_from_job(
         True,
         description="Whether to merge undersized peer chunks (used for internal configuration)"
     ),
+    include_page_numbers: bool = Query(
+        True,
+        description="Whether to include page number references in chunk metadata"
+    ),
 ):
     try:
         # Attempt to get the chunking result
         result = document_converter_service.chunk_document_from_job(
             job_id=job_id,
             max_tokens=max_tokens,
             merge_peers=merge_peers,
+            include_page_numbers=include_page_numbers,
        )
 
         # Return error response if there's an error
```
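
Per the commit message, the chunking methods were extended "to extract and include page number metadata". One plausible way the service could derive the new fields from a chunk's provenance, sketched under the assumption of docling-style items whose `prov` entries carry a `page_no` (an illustration, not the repository's implementation):

```python
def page_span(doc_items):
    """Return (page_numbers, start_page, end_page) for a chunk's items.

    Assumes each item exposes docling-style provenance entries with a
    page_no attribute; returns (None, None, None) when nothing is known.
    """
    pages = sorted({prov.page_no for item in doc_items for prov in item.prov})
    if not pages:
        return None, None, None
    return pages, pages[0], pages[-1]
```
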
```diff
@@ -398,13 +471,18 @@ async def chunk_batch_documents_from_job(
         True,
         description="Whether to merge undersized peer chunks (used for internal configuration)"
     ),
+    include_page_numbers: bool = Query(
+        True,
+        description="Whether to include page number references in chunk metadata"
+    ),
 ):
     try:
-        # Attempt to get the chunking results
+        # Attempt to chunk all documents from the batch job
         results = document_converter_service.chunk_batch_documents_from_job(
             job_id=job_id,
             max_tokens=max_tokens,
             merge_peers=merge_peers,
+            include_page_numbers=include_page_numbers,
         )
 
         # Check if there were errors in the batch
```
```diff
@@ -455,6 +533,7 @@ async def chunk_text_directly(
             filename=request.filename,
             max_tokens=request.max_tokens,
             merge_peers=request.merge_peers,
+            include_page_numbers=request.include_page_numbers,
         )
 
         # Return error response if there's an error
```

document_converter/schema.py (6 additions, 1 deletion)
```diff
@@ -40,6 +40,9 @@ class BatchConversionJobResult(BaseModel):
 class Chunk(BaseModel):
     text: str = Field(..., description="The plain text content of the chunk")
     metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata associated with the chunk")
+    page_numbers: Optional[List[int]] = Field(None, description="List of page numbers this chunk spans across")
+    start_page: Optional[int] = Field(None, description="The page number where this chunk starts")
+    end_page: Optional[int] = Field(None, description="The page number where this chunk ends")
 
 
 class ChunkingResult(BaseModel):
```
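
A minimal sketch of the extended model in use, with illustrative values:

```python
chunk = Chunk(
    text="Revenue grew 12% year over year.",
    metadata={"headings": ["Financials"]},
    page_numbers=[4, 5],  # the chunk crosses a page break
    start_page=4,
    end_page=5,
)
print(chunk.model_dump())
```

All three new fields are `Optional` and default to `None`, so existing consumers that ignore page metadata keep working unchanged.
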
```diff
@@ -54,13 +57,15 @@ class TextChunkingRequest(BaseModel):
     filename: Optional[str] = Field("input.txt", description="A name to identify the source (for reporting purposes)")
     max_tokens: int = Field(512, ge=64, le=2048, description="Maximum number of tokens per chunk")
     merge_peers: bool = Field(True, description="Whether to merge undersized peer chunks")
+    include_page_numbers: bool = Field(True, description="Whether to include page number references in chunk metadata")
 
     class Config:
         json_schema_extra = {
             "example": {
                 "text": "This is the text content that needs to be chunked. It can be as long as needed.",
                 "filename": "example.txt",
                 "max_tokens": 512,
-                "merge_peers": True
+                "merge_peers": True,
+                "include_page_numbers": True
             }
         }
```
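
A request body for the direct text-chunking endpoint can now carry the new flag. For example (the endpoint path is an assumption; the field names come from the model above):

```python
import requests

payload = {
    "text": "This is the text content that needs to be chunked. It can be as long as needed.",
    "filename": "example.txt",
    "max_tokens": 512,
    "merge_peers": True,
    "include_page_numbers": True,  # new field from this commit
}

# Hypothetical endpoint path; only the request schema is shown in the diff.
response = requests.post("http://localhost:8000/documents/chunk-text", json=payload)
print(response.json())
```
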
