feat: Add health check response schema and update conversion result schemas

spa5k · spa5k · commit 4fc4c56cb7a6 · 2025-03-09T19:12:13.000+05:30
- Introduce HealthCheckResponse schema with status and services fields
- Modify ConversionResult schema: make images optional (default None) and key page_content by string page numbers with optional values
- Update BatchConversionResult, BatchConversionJobResult, and ChunkingResult so their result lists are optional (default None instead of an empty list)
- Make Chunk metadata optional (default None) and narrow its values from Any to optional strings
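- Add response_model to health check route in route.py
- Simplify TextChunkingRequest: make filename a plain str, drop the max_tokens bounds (ge=64, le=2048), and remove the example Config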
diff --git a/document_converter/route.py b/document_converter/route.py
@@ -9,7 +9,8 @@
     ConversionJobResult,
     ConversionResult,
     ChunkingResult,
-    TextChunkingRequest
+    TextChunkingRequest,
+    HealthCheckResponse
 )
 from document_converter.service import DocumentConverterService, DoclingDocumentConversion
 from document_converter.utils import is_file_format_supported
@@ -352,6 +353,7 @@ async def get_batch_conversion_job_status(
 
 @router.get(
     "/health",
+    response_model=HealthCheckResponse,
     responses={
         200: {"description": "All services are healthy"},
         500: {"description": "One or more services are unhealthy"}
diff --git a/document_converter/schema.py b/document_converter/schema.py
@@ -12,14 +12,14 @@ class ImageData(BaseModel):
 class ConversionResult(BaseModel):
     filename: str = Field(..., description="Original filename of the document")
     markdown: Optional[str] = Field(None, description="Converted markdown content")
-    images: List[ImageData] = Field(default_factory=list, description="Images extracted from the document")
+    images: Optional[List[ImageData]] = Field(None, description="Images extracted from the document")
     error: Optional[str] = Field(None, description="Error message if conversion failed")
-    page_content: Optional[Dict[int, str]] = Field(None, description="Markdown content organized by page number")
+    page_content: Optional[Dict[str, Optional[str]]] = Field(None, description="Markdown content organized by page number")
 
 
 class BatchConversionResult(BaseModel):
-    conversion_results: List[ConversionResult] = Field(
-        default_factory=list, description="The results of the conversions"
+    conversion_results: Optional[List[ConversionResult]] = Field(
+        None, description="The results of the conversions"
     )
 
 
@@ -33,13 +33,13 @@ class ConversionJobResult(BaseModel):
 class BatchConversionJobResult(BaseModel):
     job_id: str = Field(..., description="The id of the batch conversion job")
     status: str = Field(..., description="Current status of the batch job")
-    conversion_results: List[ConversionJobResult] = Field(default_factory=list, description="Individual conversion job results")
+    conversion_results: Optional[List[ConversionJobResult]] = Field(None, description="Individual conversion job results")
     error: Optional[str] = Field(None, description="Error message if batch job failed")
 
 
 class Chunk(BaseModel):
     text: str = Field(..., description="The plain text content of the chunk")
-    metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata associated with the chunk")
+    metadata: Optional[Dict[str, Optional[str]]] = Field(None, description="Additional metadata associated with the chunk")
     page_numbers: Optional[List[int]] = Field(None, description="List of page numbers this chunk spans across")
     start_page: Optional[int] = Field(None, description="The page number where this chunk starts")
     end_page: Optional[int] = Field(None, description="The page number where this chunk ends")
@@ -48,24 +48,18 @@ class Chunk(BaseModel):
 class ChunkingResult(BaseModel):
     job_id: str = Field(..., description="The id of the original conversion job")
     filename: str = Field(..., description="The filename of the document")
-    chunks: List[Chunk] = Field(default_factory=list, description="The chunks extracted from the document")
+    chunks: Optional[List[Chunk]] = Field(None, description="The chunks extracted from the document")
     error: Optional[str] = Field(None, description="The error that occurred during chunking")
 
 
 class TextChunkingRequest(BaseModel):
     text: str = Field(..., description="The text content to chunk")
-    filename: Optional[str] = Field("input.txt", description="A name to identify the source (for reporting purposes)")
-    max_tokens: int = Field(512, ge=64, le=2048, description="Maximum number of tokens per chunk")
-    merge_peers: bool = Field(True, description="Whether to merge undersized peer chunks")
-    include_page_numbers: bool = Field(True, description="Whether to include page number references in chunk metadata")
-    
-    class Config:
-        json_schema_extra = {
-            "example": {
-                "text": "This is the text content that needs to be chunked. It can be as long as needed.",
-                "filename": "example.txt",
-                "max_tokens": 512,
-                "merge_peers": True,
-                "include_page_numbers": True
-            }
-        }
+    filename: str = Field(default="input.txt", description="A name to identify the source (for reporting purposes)")
+    max_tokens: int = Field(default=512, description="Maximum number of tokens per chunk")
+    merge_peers: bool = Field(default=True, description="Whether to merge undersized peer chunks")
+    include_page_numbers: bool = Field(default=True, description="Whether to include page number references in chunk metadata")
+
+
+class HealthCheckResponse(BaseModel):
+    status: str = Field(..., description="Overall health status")
+    services: Optional[Dict[str, str]] = Field(None, description="Status of individual services")