Commit a6a91a8

feat: Enhance document conversion routes with page number support and improved error handling
- Add `include_page_numbers` parameter to all document conversion and chunking endpoints
- Implement comprehensive error handling in route methods
- Update route methods to pass `include_page_numbers` to service methods
- Improve logging and error reporting for conversion and chunking processes
- Extend chunking methods to extract and include page number metadata
- Refactor route methods to use consistent error handling and logging patterns
1 parent 54ad284 commit a6a91a8

4 files changed (+413 −168 lines)

document_converter/route.py (143 additions, 64 deletions)
```diff
@@ -47,20 +47,32 @@ async def convert_single_document(
         le=4,
         description="Scale factor for image resolution (1-4)"
     ),
+    include_page_numbers: bool = Query(
+        False,
+        description="Whether to include page numbers in the markdown"
+    ),
 ):
-    file_bytes = await document.read()
-    if not is_file_format_supported(file_bytes, document.filename):
+    try:
+        # Read the file content
+        file_content = await document.read()
+
+        # Convert the document
+        result = document_converter_service.convert_document(
+            document=(document.filename, BytesIO(file_content)),
+            extract_tables=extract_tables_as_images,
+            image_resolution_scale=image_resolution_scale,
+            include_page_numbers=include_page_numbers,
+        )
+
+        # Return the result
+        return result
+    except Exception as e:
+        logging.error(f"Error in convert_single_document: {str(e)}")
         raise HTTPException(
-            status_code=status.HTTP_400_BAD_REQUEST,
-            detail=f"Unsupported file format: {document.filename}"
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error converting document: {str(e)}"
         )
 
-    return document_converter_service.convert_document(
-        (document.filename, BytesIO(file_bytes)),
-        extract_tables=extract_tables_as_images,
-        image_resolution_scale=image_resolution_scale,
-    )
-
 
 @router.post(
     '/documents/batch-convert',
```
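
For context, a client call against the updated synchronous endpoint might look like the sketch below. The `/documents/convert` path, host, and port are assumptions (the route decorator sits above this hunk and is not shown in the diff); the query parameter names match the signature above.

```python
import requests

# Hypothetical request to the updated endpoint; path, host, and port are assumed.
with open("report.pdf", "rb") as f:
    response = requests.post(
        "http://localhost:8000/documents/convert",
        params={
            "extract_tables_as_images": False,
            "image_resolution_scale": 2,
            "include_page_numbers": True,  # new in this commit; defaults to False here
        },
        files={"document": ("report.pdf", f, "application/pdf")},
    )
response.raise_for_status()
print(response.json())
```

Note that this endpoint defaults `include_page_numbers` to `False`, while the batch and job endpoints below default it to `True`.
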
```diff
@@ -86,22 +98,34 @@ async def convert_multiple_documents(
         le=4,
         description="Scale factor for image resolution (1-4)"
     ),
+    include_page_numbers: bool = Query(
+        True,
+        description="Whether to include page numbers in the markdown"
+    ),
 ):
-    doc_streams = []
-    for document in documents:
-        file_bytes = await document.read()
-        if not is_file_format_supported(file_bytes, document.filename):
-            raise HTTPException(
-                status_code=status.HTTP_400_BAD_REQUEST,
-                detail=f"Unsupported file format: {document.filename}"
-            )
-        doc_streams.append((document.filename, BytesIO(file_bytes)))
-
-    return document_converter_service.convert_documents(
-        doc_streams,
-        extract_tables=extract_tables_as_images,
-        image_resolution_scale=image_resolution_scale,
-    )
+    try:
+        # Read all files and prepare for batch conversion
+        document_data = []
+        for document in documents:
+            file_content = await document.read()
+            document_data.append((document.filename, BytesIO(file_content)))
+
+        # Convert all documents
+        results = document_converter_service.convert_documents(
+            documents=document_data,
+            extract_tables=extract_tables_as_images,
+            image_resolution_scale=image_resolution_scale,
+            include_page_numbers=include_page_numbers,
+        )
+
+        # Return the results
+        return results
+    except Exception as e:
+        logging.error(f"Error in convert_multiple_documents: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error converting documents: {str(e)}"
+        )
 
 
 # Asynchronous conversion jobs endpoints
```
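
The batch endpoint accepts multiple uploads under the same `documents` field. A sketch of a client call (the `/documents/batch-convert` path comes from the decorator shown above; host and port are assumed):

```python
import requests

# Hypothetical batch request; only the route path is confirmed by the diff.
files = [
    ("documents", ("a.pdf", open("a.pdf", "rb"), "application/pdf")),
    ("documents", ("b.pdf", open("b.pdf", "rb"), "application/pdf")),
]
response = requests.post(
    "http://localhost:8000/documents/batch-convert",
    params={"include_page_numbers": True},  # defaults to True on this endpoint
    files=files,
)
print(response.json())
```
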
```diff
@@ -128,24 +152,36 @@ async def create_single_document_conversion_job(
         le=4,
         description="Scale factor for image resolution (1-4)"
     ),
+    include_page_numbers: bool = Query(
+        True,
+        description="Whether to include page numbers in the markdown"
+    ),
 ):
-    file_bytes = await document.read()
-    if not is_file_format_supported(file_bytes, document.filename):
-        raise HTTPException(
-            status_code=status.HTTP_400_BAD_REQUEST,
-            detail=f"Unsupported file format: {document.filename}"
+    try:
+        # Read the file content
+        file_content = await document.read()
+
+        # Import the task function
+        from worker.tasks import convert_document_task
+
+        # Queue the conversion task
+        task = convert_document_task.delay(
+            document=(document.filename, file_content),
+            extract_tables=extract_tables_as_images,
+            image_resolution_scale=image_resolution_scale,
+            include_page_numbers=include_page_numbers,
         )
 
-    task = convert_document_task.delay(
-        (document.filename, file_bytes),
-        extract_tables=extract_tables_as_images,
-        image_resolution_scale=image_resolution_scale,
-    )
-
-    return ConversionJobResult(
-        job_id=task.id,
-        status="IN_PROGRESS"
-    )
+        return ConversionJobResult(
+            job_id=task.id,
+            status="IN_PROGRESS"
+        )
+    except Exception as e:
+        logging.error(f"Error in create_single_document_conversion_job: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error creating conversion job: {str(e)}"
+        )
 
 
 @router.get(
```
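
The route now calls `convert_document_task.delay(...)` with keyword arguments, so the Celery task in `worker.tasks` must accept the new `include_page_numbers` keyword. A minimal sketch of a compatible task signature (the body and the service import path are illustrative assumptions, not the repository's actual code):

```python
from io import BytesIO

from celery import shared_task

from document_converter.service import document_converter_service  # assumed import path


@shared_task
def convert_document_task(document, extract_tables=False,
                          image_resolution_scale=4, include_page_numbers=True):
    # The route passes document=(filename, raw_bytes), so unpack and rewrap
    # the bytes in a stream before handing off to the service layer.
    filename, file_content = document
    return document_converter_service.convert_document(
        document=(filename, BytesIO(file_content)),
        extract_tables=extract_tables,
        image_resolution_scale=image_resolution_scale,
        include_page_numbers=include_page_numbers,
    )
```

Raw bytes rather than a `BytesIO` stream are sent through `.delay()` because Celery task arguments must be serializable.
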
```diff
@@ -159,9 +195,19 @@ async def create_single_document_conversion_job(
     },
     description="Get the status and result of a single document conversion job",
 )
-async def get_conversion_job_status(job_id: str):
+async def get_conversion_job_status(
+    job_id: str,
+    include_page_numbers: bool = Query(
+        True,
+        description="Whether to include page numbers in the markdown"
+    ),
+):
     try:
-        result = document_converter_service.get_single_document_task_result(job_id)
+        # Attempt to get the job status and result
+        result = document_converter_service.get_single_document_task_result(
+            job_id=job_id,
+            include_page_numbers=include_page_numbers,
+        )
 
         # Return 202 Accepted if job is still in progress
         if result.status in ["IN_PROGRESS"]:
```
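
Because the status route returns 202 Accepted while the job is `IN_PROGRESS`, a client would typically poll until a terminal status arrives. A sketch (the `/conversion-jobs/{job_id}` path, host, and port are assumptions; only the 202 behavior is shown in the diff):

```python
import time

import requests

job_id = "..."  # as returned by the job-creation endpoint

# Hypothetical polling loop; the status path, host, and port are assumed.
while True:
    response = requests.get(
        f"http://localhost:8000/conversion-jobs/{job_id}",
        params={"include_page_numbers": True},
    )
    if response.status_code != 202:  # 202 means the job is still IN_PROGRESS
        break
    time.sleep(2)

print(response.status_code, response.json())
```
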
```diff
@@ -212,27 +258,39 @@ async def create_batch_conversion_job(
         le=4,
         description="Scale factor for image resolution (1-4)"
     ),
+    include_page_numbers: bool = Query(
+        True,
+        description="Whether to include page numbers in the markdown"
+    ),
 ):
-    doc_data = []
-    for document in documents:
-        file_bytes = await document.read()
-        if not is_file_format_supported(file_bytes, document.filename):
-            raise HTTPException(
-                status_code=status.HTTP_400_BAD_REQUEST,
-                detail=f"Unsupported file format: {document.filename}"
-            )
-        doc_data.append((document.filename, file_bytes))
-
-    task = convert_documents_task.delay(
-        doc_data,
-        extract_tables=extract_tables_as_images,
-        image_resolution_scale=image_resolution_scale,
-    )
+    try:
+        # Read all files and prepare for batch conversion
+        document_data = []
+        for document in documents:
+            file_content = await document.read()
+            document_data.append((document.filename, file_content))
+
+        # Import the task function
+        from worker.tasks import convert_documents_task
+
+        # Queue the batch conversion task
+        task = convert_documents_task.delay(
+            documents=document_data,
+            extract_tables=extract_tables_as_images,
+            image_resolution_scale=image_resolution_scale,
+            include_page_numbers=include_page_numbers,
+        )
 
-    return BatchConversionJobResult(
-        job_id=task.id,
-        status="IN_PROGRESS"
-    )
+        return BatchConversionJobResult(
+            job_id=task.id,
+            status="IN_PROGRESS"
+        )
+    except Exception as e:
+        logging.error(f"Error in create_batch_conversion_job: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error creating batch conversion job: {str(e)}"
+        )
 
 
 @router.get(
```
```diff
@@ -246,9 +304,19 @@ async def create_batch_conversion_job(
     },
     description="Get the status and results of a batch conversion job",
 )
-async def get_batch_conversion_job_status(job_id: str):
+async def get_batch_conversion_job_status(
+    job_id: str,
+    include_page_numbers: bool = Query(
+        True,
+        description="Whether to include page numbers in the markdown"
+    ),
+):
     try:
-        result = document_converter_service.get_batch_conversion_task_result(job_id)
+        # Attempt to get the batch job status and results
+        result = document_converter_service.get_batch_conversion_task_result(
+            job_id=job_id,
+            include_page_numbers=include_page_numbers,
+        )
 
         # Return 202 Accepted if the batch job or any sub-job is still in progress
         if result.status in ["IN_PROGRESS"] or any(
```
```diff
@@ -341,13 +409,18 @@ async def chunk_document_from_job(
         True,
         description="Whether to merge undersized peer chunks (used for internal configuration)"
     ),
+    include_page_numbers: bool = Query(
+        True,
+        description="Whether to include page number references in chunk metadata"
+    ),
 ):
     try:
         # Attempt to get the chunking result
         result = document_converter_service.chunk_document_from_job(
             job_id=job_id,
             max_tokens=max_tokens,
             merge_peers=merge_peers,
+            include_page_numbers=include_page_numbers,
        )
 
         # Return error response if there's an error
```
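
Per the commit message, the chunking methods were extended "to extract and include page number metadata". One plausible way the service could derive the new fields from a chunk's provenance, sketched under the assumption of docling-style items whose `prov` entries carry a `page_no` (an illustration, not the repository's implementation):

```python
def page_span(doc_items):
    """Return (page_numbers, start_page, end_page) for a chunk's items.

    Assumes each item exposes docling-style provenance entries with a
    page_no attribute; returns (None, None, None) when nothing is known.
    """
    pages = sorted({prov.page_no for item in doc_items for prov in item.prov})
    if not pages:
        return None, None, None
    return pages, pages[0], pages[-1]
```
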
```diff
@@ -398,13 +471,18 @@ async def chunk_batch_documents_from_job(
         True,
         description="Whether to merge undersized peer chunks (used for internal configuration)"
     ),
+    include_page_numbers: bool = Query(
+        True,
+        description="Whether to include page number references in chunk metadata"
+    ),
 ):
     try:
-        # Attempt to get the chunking results
+        # Attempt to chunk all documents from the batch job
         results = document_converter_service.chunk_batch_documents_from_job(
             job_id=job_id,
             max_tokens=max_tokens,
             merge_peers=merge_peers,
+            include_page_numbers=include_page_numbers,
         )
 
         # Check if there were errors in the batch
```
```diff
@@ -455,6 +533,7 @@ async def chunk_text_directly(
             filename=request.filename,
             max_tokens=request.max_tokens,
             merge_peers=request.merge_peers,
+            include_page_numbers=request.include_page_numbers,
         )
 
         # Return error response if there's an error
```

document_converter/schema.py (6 additions, 1 deletion)
```diff
@@ -40,6 +40,9 @@ class BatchConversionJobResult(BaseModel):
 class Chunk(BaseModel):
     text: str = Field(..., description="The plain text content of the chunk")
     metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata associated with the chunk")
+    page_numbers: Optional[List[int]] = Field(None, description="List of page numbers this chunk spans across")
+    start_page: Optional[int] = Field(None, description="The page number where this chunk starts")
+    end_page: Optional[int] = Field(None, description="The page number where this chunk ends")
 
 
 class ChunkingResult(BaseModel):
```
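
A minimal sketch of the extended model in use, with illustrative values:

```python
chunk = Chunk(
    text="Revenue grew 12% year over year.",
    metadata={"headings": ["Financials"]},
    page_numbers=[4, 5],  # the chunk crosses a page break
    start_page=4,
    end_page=5,
)
print(chunk.model_dump())
```

All three new fields are `Optional` and default to `None`, so existing consumers that ignore page metadata keep working unchanged.
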
```diff
@@ -54,13 +57,15 @@ class TextChunkingRequest(BaseModel):
     filename: Optional[str] = Field("input.txt", description="A name to identify the source (for reporting purposes)")
     max_tokens: int = Field(512, ge=64, le=2048, description="Maximum number of tokens per chunk")
     merge_peers: bool = Field(True, description="Whether to merge undersized peer chunks")
+    include_page_numbers: bool = Field(True, description="Whether to include page number references in chunk metadata")
 
     class Config:
         json_schema_extra = {
             "example": {
                 "text": "This is the text content that needs to be chunked. It can be as long as needed.",
                 "filename": "example.txt",
                 "max_tokens": 512,
-                "merge_peers": True
+                "merge_peers": True,
+                "include_page_numbers": True
             }
         }
```
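
A request body for the direct text-chunking endpoint can now carry the new flag. For example (the endpoint path is an assumption; the field names come from the model above):

```python
import requests

payload = {
    "text": "This is the text content that needs to be chunked. It can be as long as needed.",
    "filename": "example.txt",
    "max_tokens": 512,
    "merge_peers": True,
    "include_page_numbers": True,  # new field from this commit
}

# Hypothetical endpoint path; only the request schema is shown in the diff.
response = requests.post("http://localhost:8000/documents/chunk-text", json=payload)
print(response.json())
```
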
