@@ -47,20 +47,32 @@ async def convert_single_document(
         le=4,
         description="Scale factor for image resolution (1-4)"
     ),
+    include_page_numbers: bool = Query(
+        False,
+        description="Whether to include page numbers in the markdown"
+    ),
 ):
-    file_bytes = await document.read()
-    if not is_file_format_supported(file_bytes, document.filename):
+    try:
+        # Read the file content
+        file_content = await document.read()
+
+        # Convert the document
+        result = document_converter_service.convert_document(
+            document=(document.filename, BytesIO(file_content)),
+            extract_tables=extract_tables_as_images,
+            image_resolution_scale=image_resolution_scale,
+            include_page_numbers=include_page_numbers,
+        )
+
+        # Return the result
+        return result
+    except Exception as e:
+        logging.error(f"Error in convert_single_document: {str(e)}")
         raise HTTPException(
-            status_code=status.HTTP_400_BAD_REQUEST,
-            detail=f"Unsupported file format: {document.filename}"
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error converting document: {str(e)}"
         )
-
-    return document_converter_service.convert_document(
-        (document.filename, BytesIO(file_bytes)),
-        extract_tables=extract_tables_as_images,
-        image_resolution_scale=image_resolution_scale,
-    )
-
 
 @router.post(
     '/documents/batch-convert',
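
For reference, the single-document route now takes the new flag as a query parameter (defaulting to False here, unlike the True default used on the endpoints below). A minimal client sketch, assuming the route is mounted at /documents/convert on a local server; neither the path nor the host appears in this hunk:

import requests

# Hypothetical client call; the URL is an assumption.
with open("report.pdf", "rb") as f:
    response = requests.post(
        "http://localhost:8000/documents/convert",
        files={"document": ("report.pdf", f, "application/pdf")},
        params={
            "extract_tables_as_images": True,
            "image_resolution_scale": 4,
            "include_page_numbers": True,  # flag introduced in this diff
        },
    )
response.raise_for_status()
print(response.json())
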
@@ -86,22 +98,34 @@ async def convert_multiple_documents(
         le=4,
         description="Scale factor for image resolution (1-4)"
     ),
+    include_page_numbers: bool = Query(
+        True,
+        description="Whether to include page numbers in the markdown"
+    ),
 ):
-    doc_streams = []
-    for document in documents:
-        file_bytes = await document.read()
-        if not is_file_format_supported(file_bytes, document.filename):
-            raise HTTPException(
-                status_code=status.HTTP_400_BAD_REQUEST,
-                detail=f"Unsupported file format: {document.filename}"
-            )
-        doc_streams.append((document.filename, BytesIO(file_bytes)))
-
-    return document_converter_service.convert_documents(
-        doc_streams,
-        extract_tables=extract_tables_as_images,
-        image_resolution_scale=image_resolution_scale,
-    )
+    try:
+        # Read all files and prepare for batch conversion
+        document_data = []
+        for document in documents:
+            file_content = await document.read()
+            document_data.append((document.filename, BytesIO(file_content)))
+
+        # Convert all documents
+        results = document_converter_service.convert_documents(
+            documents=document_data,
+            extract_tables=extract_tables_as_images,
+            image_resolution_scale=image_resolution_scale,
+            include_page_numbers=include_page_numbers,
+        )
+
+        # Return the results
+        return results
+    except Exception as e:
+        logging.error(f"Error in convert_multiple_documents: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error converting documents: {str(e)}"
+        )
 
 
 # Asynchronous conversion jobs endpoints
@@ -128,24 +152,36 @@ async def create_single_document_conversion_job(
         le=4,
         description="Scale factor for image resolution (1-4)"
     ),
+    include_page_numbers: bool = Query(
+        True,
+        description="Whether to include page numbers in the markdown"
+    ),
 ):
-    file_bytes = await document.read()
-    if not is_file_format_supported(file_bytes, document.filename):
-        raise HTTPException(
-            status_code=status.HTTP_400_BAD_REQUEST,
-            detail=f"Unsupported file format: {document.filename}"
+    try:
+        # Read the file content
+        file_content = await document.read()
+
+        # Import the task function
+        from worker.tasks import convert_document_task
+
+        # Queue the conversion task
+        task = convert_document_task.delay(
+            document=(document.filename, file_content),
+            extract_tables=extract_tables_as_images,
+            image_resolution_scale=image_resolution_scale,
+            include_page_numbers=include_page_numbers,
         )
 
-    task = convert_document_task.delay(
-        (document.filename, file_bytes),
-        extract_tables=extract_tables_as_images,
-        image_resolution_scale=image_resolution_scale,
-    )
-
-    return ConversionJobResult(
-        job_id=task.id,
-        status="IN_PROGRESS"
-    )
+        return ConversionJobResult(
+            job_id=task.id,
+            status="IN_PROGRESS"
+        )
+    except Exception as e:
+        logging.error(f"Error in create_single_document_conversion_job: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error creating conversion job: {str(e)}"
+        )
 
 
 @router.get(
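
Note that this endpoint hands convert_document_task.delay() the raw file_content bytes rather than a BytesIO stream, which keeps the task arguments serializable for the queue. A minimal sketch of what the worker-side task might look like, assuming Celery and that the task forwards to the same converter service; the module layout, app name, and service import are assumptions, since worker/tasks.py is not part of this diff:

# worker/tasks.py (hypothetical sketch; not shown in this diff)
from io import BytesIO

from celery import Celery

from services import document_converter_service  # assumed import path

app = Celery("worker")  # broker/backend configuration omitted

@app.task
def convert_document_task(document, extract_tables, image_resolution_scale,
                          include_page_numbers):
    filename, file_content = document  # raw bytes travel through the broker
    # Re-wrap the bytes in a stream before handing them to the converter.
    return document_converter_service.convert_document(
        document=(filename, BytesIO(file_content)),
        extract_tables=extract_tables,
        image_resolution_scale=image_resolution_scale,
        include_page_numbers=include_page_numbers,
    )
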
@@ -159,9 +195,19 @@ async def create_single_document_conversion_job(
     },
     description="Get the status and result of a single document conversion job",
 )
-async def get_conversion_job_status(job_id: str):
+async def get_conversion_job_status(
+    job_id: str,
+    include_page_numbers: bool = Query(
+        True,
+        description="Whether to include page numbers in the markdown"
+    ),
+):
     try:
-        result = document_converter_service.get_single_document_task_result(job_id)
+        # Attempt to get the job status and result
+        result = document_converter_service.get_single_document_task_result(
+            job_id=job_id,
+            include_page_numbers=include_page_numbers,
+        )
 
         # Return 202 Accepted if job is still in progress
         if result.status in ["IN_PROGRESS"]:
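
Because include_page_numbers is accepted when the result is retrieved, a finished job can apparently be rendered with or without page numbers on each poll. A polling sketch; the /conversion-jobs/{job_id} path is a guess, as the route string sits outside this hunk:

import time

import requests

# Hypothetical polling loop; the URL is an assumption.
job_id = "abc123"  # as returned in ConversionJobResult.job_id
job_url = f"http://localhost:8000/conversion-jobs/{job_id}"
while True:
    response = requests.get(job_url, params={"include_page_numbers": True})
    if response.status_code != 202:  # 202 Accepted means still in progress
        break
    time.sleep(2)
response.raise_for_status()
print(response.json())
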
@@ -212,27 +258,39 @@ async def create_batch_conversion_job(
         le=4,
         description="Scale factor for image resolution (1-4)"
     ),
+    include_page_numbers: bool = Query(
+        True,
+        description="Whether to include page numbers in the markdown"
+    ),
 ):
-    doc_data = []
-    for document in documents:
-        file_bytes = await document.read()
-        if not is_file_format_supported(file_bytes, document.filename):
-            raise HTTPException(
-                status_code=status.HTTP_400_BAD_REQUEST,
-                detail=f"Unsupported file format: {document.filename}"
-            )
-        doc_data.append((document.filename, file_bytes))
-
-    task = convert_documents_task.delay(
-        doc_data,
-        extract_tables=extract_tables_as_images,
-        image_resolution_scale=image_resolution_scale,
-    )
+    try:
+        # Read all files and prepare for batch conversion
+        document_data = []
+        for document in documents:
+            file_content = await document.read()
+            document_data.append((document.filename, file_content))
+
+        # Import the task function
+        from worker.tasks import convert_documents_task
+
+        # Queue the batch conversion task
+        task = convert_documents_task.delay(
+            documents=document_data,
+            extract_tables=extract_tables_as_images,
+            image_resolution_scale=image_resolution_scale,
+            include_page_numbers=include_page_numbers,
+        )
 
-    return BatchConversionJobResult(
-        job_id=task.id,
-        status="IN_PROGRESS"
-    )
+        return BatchConversionJobResult(
+            job_id=task.id,
+            status="IN_PROGRESS"
+        )
+    except Exception as e:
+        logging.error(f"Error in create_batch_conversion_job: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Error creating batch conversion job: {str(e)}"
+        )
 
 
 @router.get(
@@ -246,9 +304,19 @@ async def create_batch_conversion_job(
     },
     description="Get the status and results of a batch conversion job",
 )
-async def get_batch_conversion_job_status(job_id: str):
+async def get_batch_conversion_job_status(
+    job_id: str,
+    include_page_numbers: bool = Query(
+        True,
+        description="Whether to include page numbers in the markdown"
+    ),
+):
     try:
-        result = document_converter_service.get_batch_conversion_task_result(job_id)
+        # Attempt to get the batch job status and results
+        result = document_converter_service.get_batch_conversion_task_result(
+            job_id=job_id,
+            include_page_numbers=include_page_numbers,
+        )
 
         # Return 202 Accepted if the batch job or any sub-job is still in progress
         if result.status in ["IN_PROGRESS"] or any(
@@ -341,13 +409,18 @@ async def chunk_document_from_job(
         True,
         description="Whether to merge undersized peer chunks (used for internal configuration)"
     ),
+    include_page_numbers: bool = Query(
+        True,
+        description="Whether to include page number references in chunk metadata"
+    ),
 ):
     try:
         # Attempt to get the chunking result
         result = document_converter_service.chunk_document_from_job(
             job_id=job_id,
             max_tokens=max_tokens,
             merge_peers=merge_peers,
+            include_page_numbers=include_page_numbers,
         )
 
         # Return error response if there's an error
@@ -398,13 +471,18 @@ async def chunk_batch_documents_from_job(
         True,
         description="Whether to merge undersized peer chunks (used for internal configuration)"
     ),
+    include_page_numbers: bool = Query(
+        True,
+        description="Whether to include page number references in chunk metadata"
+    ),
 ):
     try:
-        # Attempt to get the chunking results
+        # Attempt to chunk all documents from the batch job
        results = document_converter_service.chunk_batch_documents_from_job(
             job_id=job_id,
             max_tokens=max_tokens,
             merge_peers=merge_peers,
+            include_page_numbers=include_page_numbers,
         )
 
         # Check if there were errors in the batch
@@ -455,6 +533,7 @@ async def chunk_text_directly(
             filename=request.filename,
             max_tokens=request.max_tokens,
             merge_peers=request.merge_peers,
+            include_page_numbers=request.include_page_numbers,
         )
 
         # Return error response if there's an error
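
The direct-text endpoint reads the flag from its request body rather than a query parameter, which implies the corresponding Pydantic request model gains a matching field. A sketch of that model; its name and any fields beyond the attributes accessed above are assumptions:

from pydantic import BaseModel, Field

# Hypothetical request model; only the attribute names are visible in the diff.
class ChunkTextRequest(BaseModel):
    text: str  # assumed field carrying the raw text to chunk
    filename: str
    max_tokens: int
    merge_peers: bool = True
    include_page_numbers: bool = Field(
        True,
        description="Whether to include page number references in chunk metadata",
    )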