From 0ff074c24c3a96679a0555a98053c3277c9bdea8 Mon Sep 17 00:00:00 2001 From: CyMule Date: Tue, 8 Jul 2025 16:09:09 -0400 Subject: [PATCH 1/2] perf: add early page count check to prevent expensive PDFMiner processing --- unstructured/partition/pdf.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index d38658ed64..0d4063f5ae 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -290,6 +290,27 @@ def partition_pdf_or_image( validate_strategy(strategy, is_image) + # Early page count check for strategies that will use HI_RES + # This prevents expensive PDFMiner processing for documents that exceed page limits + pdf_hi_res_max_pages = kwargs.get("pdf_hi_res_max_pages") + if pdf_hi_res_max_pages is not None: + # Check if this strategy will result in HI_RES processing + will_use_hi_res = False + + if strategy == PartitionStrategy.HI_RES: + will_use_hi_res = True + elif strategy == PartitionStrategy.AUTO: + # AUTO resolves to HI_RES in these cases: + extract_element = extract_images_in_pdf or bool(extract_image_block_types) + will_use_hi_res = is_image or infer_table_structure or extract_element + + if will_use_hi_res: + check_pdf_hi_res_max_pages_exceeded( + filename=filename, + file=file, + pdf_hi_res_max_pages=pdf_hi_res_max_pages, + ) + last_modified = get_last_modified_date(filename) if filename else None pdfminer_config = PDFMinerConfig( line_margin=pdfminer_line_margin, From b1b99029b08d7a0d12e1bd455bd87be7296043f2 Mon Sep 17 00:00:00 2001 From: CyMule Date: Tue, 8 Jul 2025 16:13:40 -0400 Subject: [PATCH 2/2] changelog --- CHANGELOG.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7d45affb38..0af7658fca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +## 0.18.6 + +### Enhancements +- **Optimize PDF processing with early page count check** Prevents expensive PDFMiner processing for documents that exceed page limits by checking page count before strategy execution when `pdf_hi_res_max_pages` is set. + +### Features + +### Fixes + ## 0.18.5-dev0 ### Enhancements