Switch from pdfminer to paves to improve robustness and use multiple CPUs #4067

Open · wants to merge 12 commits into main

1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -1,6 +1,7 @@
## 0.18.11-dev0

### Enhancements
+ - **Switch from pdfminer.six to PAVÉS**: increases robustness of PDF extraction and uses multiple CPUs when possible. No more need to patch pdfminer or repair PDFs with pikepdf.

### Features

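For a sense of what the swap looks like downstream, the tests later in this diff import LAParams, LTContainer, and LTTextLine from paves.miner, so PAVÉS evidently exposes a pdfminer.six-style layout API. Below is a minimal sketch of page-level text extraction under that assumption; the `extract` entry point and its `max_workers` argument are drawn from PAVÉS' advertised pdfminer compatibility layer and are not confirmed anywhere in this diff.

```python
# Minimal sketch, not code from this PR. Assumes paves.miner mirrors
# pdfminer.six's layout API: LAParams, LTContainer, and LTTextLine are
# confirmed by the test imports in this diff; `extract` and `max_workers`
# are assumptions about PAVÉS' compatibility layer.
from paves.miner import LAParams, LTContainer, LTTextLine, extract


def iter_text_lines(obj):
    """Recursively yield text-line objects from a layout tree."""
    # Check LTTextLine first: in the pdfminer-style hierarchy it is
    # itself a container, so the order of these checks matters.
    if isinstance(obj, LTTextLine):
        yield obj
    elif isinstance(obj, LTContainer):
        for child in obj:
            yield from iter_text_lines(child)


def page_texts(path):
    """Return each page's text, letting PAVÉS spread pages across CPUs."""
    return [
        "".join(line.get_text() for line in iter_text_lines(page))
        for page in extract(path, LAParams(), max_workers=4)  # assumed arg
    ]
```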
4 changes: 2 additions & 2 deletions requirements/extra-pdf-image.in
@@ -4,8 +4,8 @@
onnx>=1.17.0
onnxruntime>=1.19.0
pdf2image
- pdfminer.six
- pikepdf
+ paves
+ playa-pdf>=0.6.2
pi_heif
pypdf
google-cloud-vision
27 changes: 8 additions & 19 deletions requirements/extra-pdf-image.txt
@@ -33,8 +33,6 @@ cryptography==45.0.5
# pdfminer-six
cycler==0.12.1
# via matplotlib
- deprecated==1.2.18
-     # via pikepdf
effdet==0.4.1
# via -r ./extra-pdf-image.in
filelock==3.18.0
@@ -88,10 +86,6 @@ jinja2==3.1.6
# via torch
kiwisolver==1.4.8
# via matplotlib
- lxml==6.0.0
-     # via
-     #   -c requirements/base.txt
-     #   pikepdf
markupsafe==3.0.2
# via jinja2
matplotlib==3.10.3
@@ -134,30 +128,30 @@ packaging==25.0
# huggingface-hub
# matplotlib
# onnxruntime
-     #   pikepdf
# transformers
# unstructured-pytesseract
pandas==2.3.1
# via unstructured-inference
+ paves==0.6.1
+     # via -r extra-pdf-image.in
pdf2image==1.17.0
-     # via -r ./extra-pdf-image.in
+     # via -r extra-pdf-image.in
pdfminer-six==20250327
# via
-     #   -c requirements/deps/constraints.txt
-     #   -r ./extra-pdf-image.in
+     #   -c ./deps/constraints.txt
# unstructured-inference
pi-heif==1.0.0
-     # via -r ./extra-pdf-image.in
- pikepdf==9.9.0
-     # via -r ./extra-pdf-image.in
+     # via -r extra-pdf-image.in
pillow==11.3.0
# via
# matplotlib
+     #   paves
# pdf2image
# pi-heif
-     #   pikepdf
# torchvision
# unstructured-pytesseract
+ playa-pdf==0.6.2
+     # via paves
proto-plus==1.26.1
# via
# google-api-core
@@ -274,7 +268,6 @@ typing-extensions==4.14.1
# -c requirements/base.txt
# huggingface-hub
# onnx
-     #   pypdf
# torch
tzdata==2025.2
# via pandas
@@ -287,7 +280,3 @@ urllib3==2.5.0
# -c requirements/base.txt
# -c requirements/deps/constraints.txt
# requests
- wrapt==1.17.2
-     # via
-     #   -c requirements/base.txt
-     #   deprecated
4 changes: 2 additions & 2 deletions test_unstructured/partition/pdf_image/test_pdf.py
@@ -1224,14 +1224,14 @@ def test_partition_pdf_with_fast_finds_headers_footers():
@pytest.mark.parametrize(
("filename", "expected_log"),
[
- # This one is *actually* an invalid PDF document
+ # This one is *actually* an invalid PDF document, but we no longer need to repair it
("invalid-pdf-structure-pdfminer-entire-doc.pdf", "Repairing the PDF document ..."),
],
)
def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_log, caplog):
caplog.set_level(logging.INFO)
assert pdf.extractable_elements(filename=example_doc_path(f"pdf/{filename}"))
- assert expected_log in caplog.text
+ assert expected_log not in caplog.text


@pytest.mark.parametrize(
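The flipped assertion above is the behavioral core of this PR: a structurally invalid PDF now parses without a pikepdf repair pass, so the "Repairing the PDF document ..." message never hits the log. Here is a minimal sketch of exercising that path through unstructured's public partition_pdf API; the example path mirrors the test fixture and is an assumption about the repo layout, and any damaged-but-parsable PDF would do.

```python
import logging

from unstructured.partition.pdf import partition_pdf

logging.basicConfig(level=logging.INFO)

# With PAVÉS underneath, this structurally invalid fixture is parsed
# directly; no "Repairing the PDF document ..." line should be logged.
# The path assumes the repo's example-docs layout seen in the tests.
elements = partition_pdf(
    filename="example-docs/pdf/invalid-pdf-structure-pdfminer-entire-doc.pdf",
    strategy="fast",  # the text-extraction path that used pdfminer.six
)
print(f"{len(elements)} elements extracted with no repair step")
```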
@@ -2,7 +2,7 @@

import numpy as np
import pytest
- from pdfminer.layout import LAParams
+ from paves.miner import LAParams
from PIL import Image
from unstructured_inference.constants import Source as InferenceSource
from unstructured_inference.inference.elements import (
@@ -1,6 +1,6 @@
from unittest.mock import MagicMock

- from pdfminer.layout import LTContainer, LTTextLine
+ from paves.miner import LTContainer, LTTextLine

from unstructured.partition.pdf_image.pdfminer_utils import extract_text_objects

4 changes: 2 additions & 2 deletions test_unstructured/partition/test_msg.py
@@ -125,8 +125,8 @@ def test_partition_msg_can_process_attachments():

assert all(e.metadata.filename == "fake-email-multiple-attachments.msg" for e in elements[:5])
assert all(e.metadata.filename == "unstructured_logo.png" for e in elements[5:7])
- assert all(e.metadata.filename == "dense_doc.pdf" for e in elements[7:343])
- assert all(e.metadata.filename == "Engineering Onboarding.pptx" for e in elements[343:])
+ assert all(e.metadata.filename == "dense_doc.pdf" for e in elements[7:341])
+ assert all(e.metadata.filename == "Engineering Onboarding.pptx" for e in elements[341:])
assert [e.text for e in elements[:5]] == [
"Here are those documents.",
"--",
@@ -58,12 +58,12 @@
<div class="CompositeElement" id="4204154eefaa843f79edc96dcc208054">
In this paper, we address the question: can we train a better dense embedding model using only pairs of questions and passages (or answers), with- out additional pretraining? By leveraging the now standard BERT pretrained model (Devlin et al., 2019) and a dual-encoder architecture (Bromley et al., 1994), we focus on developing the right training scheme using a relatively small number of question and passage pairs. Through a series of careful ablation studies, our final solution is surprisingly simple: the embedding is optimized for maximizing inner products of the question and relevant passage vectors, with an objective compar- ing all pairs of questions and passages in a batch. Our Dense Passage Retriever (DPR) is exception- ally strong. It not only outperforms BM25 by a large margin (65.2% vs. 42.9% in Top-5 accuracy), but also results in a substantial improvement on the end-to-end QA accuracy compared to ORQA (41.5% vs. 33.3%) in the open Natural Questions setting (Lee et al., 2019; Kwiatkowski et al., 2019). Our contributions are twofold. First, we demon- strate that with the proper training setup, sim- ply fine-tuning the question and passage encoders on existing question-passage pairs is sufficient to greatly outperform BM25. Our empirical results also suggest that additional pretraining may not be needed. Second, we verify that, in the context of open-domain question answering, a higher retrieval precision indeed translates to a higher end-to-end QA accuracy. By applying a modern reader model to the top retrieved passages, we achieve compara- ble or better results on multiple QA datasets in the open-retrieval setting, compared to several, much complicated systems.
</div>
- <div class="CompositeElement" id="e6dee1abec28f8ff365ab6275b3e5f0e">
+ <div class="CompositeElement" id="c2959a06eb5a6864c4f0c7d38e21b2e9">
2 Background

The problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as “Who first voiced Meg on Family Guy?” or “Where was the 8th Dalai Lama born?”, a system is required to answer it using a large corpus of diversified topics. More specifically, we assume

- the extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k (cid:28) |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question.
+ the extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question.

e
</div>
@@ -43,7 +43,7 @@ In this paper, we address the question: can we train a better dense embedding mo

The problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as “Who first voiced Meg on Family Guy?” or “Where was the 8th Dalai Lama born?”, a system is required to answer it using a large corpus of diversified topics. More specifically, we assume

- the extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k (cid:28) |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question.
+ the extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question.

e
3 Dense Passage Retriever (DPR)
@@ -111,8 +111,8 @@
},
{
"type": "CompositeElement",
"element_id": "e6dee1abec28f8ff365ab6275b3e5f0e",
"text": "2 Background\n\nThe problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as “Who first voiced Meg on Family Guy?” or “Where was the 8th Dalai Lama born?”, a system is required to answer it using a large corpus of diversified topics. More specifically, we assume\n\nthe extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k (cid:28) |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question.\n\ne",
"element_id": "c2959a06eb5a6864c4f0c7d38e21b2e9",
"text": "2 Background\n\nThe problem of open-domain QA studied in this paper can be described as follows. Given a factoid question, such as “Who first voiced Meg on Family Guy?” or “Where was the 8th Dalai Lama born?”, a system is required to answer it using a large corpus of diversified topics. More specifically, we assume\n\nthe extractive QA setting, in which the answer is restricted to a span appearing in one or more pas- sages in the corpus. Assume that our collection contains D documents, d1,d2,··· ,dD. We first split each of the documents into text passages of equal lengths as the basic retrieval units3 and get M total passages in our corpus C = {p1,p2,...,pM}, where each passage pi can be viewed as a sequence 2 ,··· ,w(i) 1 ,w(i) of tokens w(i) |pi|. Given a question q, the task is to find a span w(i) s+1,··· ,w(i) s ,w(i) from one of the passages pi that can answer the question. Notice that to cover a wide variety of domains, the corpus size can easily range from millions of docu- ments (e.g., Wikipedia) to billions (e.g., the Web). As a result, any open-domain QA system needs to include an efficient retriever component that can se- lect a small set of relevant texts, before applying the reader to extract the answer (Chen et al., 2017).4 Formally speaking, a retriever R : (q,C) → CF is a function that takes as input a question q and a corpus C and returns a much smaller filter set of texts CF ⊂ C, where |CF| = k |C|. For a fixed k, a retriever can be evaluated in isolation on top-k retrieval accuracy, which is the fraction of ques- tions for which CF contains a span that answers the question.\n\ne",
"metadata": {
"data_source": {
"record_locator": {