mozilla · calixteman · Jun 16, 2025 · fnlctrl · Jun 16, 2025 · calixteman
diff --git a/src/core/evaluator.js b/src/core/evaluator.js
 args = [ 
   args[0].name, 
   args[1] instanceof Dict ? args[1].get("MCID") : null, 
 ]; 
 args = [ 
   args[0].name, 
   args[1] instanceof Dict ? args[1].get("MCID") : null, 
 ]; 
@@ -2406,6 +2406,7 @@ class PartialEvaluator {
       transform: null,
       fontName: null,
       hasEOL: false,
+      span: "",
     };
 
     // Use a circular buffer (length === 2) to save the last chars in the
@@ -3070,6 +3071,19 @@ class PartialEvaluator {
       textContentItem.str.length = 0;
     }
 
+    /**
+     * Substitutes the strings accumulated in `textContentItem.str` with the
+     * pending `textContentItem.span` value, then clears that pending value.
+     *
+     * Nothing is substituted when no span text is pending, or when the span
+     * text consists solely of whitespace.
+     */
+    function replaceTextContentBySpan() {
+      const actualText = textContentItem.span;
+      if (actualText) {
+        // The pending value is consumed even if it ends up unused below.
+        textContentItem.span = "";
+        if (!/^\s+$/.test(actualText)) {
+          const chunks = textContentItem.str;
+          chunks.length = 0;
+          chunks.push(actualText);
+        }
+      }
+    }
+
     function enqueueChunk(batch = false) {
       const length = textContent.items.length;
       if (length === 0) {
@@ -3446,6 +3460,11 @@ class PartialEvaluator {
             return;
           case OPS.beginMarkedContent:
             flushTextContentItem();
+            // For a "Span" tag, store the /ActualText entry of the second
+            // operand (converted with stringToPDFString) on the current text
+            // item; an absent or falsy entry is stored as "".
+            // NOTE(review): BMC presumably carries only a tag operand, so
+            // args[1] may always be undefined here -- confirm.
+            if (args[0]?.name === "Span") {
+              const rawActualText = args[1]?.get("ActualText") || "";
+              textContentItem.span = stringToPDFString(rawActualText);
+            }
             if (includeMarkedContent) {
               markedContentData.level++;
 
@@ -3457,6 +3476,11 @@ class PartialEvaluator {
             break;
           case OPS.beginMarkedContentProps:
             flushTextContentItem();
+            // For a "Span" tag, store the /ActualText entry of the property
+            // operand (converted with stringToPDFString) on the current text
+            // item; anything unusable is stored as "".
+            if (args[0]?.name === "Span") {
+              // Optional chaining only guards against null/undefined: a
+              // non-Dict operand without a `get` method would still throw,
+              // so check the type explicitly.
+              const actualText =
+                args[1] instanceof Dict ? args[1].get("ActualText") : null;
+              // Only string values are handed to stringToPDFString.
+              textContentItem.span =
+                typeof actualText === "string"
+                  ? stringToPDFString(actualText)
+                  : "";
+            }
             if (includeMarkedContent) {
               markedContentData.level++;
 
@@ -3474,6 +3498,7 @@ class PartialEvaluator {
             }
             break;
           case OPS.endMarkedContent:
+            replaceTextContentBySpan();
             flushTextContentItem();
             if (includeMarkedContent) {
               if (markedContentData.level === 0) {

diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore
@@ -726,3 +726,4 @@
 !chrome-text-selection-markedContent.pdf
 !bug1963407.pdf
 !issue19517.pdf
+!issue20007.pdf
diff --git a/test/pdfs/issue20007.pdf b/test/pdfs/issue20007.pdf
diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js
@@ -3923,6 +3923,20 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
       expect(items[1].fontName).not.toEqual(items[0].fontName);
     });
 
+    // Loads issue20007.pdf, extracts the text of page 1 with normalization
+    // disabled, and checks the merged text against the expected sentence.
+    it("gets text content from /ActualText", async function () {
+      const params = buildGetDocumentParams("issue20007.pdf");
+      const loadingTask = getDocument(params);
+      const doc = await loadingTask.promise;
+      const firstPage = await doc.getPage(1);
+
+      const textContent = await firstPage.getTextContent({
+        disableNormalization: true,
+      });
+      expect(mergeText(textContent.items)).toEqual(
+        "The quick brown fox jumps over the lazy dog"
+      );
+
+      await loadingTask.destroy();
+    });
+
     it("gets empty structure tree", async function () {
       const tree = await page.getStructTree();