diff --git a/src/core/evaluator.js b/src/core/evaluator.js
index edf02c2c052bf..5cff6bdac149a 100644
--- a/src/core/evaluator.js
+++ b/src/core/evaluator.js
@@ -2406,6 +2406,7 @@ class PartialEvaluator {
       transform: null,
       fontName: null,
       hasEOL: false,
+      span: "",
     };
 
     // Use a circular buffer (length === 2) to save the last chars in the
@@ -3070,6 +3071,49 @@ class PartialEvaluator {
       textContentItem.str.length = 0;
     }
 
+    /**
+     * Replaces the glyphs collected for the current text item with the
+     * /ActualText recorded at the start of a "Span" marked-content sequence.
+     * Nothing happens when no replacement text is pending; a replacement
+     * made only of whitespace is discarded and the glyphs are kept.
+     */
+    function replaceTextContentBySpan() {
+      const { span, str } = textContentItem;
+      if (!span) {
+        return;
+      }
+      // The replacement is consumed, whether or not it ends up being used.
+      textContentItem.span = "";
+      if (/^\s+$/.test(span)) {
+        return;
+      }
+      str.length = 0;
+      str.push(span);
+    }
+
+    /**
+     * Records the /ActualText of a "Span" marked-content sequence, so that
+     * it can replace the extracted glyphs once the sequence ends.
+     * Sequences with any other tag leave the pending text untouched.
+     *
+     * @param {Array} args - The operands of the `BMC`/`BDC` operator: the
+     *   tag and, for `BDC` only, the property list. The property list can
+     *   be an inline dictionary or a name; a name is not resolved here and
+     *   is treated as having no /ActualText.
+     */
+    function setSpanActualText(args) {
+      if (args[0]?.name !== "Span") {
+        return;
+      }
+      const properties = args[1];
+      // A name operand has no `get` method, calling it would throw.
+      const actualText =
+        typeof properties?.get === "function"
+          ? properties.get("ActualText")
+          : null;
+      textContentItem.span =
+        typeof actualText === "string" ? stringToPDFString(actualText) : "";
+    }
+
     function enqueueChunk(batch = false) {
       const length = textContent.items.length;
       if (length === 0) {
@@ -3446,6 +3490,7 @@ class PartialEvaluator {
             return;
           case OPS.beginMarkedContent:
             flushTextContentItem();
+            setSpanActualText(args);
             if (includeMarkedContent) {
               markedContentData.level++;
 
@@ -3457,6 +3502,7 @@ class PartialEvaluator {
             break;
           case OPS.beginMarkedContentProps:
             flushTextContentItem();
+            setSpanActualText(args);
             if (includeMarkedContent) {
               markedContentData.level++;
 
@@ -3474,6 +3520,7 @@ class PartialEvaluator {
             }
             break;
           case OPS.endMarkedContent:
+            replaceTextContentBySpan();
             flushTextContentItem();
             if (includeMarkedContent) {
               if (markedContentData.level === 0) {
diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore
index e1a6e57ad44e8..75ccda1bd9b39 100644
--- a/test/pdfs/.gitignore
+++ b/test/pdfs/.gitignore
@@ -726,3 +726,4 @@
 !chrome-text-selection-markedContent.pdf
 !bug1963407.pdf
 !issue19517.pdf
+!issue20007.pdf
diff --git a/test/pdfs/issue20007.pdf b/test/pdfs/issue20007.pdf
new file mode 100644
index 0000000000000..dddf67441a9bb
Binary files /dev/null and b/test/pdfs/issue20007.pdf differ
diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js
index 6610de8a7ca3c..6fe47f3a06c93 100644
--- a/test/unit/api_spec.js
+++ b/test/unit/api_spec.js
@@ -3923,6 +3923,20 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
       expect(items[1].fontName).not.toEqual(items[0].fontName);
     });
 
+    it("gets text content from /ActualText", async function () {
+      const loadingTask = getDocument(buildGetDocumentParams("issue20007.pdf"));
+      const pdfDoc = await loadingTask.promise;
+      const pdfPage = await pdfDoc.getPage(1);
+
+      const { items } = await pdfPage.getTextContent({
+        disableNormalization: true,
+      });
+      const text = mergeText(items);
+      expect(text).toEqual("The quick brown fox jumps over the lazy dog");
+
+      await loadingTask.destroy();
+    });
+
     it("gets empty structure tree", async function () {
       const tree = await page.getStructTree();
 