From c5697e12a76d44e7b9224457c7db64fc1917ef55 Mon Sep 17 00:00:00 2001 From: tguerin Date: Mon, 10 Jul 2023 11:42:16 +0200 Subject: [PATCH] [TIKA-4098] Detection fails on PDF with garbage before header --- .../org/apache/tika/mime/tika-mimetypes.xml | 4 ++-- .../apache/tika/parser/pdf/PDFParserTest.java | 8 +++++++- .../testPDF_garbageBeforeHeader.pdf | Bin 0 -> 1654 bytes 3 files changed, 9 insertions(+), 3 deletions(-) create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_garbageBeforeHeader.pdf diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index 75c8eba0d5..6d9c82a281 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -755,8 +755,8 @@ - - + + diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index cb37992bcc..e3f2366220 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -140,9 +140,15 @@ public void testFontNameExtraction() throws Exception { assertContains("ABCDEE+Calibri", r.metadata.get(Font.FONT_NAME)); } + @Test + public void testGarbageBeforeHeader() throws Exception { + Metadata metadata = getXML("testPDF_garbageBeforeHeader.pdf").metadata; + assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE)); + } + @Test public void testPdfParsingMetadataOnly() throws Exception { - + Metadata metadata = getXML("testPDF.pdf").metadata; assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("Bertrand Delacr\u00e9taz", metadata.get(TikaCoreProperties.CREATOR)); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_garbageBeforeHeader.pdf b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_garbageBeforeHeader.pdf new file mode 100644 index 0000000000000000000000000000000000000000..41e88afcd197e0316620d6b36f510b5d1d1b4a67 GIT binary patch literal 1654 zcmb7EO^6&t6b3<%iW|g(5WN(wLlCX5>i?eEo{l>+J7nFsiIYu477sPu)ibUAm+qR( ztQQZ8CedF#1R*zK`mbcvu#z6KeGp#iF2{CY&W4ZFCyYA%S zhBPl2Z%XI3yU3PSx$taCoJ}|uW(ff2gm}Y?EH6Y@Q&MCiL6&%_A0~r&6f8JRd>W;R zh}~s{bi#GKX#qV}WNA_`*YQ(t6pNC!Lmn}=lX^Yov%Bmk%%_M_mHG-ZRmD(iIyu|`3A+xS5wZWZ zpyYP7u$|MS2w0BRXZ$x{KTK%8yXbSJmj9FPZK@^pIOC|`IioQEHKAjq$U3Oq`0?-W zfC33j`_BT$0l4fO#BIuj7zO~JpUMW}(i@+l16Dv)xc7kT0+#q?Sh>D6yz&EhGxQ6% zqm}N@0NhST35Uvoz|wL!5a1`6AJSGjfjdMdu)K}BA*+^QYS1FKY?_L$iyeC@r<_4j zCYB3f&N35!sz$GXdX*^_{YpomU_bTJVV*%Zj5y07?nYwM4if{y06xW%0Uts| zv&+1v{H=h=+wtekqaTyI@4Wu)yT9JGx%a`V?QdSWcI=Nkp4@9aeQEE_-!9+xvUKUK z;~$;=;%m40*A@Br>&lCrbMmM6K6L+}6R#MHjWFNEaLzp6> zmi<)oFs56gwx#_HMJQ|;)5S#0F`iR#M&;FT^wLx)xZ(?Mbl{F-T6M#2YpoVBjJ9gk s*G!^on$|Kq%DQIPTQ2->7a`=c;qy#TQ7KFu`xQ5)u7O%@{d5=n3kKi&x&QzG literal 0 HcmV?d00001