From 37a40f8ca99d7ab7389e6cf13e43a1929103317c Mon Sep 17 00:00:00 2001 From: Swetha Muthuvel Date: Fri, 4 Jul 2025 13:05:02 +0530 Subject: [PATCH 1/2] Add option to skip corrupt PDFs in PDFMergerUtility with improved exception handling. --- .../pdfbox/multipdf/PDFMergerUtility.java | 106 +++++++++++++----- 1 file changed, 75 insertions(+), 31 deletions(-) diff --git a/pdfbox/src/main/java/org/apache/pdfbox/multipdf/PDFMergerUtility.java b/pdfbox/src/main/java/org/apache/pdfbox/multipdf/PDFMergerUtility.java index 491607e3b1f..63a723e40a1 100644 --- a/pdfbox/src/main/java/org/apache/pdfbox/multipdf/PDFMergerUtility.java +++ b/pdfbox/src/main/java/org/apache/pdfbox/multipdf/PDFMergerUtility.java @@ -102,6 +102,24 @@ public class PDFMergerUtility private DocumentMergeMode documentMergeMode = DocumentMergeMode.PDFBOX_LEGACY_MODE; private AcroFormMergeMode acroFormMergeMode = AcroFormMergeMode.PDFBOX_LEGACY_MODE; + private boolean skipCorruptFiles = false; + + /** + * Set to true to skip corrupt PDF files instead of failing the whole merge. + * + * @param skip true to skip corrupt files + */ + public void setSkipCorruptFiles(boolean skip) { + this.skipCorruptFiles = skip; + } + + /** + *@return true if corrupt files should be skipped + */ + public boolean isSkipCorruptFiles() { + return skipCorruptFiles; + } + /** * The mode to use when merging documents: * @@ -372,28 +390,44 @@ else if (documentMergeMode == DocumentMergeMode.OPTIMIZE_RESOURCES_MODE) } } - private void optimizedMergeDocuments(StreamCacheCreateFunction streamCacheCreateFunction, - CompressParameters compressParameters) throws IOException - { - StreamCacheCreateFunction strmCacheFunc = streamCacheCreateFunction != null ? streamCacheCreateFunction + private void optimizedMergeDocuments(StreamCacheCreateFunction streamCacheCreateFunction, + CompressParameters compressParameters) throws IOException{ + StreamCacheCreateFunction strmCacheFunc = streamCacheCreateFunction != null + ? streamCacheCreateFunction : IOUtils.createMemoryOnlyStreamCache(); + try (PDDocument destination = new PDDocument(strmCacheFunc)) { PDFCloneUtility cloner = new PDFCloneUtility(destination); PDPageTree destinationPageTree = destination.getPages(); // cache PageTree + for (Object sourceObject : sources) { PDDocument sourceDoc = null; try { - if (sourceObject instanceof File) - { - sourceDoc = Loader.loadPDF((File) sourceObject); + //Wrap IOException with context + try { + if (sourceObject instanceof File) { + File file = (File) sourceObject; + sourceDoc = Loader.loadPDF(file); + } else { + sourceDoc = Loader.loadPDF((RandomAccessRead) sourceObject); + } } - else - { - sourceDoc = Loader.loadPDF((RandomAccessRead) sourceObject); + catch (IOException e) { + String sourceDesc = (sourceObject instanceof File) + ? ((File) sourceObject).getAbsolutePath() + : "RandomAccessRead source (index: " + sources.indexOf(sourceObject) + ")"; + if (skipCorruptFiles) { + LOG.warn("Skipping corrupt file: {}", sourceDesc, e); + continue; // skip this source and continue with others + } else { + throw new IOException("Failed to load PDF from source: " + sourceDesc, e); + } } + + for (PDPage page : sourceDoc.getPages()) { PDPage newPage = new PDPage(cloner.cloneForNewDocument(page.getCOSObject())); @@ -403,8 +437,6 @@ private void optimizedMergeDocuments(StreamCacheCreateFunction streamCacheCreate PDResources resources = page.getResources(); if (resources != null) { - // this is smart enough to just create references for resources that are used on multiple - // pages newPage.setResources(new PDResources( cloner.cloneForNewDocument(resources.getCOSObject()))); } @@ -420,7 +452,7 @@ private void optimizedMergeDocuments(StreamCacheCreateFunction streamCacheCreate IOUtils.closeQuietly(sourceDoc); } } - + if (destinationStream == null) { destination.save(destinationFileName, compressParameters); @@ -431,6 +463,7 @@ private void optimizedMergeDocuments(StreamCacheCreateFunction streamCacheCreate } } } + /** @@ -441,31 +474,42 @@ private void optimizedMergeDocuments(StreamCacheCreateFunction streamCacheCreate * * @throws IOException If there is an error saving the document. */ - private void legacyMergeDocuments(StreamCacheCreateFunction streamCacheCreateFunction, - CompressParameters compressParameters) throws IOException - { + private void legacyMergeDocuments(StreamCacheCreateFunction streamCacheCreateFunction, + CompressParameters compressParameters) throws IOException{ if (!sources.isEmpty()) { - // Make sure that: - // - first Exception is kept - // - all PDDocuments are closed - // - all FileInputStreams are closed - // - there's a way to see which errors occurred - StreamCacheCreateFunction strmCacheFunc = streamCacheCreateFunction != null ? streamCacheCreateFunction + StreamCacheCreateFunction strmCacheFunc = streamCacheCreateFunction != null + ? streamCacheCreateFunction : IOUtils.createMemoryOnlyStreamCache(); + try (PDDocument destination = new PDDocument(strmCacheFunc)) { for (Object sourceObject : sources) { PDDocument sourceDoc; - if (sourceObject instanceof File) - { - sourceDoc = Loader.loadPDF((File) sourceObject); + + //Wrap IOException with context + try { + if (sourceObject instanceof File) { + File file = (File) sourceObject; + sourceDoc = Loader.loadPDF(file); + } else { + sourceDoc = Loader.loadPDF((RandomAccessRead) sourceObject); + } } - else - { - sourceDoc = Loader.loadPDF((RandomAccessRead) sourceObject); + catch (IOException e) { + String sourceDesc = (sourceObject instanceof File) + ? ((File) sourceObject).getAbsolutePath() + : "RandomAccessRead source (index: " + sources.indexOf(sourceObject) + ")"; + if (skipCorruptFiles) { + LOG.warn("Skipping corrupt file: {}", sourceDesc, e); + continue; + } else { + throw new IOException("Failed to load PDF from source: " + sourceDesc, e); + } } + + try { appendDocument(destination, sourceDoc); @@ -475,8 +519,7 @@ private void legacyMergeDocuments(StreamCacheCreateFunction streamCacheCreateFun IOUtils.closeAndLogException(sourceDoc, LOG, "PDDocument", null); } } - - // optionally set meta data + if (destinationDocumentInformation != null) { destination.setDocumentInformation(destinationDocumentInformation); @@ -485,7 +528,7 @@ private void legacyMergeDocuments(StreamCacheCreateFunction streamCacheCreateFun { destination.getDocumentCatalog().setMetadata(destinationMetadata); } - + if (destinationStream == null) { destination.save(destinationFileName, compressParameters); @@ -498,6 +541,7 @@ private void legacyMergeDocuments(StreamCacheCreateFunction streamCacheCreateFun } } + /** * append all pages from source to destination. * From ae17c4ddd0ac1686b51904ee036f0f6fcc29ea76 Mon Sep 17 00:00:00 2001 From: Swetha Muthuvel Date: Fri, 4 Jul 2025 14:05:20 +0530 Subject: [PATCH 2/2] PDFBOX-XXXX: Centralize merge summary logging in PDFMergerUtility - Removed duplicate LOG.info calls from optimized and legacy merge methods. - Introduced shared field 'lastMergeSkippedCount' to track skipped corrupt PDFs. - Log merge summary once from mergeDocuments(), improving clarity and avoiding redundant output. --- .../pdfbox/multipdf/PDFMergerUtility.java | 153 ++++++++++-------- 1 file changed, 87 insertions(+), 66 deletions(-) diff --git a/pdfbox/src/main/java/org/apache/pdfbox/multipdf/PDFMergerUtility.java b/pdfbox/src/main/java/org/apache/pdfbox/multipdf/PDFMergerUtility.java index 63a723e40a1..b279329f7ae 100644 --- a/pdfbox/src/main/java/org/apache/pdfbox/multipdf/PDFMergerUtility.java +++ b/pdfbox/src/main/java/org/apache/pdfbox/multipdf/PDFMergerUtility.java @@ -93,6 +93,7 @@ public class PDFMergerUtility private static final Logger LOG = LogManager.getLogger(PDFMergerUtility.class); private final List sources; + private String destinationFileName; private OutputStream destinationStream; private boolean ignoreAcroFormErrors = false; @@ -103,6 +104,8 @@ public class PDFMergerUtility private AcroFormMergeMode acroFormMergeMode = AcroFormMergeMode.PDFBOX_LEGACY_MODE; private boolean skipCorruptFiles = false; + private int lastMergeSkippedCount = 0; + private final List skippedFiles = new ArrayList<>(); /** * Set to true to skip corrupt PDF files instead of failing the whole merge. @@ -388,25 +391,52 @@ else if (documentMergeMode == DocumentMergeMode.OPTIMIZE_RESOURCES_MODE) { optimizedMergeDocuments(streamCacheCreateFunction, compressParameters); } + LOG.info("Merge completed. Total sources: {}, Skipped: {}", sources.size(), lastMergeSkippedCount); + } - - private void optimizedMergeDocuments(StreamCacheCreateFunction streamCacheCreateFunction, - CompressParameters compressParameters) throws IOException{ + + + /** + * @return an unmodifiable list of file paths or stream indices that were skipped + */ + public List getSkippedFiles() { + return Collections.unmodifiableList(skippedFiles); + } + + /** + * Add a PDF source via InputStream. + * The InputStream is wrapped into a RandomAccessReadBuffer for PDFBox loading. + * + * @param inputStream Input stream containing a valid PDF file + */ + public void addSource(InputStream inputStream) { + if (inputStream == null) { + throw new IllegalArgumentException("InputStream cannot be null"); + } + sources.add(new RandomAccessReadBuffer(inputStream)); + } + + private void optimizedMergeDocuments(StreamCacheCreateFunction streamCacheCreateFunction, + CompressParameters compressParameters) throws IOException + { + if (sources.isEmpty()) { + throw new IllegalStateException("No source PDFs provided. Use addSource() before merging."); + } + StreamCacheCreateFunction strmCacheFunc = streamCacheCreateFunction != null ? streamCacheCreateFunction : IOUtils.createMemoryOnlyStreamCache(); - try (PDDocument destination = new PDDocument(strmCacheFunc)) - { + try (PDDocument destination = new PDDocument(strmCacheFunc)) { PDFCloneUtility cloner = new PDFCloneUtility(destination); - PDPageTree destinationPageTree = destination.getPages(); // cache PageTree + PDPageTree destinationPageTree = destination.getPages(); - for (Object sourceObject : sources) - { + lastMergeSkippedCount = 0; + + + for (Object sourceObject : sources) { PDDocument sourceDoc = null; - try - { - //Wrap IOException with context + try { try { if (sourceObject instanceof File) { File file = (File) sourceObject; @@ -414,53 +444,46 @@ private void optimizedMergeDocuments(StreamCacheCreateFunction streamCacheCreate } else { sourceDoc = Loader.loadPDF((RandomAccessRead) sourceObject); } - } - catch (IOException e) { + } catch (IOException e) { String sourceDesc = (sourceObject instanceof File) ? ((File) sourceObject).getAbsolutePath() : "RandomAccessRead source (index: " + sources.indexOf(sourceObject) + ")"; if (skipCorruptFiles) { LOG.warn("Skipping corrupt file: {}", sourceDesc, e); - continue; // skip this source and continue with others + skippedFiles.add(sourceDesc); + lastMergeSkippedCount++; + continue; } else { throw new IOException("Failed to load PDF from source: " + sourceDesc, e); } } - - for (PDPage page : sourceDoc.getPages()) - { + for (PDPage page : sourceDoc.getPages()) { PDPage newPage = new PDPage(cloner.cloneForNewDocument(page.getCOSObject())); newPage.setCropBox(page.getCropBox()); newPage.setMediaBox(page.getMediaBox()); newPage.setRotation(page.getRotation()); + PDResources resources = page.getResources(); - if (resources != null) - { + if (resources != null) { newPage.setResources(new PDResources( cloner.cloneForNewDocument(resources.getCOSObject()))); - } - else - { + } else { newPage.setResources(new PDResources()); } destinationPageTree.add(newPage); } - } - finally - { + } finally { IOUtils.closeQuietly(sourceDoc); } } - if (destinationStream == null) - { + if (destinationStream == null) { destination.save(destinationFileName, compressParameters); - } - else - { + } else { destination.save(destinationStream, compressParameters); } + } } @@ -475,20 +498,22 @@ private void optimizedMergeDocuments(StreamCacheCreateFunction streamCacheCreate * @throws IOException If there is an error saving the document. */ private void legacyMergeDocuments(StreamCacheCreateFunction streamCacheCreateFunction, - CompressParameters compressParameters) throws IOException{ - if (!sources.isEmpty()) - { - StreamCacheCreateFunction strmCacheFunc = streamCacheCreateFunction != null - ? streamCacheCreateFunction - : IOUtils.createMemoryOnlyStreamCache(); + CompressParameters compressParameters) throws IOException + { + if (sources.isEmpty()) { + throw new IllegalStateException("No source PDFs provided. Use addSource() before merging."); + } - try (PDDocument destination = new PDDocument(strmCacheFunc)) - { - for (Object sourceObject : sources) - { - PDDocument sourceDoc; + StreamCacheCreateFunction strmCacheFunc = streamCacheCreateFunction != null + ? streamCacheCreateFunction + : IOUtils.createMemoryOnlyStreamCache(); + + try (PDDocument destination = new PDDocument(strmCacheFunc)) { + lastMergeSkippedCount = 0; - //Wrap IOException with context + for (Object sourceObject : sources) { + PDDocument sourceDoc; + try { try { if (sourceObject instanceof File) { File file = (File) sourceObject; @@ -496,48 +521,44 @@ private void legacyMergeDocuments(StreamCacheCreateFunction streamCacheCreateFun } else { sourceDoc = Loader.loadPDF((RandomAccessRead) sourceObject); } - } - catch (IOException e) { + } catch (IOException e) { String sourceDesc = (sourceObject instanceof File) ? ((File) sourceObject).getAbsolutePath() : "RandomAccessRead source (index: " + sources.indexOf(sourceObject) + ")"; if (skipCorruptFiles) { LOG.warn("Skipping corrupt file: {}", sourceDesc, e); + skippedFiles.add(sourceDesc); + lastMergeSkippedCount++; continue; } else { throw new IOException("Failed to load PDF from source: " + sourceDesc, e); } } - - try - { + try { appendDocument(destination, sourceDoc); - } - finally - { + } finally { IOUtils.closeAndLogException(sourceDoc, LOG, "PDDocument", null); } + } catch (Exception e) { + LOG.error("Unexpected failure during legacy merge: ", e); + throw e; } + } - if (destinationDocumentInformation != null) - { - destination.setDocumentInformation(destinationDocumentInformation); - } - if (destinationMetadata != null) - { - destination.getDocumentCatalog().setMetadata(destinationMetadata); - } + if (destinationDocumentInformation != null) { + destination.setDocumentInformation(destinationDocumentInformation); + } + if (destinationMetadata != null) { + destination.getDocumentCatalog().setMetadata(destinationMetadata); + } - if (destinationStream == null) - { - destination.save(destinationFileName, compressParameters); - } - else - { - destination.save(destinationStream, compressParameters); - } + if (destinationStream == null) { + destination.save(destinationFileName, compressParameters); + } else { + destination.save(destinationStream, compressParameters); } + } }