From 1ffc72e484b667fa1355f8df6061d96815fe4d03 Mon Sep 17 00:00:00 2001
From: Sze Yu Sim <34510821+szeyu@users.noreply.github.com>
Date: Mon, 24 Feb 2025 10:01:43 +0800
Subject: [PATCH] Update `pyzerox/processor/pdf.py` convert_pdf_to_images
 function to add a fallback that uses PyMuPDF to convert PDFs to images
 without Poppler if Poppler is not found

This ensures that the conversion works in everyone's environment, even before they have installed Poppler on the correct path.
---
 py_zerox/pyzerox/processor/pdf.py | 71 +++++++++++++++++++++++++------
 1 file changed, 57 insertions(+), 14 deletions(-)

diff --git a/py_zerox/pyzerox/processor/pdf.py b/py_zerox/pyzerox/processor/pdf.py
index af36629d..c53f40d3 100644
--- a/py_zerox/pyzerox/processor/pdf.py
+++ b/py_zerox/pyzerox/processor/pdf.py
@@ -2,7 +2,6 @@
 import os
 import asyncio
 from typing import List, Optional, Tuple
-from pdf2image import convert_from_path
 
 # Package Imports
 from .image import save_image
@@ -11,26 +10,70 @@
 from ..models import litellmmodel
 
 
-async def convert_pdf_to_images(image_density: int, image_height: tuple[Optional[int], int], local_path: str, temp_dir: str) -> List[str]:
+async def convert_pdf_to_images(local_path: str, temp_dir: str) -> List[str]:
     """Converts a PDF file to a series of images in the temp_dir. Returns a list of image paths in page order."""
-    options = {
-        "pdf_path": local_path,
-        "output_folder": temp_dir,
-        "dpi": image_density,
-        "fmt": PDFConversionDefaultOptions.FORMAT,
-        "size": image_height,
-        "thread_count": PDFConversionDefaultOptions.THREAD_COUNT,
-        "use_pdftocairo": PDFConversionDefaultOptions.USE_PDFTOCAIRO,
-        "paths_only": True,
-    }
-
     try:
+        logging.info("Attempting to use pdf2image library...")
+
+        # import and try to use pdf2image library
+        from pdf2image import convert_from_path
+
+        options = {
+            "pdf_path": local_path,
+            "output_folder": temp_dir,
+            "dpi": PDFConversionDefaultOptions.DPI,
+            "fmt": PDFConversionDefaultOptions.FORMAT,
+            "size": PDFConversionDefaultOptions.SIZE,
+            "thread_count": PDFConversionDefaultOptions.THREAD_COUNT,
+            "use_pdftocairo": PDFConversionDefaultOptions.USE_PDFTOCAIRO,
+            "paths_only": True,
+        }
         image_paths = await asyncio.to_thread(
             convert_from_path, **options
         )
         return image_paths
+    
     except Exception as err:
-        logging.error(f"Error converting PDF to images: {err}")
+        logging.warning(f"Poppler conversion failed, falling back to PyMuPDF: {err}")
+        
+        # import PyMuPDF library and the Image library
+        import fitz
+        import io
+        from PIL import Image
+
+        try:
+            # Fallback to PyMuPDF
+            image_paths = []
+            doc = fitz.open(local_path)
+            
+            for page_num in range(len(doc)):
+                page = doc[page_num]
+                # Convert to image with specified DPI
+                pix = page.get_pixmap(dpi=PDFConversionDefaultOptions.DPI)
+                
+                # Convert to PIL Image for potential resizing
+                img_data = pix.tobytes("png")
+                img = Image.open(io.BytesIO(img_data))
+                
+                # Resize if needed based on PDFConversionDefaultOptions.SIZE
+                if PDFConversionDefaultOptions.SIZE[1]:
+                    aspect_ratio = img.width / img.height
+                    new_height = min(PDFConversionDefaultOptions.SIZE[1], img.height)
+                    if PDFConversionDefaultOptions.SIZE[0]:
+                        new_height = max(PDFConversionDefaultOptions.SIZE[0], new_height)
+                    new_width = int(new_height * aspect_ratio)
+                    img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
+                
+                # Save the image
+                output_path = f"{temp_dir}/page_{page_num + 1}.png"
+                img.save(output_path)
+                image_paths.append(output_path)
+            
+            return image_paths
+
+        except Exception as err:
+            logging.error(f"Both Poppler and PyMuPDF conversion failed: {err}")
+            raise
 
 
 async def process_page(