
Docling integration #1337


Open · wants to merge 9 commits into master
6 changes: 4 additions & 2 deletions backend/server/server.py
@@ -93,6 +93,8 @@ class ConfigRequest(BaseModel):

# Constants
DOC_PATH = os.getenv("DOC_PATH", "./my-docs")
CONVERT_WITH_DOCLING = os.getenv("CONVERT_WITH_DOCLING", "").lower() == "true"
DOCLING_VLM = os.getenv("DOCLING_VLM", "")

# Startup event

@@ -102,7 +104,7 @@ def startup_event():
os.makedirs("outputs", exist_ok=True)
app.mount("/outputs", StaticFiles(directory="outputs"), name="outputs")
# os.makedirs(DOC_PATH, exist_ok=True) # Commented out to avoid creating the folder if not needed


# Routes

@@ -187,7 +189,7 @@ async def run_multi_agents():

@app.post("/upload/")
async def upload_file(file: UploadFile = File(...)):
return await handle_file_upload(file, DOC_PATH)
return await handle_file_upload(file, DOC_PATH, CONVERT_WITH_DOCLING, DOCLING_VLM)


@app.delete("/files/{filename}")
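With these changes, docling conversion is toggled entirely through the server's environment. A minimal sketch of exercising the new upload path; the port and the client code are assumptions for illustration, not part of this PR:

```python
import requests  # illustrative client; any HTTP client works

# The server must be started with the new variables already set, e.g.
#   CONVERT_WITH_DOCLING=true DOCLING_VLM="" uvicorn backend.server.server:app
# since server.py reads them once at import time.

# Upload a document; handle_file_upload saves it under DOC_PATH and
# re-runs the DocumentLoader with the docling flags.
with open("report.pdf", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/upload/",  # default uvicorn port, assumed
        files={"file": ("report.pdf", f, "application/pdf")},
    )
print(resp.json())  # expected: {"filename": "report.pdf", "path": "./my-docs/report.pdf"}
```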
34 changes: 20 additions & 14 deletions backend/server/server_utils.py
@@ -46,11 +46,11 @@ async def send_json(self, data: Dict[str, Any]) -> None:
# Send to websocket for real-time display
if self.websocket:
await self.websocket.send_json(data)

# Read current log file
with open(self.log_file, 'r') as f:
log_data = json.load(f)

# Update appropriate section based on data type
if data.get('type') == 'logs':
log_data['events'].append({
@@ -61,7 +61,7 @@ async def send_json(self, data: Dict[str, Any]) -> None:
else:
# Update content section for other types of data
log_data['content'].update(data)

# Save updated log file
with open(self.log_file, 'w') as f:
json.dump(log_data, f, indent=2)
@@ -86,14 +86,14 @@ async def research(self) -> dict:
"""Conduct research and return paths to generated files"""
await self.researcher.conduct_research()
report = await self.researcher.write_report()

# Generate the files
sanitized_filename = sanitize_filename(f"task_{int(time.time())}_{self.query}")
file_paths = await generate_report_files(report, sanitized_filename)

# Get the JSON log path that was created by CustomLogsHandler
json_relative_path = os.path.relpath(self.logs_handler.log_file)

return {
"output": {
**file_paths, # Include PDF, DOCX, and MD paths
@@ -105,14 +105,14 @@ def sanitize_filename(filename: str) -> str:
# Split into components
prefix, timestamp, *task_parts = filename.split('_')
task = '_'.join(task_parts)

# Calculate max length for task portion
# 255 - len(os.getcwd()) - len("\\gpt-researcher\\outputs\\") - len("task_") - len(timestamp) - len("_.json") - safety_margin
max_task_length = 255 - len(os.getcwd()) - 24 - 5 - 10 - 6 - 5 # ~189 chars for task

# Truncate task if needed
truncated_task = task[:max_task_length] if len(task) > max_task_length else task

# Reassemble and clean the filename
sanitized = f"{prefix}_{timestamp}_{truncated_task}"
return re.sub(r"[^\w\s-]", "", sanitized).strip()
@@ -189,8 +189,8 @@ async def send_file_paths(websocket, file_paths: Dict[str, str]):
def get_config_dict(
langchain_api_key: str, openai_api_key: str, tavily_api_key: str,
google_api_key: str, google_cx_key: str, bing_api_key: str,
searchapi_api_key: str, serpapi_api_key: str, serper_api_key: str, searx_url: str
) -> Dict[str, str]:
searchapi_api_key: str, serpapi_api_key: str, serper_api_key: str, searx_url: str,
) -> Dict[str, str | bool]:
return {
"LANGCHAIN_API_KEY": langchain_api_key or os.getenv("LANGCHAIN_API_KEY", ""),
"OPENAI_API_KEY": openai_api_key or os.getenv("OPENAI_API_KEY", ""),
@@ -205,7 +205,7 @@ def get_config_dict(
"LANGCHAIN_TRACING_V2": os.getenv("LANGCHAIN_TRACING_V2", "true"),
"DOC_PATH": os.getenv("DOC_PATH", "./my-docs"),
"RETRIEVER": os.getenv("RETRIEVER", ""),
"EMBEDDING_MODEL": os.getenv("OPENAI_EMBEDDING_MODEL", "")
"EMBEDDING_MODEL": os.getenv("OPENAI_EMBEDDING_MODEL", ""),
}


@@ -214,13 +214,19 @@ def update_environment_variables(config: Dict[str, str]):
os.environ[key] = value


async def handle_file_upload(file, DOC_PATH: str) -> Dict[str, str]:
async def handle_file_upload(
file, DOC_PATH: str, CONVERT_WITH_DOCLING: bool, DOCLING_VLM: str,
) -> Dict[str, str]:
file_path = os.path.join(DOC_PATH, os.path.basename(file.filename))
with open(file_path, "wb") as buffer:
shutil.copyfileobj(file.file, buffer)
print(f"File uploaded to {file_path}")

document_loader = DocumentLoader(DOC_PATH)
document_loader = DocumentLoader(
path=DOC_PATH,
use_docling=CONVERT_WITH_DOCLING,
docling_vlm=DOCLING_VLM,
)
await document_loader.load()

return {"filename": file.filename, "path": file_path}
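The reworked helper can also be exercised directly. A minimal sketch, assuming a real, parseable PDF on disk; the `_Upload` class is a hypothetical stand-in for `fastapi.UploadFile`, not part of the PR:

```python
import asyncio

from backend.server.server_utils import handle_file_upload

class _Upload:
    """Hypothetical stand-in for fastapi.UploadFile, for illustration only."""
    def __init__(self, path: str):
        self.filename = path
        self.file = open(path, "rb")

result = asyncio.run(
    handle_file_upload(
        _Upload("report.pdf"),      # assumes report.pdf exists locally
        DOC_PATH="./my-docs",
        CONVERT_WITH_DOCLING=True,  # route supported formats through DoclingLoader
        DOCLING_VLM="",             # falsy: plain docling conversion, no VLM
    )
)
print(result)  # {"filename": "report.pdf", "path": "./my-docs/report.pdf"}
```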
2 changes: 2 additions & 0 deletions docs/docs/gpt-researcher/gptr/config.md
@@ -44,6 +44,8 @@ Below is a list of current supported options:
- **`EMBEDDING_KWARGS`**: Json formatted dict of additional keyword args to be passed to the embedding provider class when instantiating it.
- **`USER_AGENT`**: Custom User-Agent string for web crawling and web requests.
- **`MEMORY_BACKEND`**: Backend used for memory operations, such as local storage of temporary data. Defaults to `local`.
- **`CONVERT_WITH_DOCLING`**: Use [docling](https://docling-project.github.io/docling/) for document conversion (NOTE: this requires installing the `docling` extra, e.g. `pip install gpt-researcher[docling]`).
- **`DOCLING_VLM`**: If `CONVERT_WITH_DOCLING` is set, this selects docling's Vision Language Model (VLM) pipeline for parsing PDFs and images.

To change the default configurations, you can simply add env variables to your `.env` file as named above or export manually in your local project directory.

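As with the other options, the new variables can be set in `.env` or programmatically before the researcher starts. A minimal sketch; the VLM identifier shown is an assumption mirroring docling's model names:

```python
import os

# Equivalent to adding these lines to your .env file
os.environ["CONVERT_WITH_DOCLING"] = "true"   # parsed case-insensitively by the server
os.environ["DOCLING_VLM"] = "granite_vision"  # optional; leave unset to skip the VLM pipeline
```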
6 changes: 6 additions & 0 deletions docs/docs/gpt-researcher/gptr/pip-package.md
@@ -15,6 +15,12 @@ Follow these easy steps to get started:
pip install gpt-researcher
```

To install optional dependencies, use the following syntax:

```bash
pip install gpt-researcher[docling]
```
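
Note that some shells (zsh in particular) expand square brackets, so the extra may need quoting: `pip install "gpt-researcher[docling]"`.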

2. **Environment Variables:** Create a .env file with your OpenAI API key or simply export it

```bash
4 changes: 3 additions & 1 deletion gpt_researcher/config/variables/base.py
@@ -28,10 +28,12 @@ class BaseConfig(TypedDict):
MAX_SCRAPER_WORKERS: int
MAX_SUBTOPICS: int
REPORT_SOURCE: Union[str, None]
DOC_PATH: str
PROMPT_FAMILY: str
LLM_KWARGS: dict
EMBEDDING_KWARGS: dict
DEEP_RESEARCH_CONCURRENCY: int
DEEP_RESEARCH_DEPTH: int
DEEP_RESEARCH_BREADTH: int
DOC_PATH: str
CONVERT_WITH_DOCLING: bool
DOCLING_VLM: Union[str, None]
5 changes: 4 additions & 1 deletion gpt_researcher/config/variables/default.py
@@ -26,12 +26,15 @@
"MAX_SUBTOPICS": 3,
"LANGUAGE": "english",
"REPORT_SOURCE": "web",
"DOC_PATH": "./my-docs",
"PROMPT_FAMILY": "default",
"LLM_KWARGS": {},
"EMBEDDING_KWARGS": {},
# Deep research specific settings
"DEEP_RESEARCH_BREADTH": 3,
"DEEP_RESEARCH_DEPTH": 2,
"DEEP_RESEARCH_CONCURRENCY": 4,
# Doc conversion settings
"DOC_PATH": "./my-docs",
"CONVERT_WITH_DOCLING": False,
"DOCLING_VLM": None,
}
136 changes: 111 additions & 25 deletions gpt_researcher/document/document.py
@@ -1,6 +1,7 @@
import asyncio
import os
from typing import List, Union
import sys
from typing import List, Optional, Union
from langchain_community.document_loaders import (
PyMuPDFLoader,
TextLoader,
@@ -11,12 +12,84 @@
UnstructuredWordDocumentLoader
)
from langchain_community.document_loaders import BSHTMLLoader
from langchain_core.documents import Document

try:
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
VlmModelType,
VlmPipelineOptions,
granite_vision_vlm_conversion_options,
granite_vision_vlm_ollama_conversion_options,
smoldocling_vlm_conversion_options,
smoldocling_vlm_mlx_conversion_options,
)
from docling.pipeline.vlm_pipeline import VlmPipeline
HAVE_DOCLING = True
except ImportError:
HAVE_DOCLING = False


class DoclingLoader:
"""Loader for an individual document that matches langchain API"""

def __init__(self, file_path: str, vlm: Optional[str] = None):
if not HAVE_DOCLING:
raise ImportError("Please install docling to use this function.")
self.file_path = file_path
vlm_options = None
if vlm == VlmModelType.GRANITE_VISION:
vlm_options = granite_vision_vlm_conversion_options
elif vlm == VlmModelType.GRANITE_VISION_OLLAMA:
vlm_options = granite_vision_vlm_ollama_conversion_options
elif vlm == VlmModelType.SMOLDOCLING:
vlm_options = smoldocling_vlm_conversion_options
if sys.platform == "darwin":
try:
import mlx_vlm

vlm_options = smoldocling_vlm_mlx_conversion_options
except ImportError:
print("mlx-vlm not installed, falling back to torch")
elif vlm:
raise ValueError(f"Unknown docling vlm option: {vlm}")

format_options = None
if vlm_options is not None:
pipeline_options = VlmPipelineOptions(enable_remote_services=True)
pipeline_options.vlm_options = vlm_options
pdf_format_option = PdfFormatOption(
pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
)
format_options = {
InputFormat.PDF: pdf_format_option,
InputFormat.IMAGE: pdf_format_option,
}

self.converter = DocumentConverter(format_options=format_options)

def load(self) -> list[Document]:
assert HAVE_DOCLING
res = self.converter.convert(self.file_path)
doc = Document(
page_content=res.document.export_to_markdown(),
metadata={"source": self.file_path}
)
return [doc]
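
On its own, the new loader follows the familiar langchain loader shape: construct with a path, call `load()`, get back `Document` objects. A minimal usage sketch, assuming the `docling` extra is installed; the file names are illustrative:

```python
from gpt_researcher.document.document import DoclingLoader

# Plain docling conversion, no VLM
loader = DoclingLoader("./my-docs/scanned-report.pdf")
docs = loader.load()
print(docs[0].metadata)            # {"source": "./my-docs/scanned-report.pdf"}
print(docs[0].page_content[:200])  # markdown exported by docling

# With a VLM pipeline; the value must match docling's VlmModelType, e.g. "smoldocling"
vlm_loader = DoclingLoader("./my-docs/figure.png", vlm="smoldocling")
```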


class DocumentLoader:

def __init__(self, path: Union[str, List[str]]):
def __init__(
self,
path: Union[str, List[str]],
use_docling: bool = False,
docling_vlm: Optional[str] = None,
):
self.path = path
self.use_docling = use_docling
self.docling_vlm = docling_vlm

async def load(self) -> list:
tasks = []
@@ -27,25 +100,18 @@ async def load(self) -> list:
file_name, file_extension_with_dot = os.path.splitext(filename)
file_extension = file_extension_with_dot.strip(".").lower()
tasks.append(self._load_document(file_path, file_extension))

elif isinstance(self.path, (str, bytes, os.PathLike)):
for root, dirs, files in os.walk(self.path):
for file in files:
file_path = os.path.join(root, file)
file_name, file_extension_with_dot = os.path.splitext(file)
file_extension = file_extension_with_dot.strip(".").lower()
tasks.append(self._load_document(file_path, file_extension))

else:
raise ValueError("Invalid type for path. Expected str, bytes, os.PathLike, or list thereof.")

# for root, dirs, files in os.walk(self.path):
# for file in files:
# file_path = os.path.join(root, file)
# file_name, file_extension_with_dot = os.path.splitext(file_path)
# file_extension = file_extension_with_dot.strip(".")
# tasks.append(self._load_document(file_path, file_extension))

docs = []
for pages in await asyncio.gather(*tasks):
for page in pages:
@@ -54,7 +120,7 @@ async def load(self) -> list:
"raw_content": page.page_content,
"url": os.path.basename(page.metadata['source'])
})

if not docs:
raise ValueError("🤷 Failed to load any documents!")

@@ -63,19 +129,39 @@ async def load(self) -> list:
async def _load_document(self, file_path: str, file_extension: str) -> list:
ret_data = []
try:
loader_dict = {
"pdf": PyMuPDFLoader(file_path),
"txt": TextLoader(file_path),
"doc": UnstructuredWordDocumentLoader(file_path),
"docx": UnstructuredWordDocumentLoader(file_path),
"pptx": UnstructuredPowerPointLoader(file_path),
"csv": UnstructuredCSVLoader(file_path, mode="elements"),
"xls": UnstructuredExcelLoader(file_path, mode="elements"),
"xlsx": UnstructuredExcelLoader(file_path, mode="elements"),
"md": UnstructuredMarkdownLoader(file_path),
"html": BSHTMLLoader(file_path),
"htm": BSHTMLLoader(file_path)
}
if self.use_docling and HAVE_DOCLING:
docling_loader = DoclingLoader(file_path, self.docling_vlm)
loader_dict = {
"pdf": docling_loader,
"txt": TextLoader(file_path),
"doc": docling_loader,
"docx": docling_loader,
"pptx": docling_loader,
"csv": docling_loader,
"xls": docling_loader,
"xlsx": docling_loader,
"md": UnstructuredMarkdownLoader(file_path),
"html": BSHTMLLoader(file_path),
"htm": BSHTMLLoader(file_path),
"png": docling_loader,
"jpg": docling_loader,
"jpeg": docling_loader,
}

else:
loader_dict = {
"pdf": PyMuPDFLoader(file_path),
"txt": TextLoader(file_path),
"doc": UnstructuredWordDocumentLoader(file_path),
"docx": UnstructuredWordDocumentLoader(file_path),
"pptx": UnstructuredPowerPointLoader(file_path),
"csv": UnstructuredCSVLoader(file_path, mode="elements"),
"xls": UnstructuredExcelLoader(file_path, mode="elements"),
"xlsx": UnstructuredExcelLoader(file_path, mode="elements"),
"md": UnstructuredMarkdownLoader(file_path),
"html": BSHTMLLoader(file_path),
"htm": BSHTMLLoader(file_path)
}

loader = loader_dict.get(file_extension, None)
if loader:
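End to end, the changed `DocumentLoader` keeps its original behavior unless docling is both requested and importable. A minimal usage sketch; the path is illustrative:

```python
import asyncio

from gpt_researcher.document.document import DocumentLoader

async def main():
    loader = DocumentLoader(
        path="./my-docs",
        use_docling=True,   # silently falls back to the classic loaders if docling is missing
        docling_vlm=None,   # or e.g. "granite_vision" to parse images with a VLM
    )
    docs = await loader.load()  # list of {"raw_content": ..., "url": ...} dicts
    for doc in docs[:3]:
        print(doc["url"], len(doc["raw_content"]))

asyncio.run(main())
```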