
Docling integration #1337


Open · wants to merge 9 commits into master
6 changes: 4 additions & 2 deletions backend/server/server.py
@@ -93,6 +93,8 @@ class ConfigRequest(BaseModel):

# Constants
DOC_PATH = os.getenv("DOC_PATH", "./my-docs")
CONVERT_WITH_DOCLING = os.getenv("CONVERT_WITH_DOCLING", "").lower() == "true"
DOCLING_VLM = os.getenv("DOCLING_VLM", "")

# Startup event

@@ -102,7 +104,7 @@ def startup_event():
os.makedirs("outputs", exist_ok=True)
app.mount("/outputs", StaticFiles(directory="outputs"), name="outputs")
# os.makedirs(DOC_PATH, exist_ok=True) # Commented out to avoid creating the folder if not needed


# Routes

@@ -187,7 +189,7 @@ async def run_multi_agents():

@app.post("/upload/")
async def upload_file(file: UploadFile = File(...)):
return await handle_file_upload(file, DOC_PATH)
return await handle_file_upload(file, DOC_PATH, CONVERT_WITH_DOCLING, DOCLING_VLM)


@app.delete("/files/{filename}")
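With these changes, docling conversion is toggled entirely through the server's environment. A minimal sketch of exercising the new upload path; the port and the client code are assumptions for illustration, not part of this PR:

```python
import requests  # illustrative client; any HTTP client works

# The server must be started with the new variables already set, e.g.
#   CONVERT_WITH_DOCLING=true DOCLING_VLM="" uvicorn backend.server.server:app
# since server.py reads them once at import time.

# Upload a document; handle_file_upload saves it under DOC_PATH and
# re-runs the DocumentLoader with the docling flags.
with open("report.pdf", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/upload/",  # default uvicorn port, assumed
        files={"file": ("report.pdf", f, "application/pdf")},
    )
print(resp.json())  # expected: {"filename": "report.pdf", "path": "./my-docs/report.pdf"}
```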
34 changes: 20 additions & 14 deletions backend/server/server_utils.py
@@ -46,11 +46,11 @@ async def send_json(self, data: Dict[str, Any]) -> None:
# Send to websocket for real-time display
if self.websocket:
await self.websocket.send_json(data)

# Read current log file
with open(self.log_file, 'r') as f:
log_data = json.load(f)

# Update appropriate section based on data type
if data.get('type') == 'logs':
log_data['events'].append({
@@ -61,7 +61,7 @@ async def send_json(self, data: Dict[str, Any]) -> None:
else:
# Update content section for other types of data
log_data['content'].update(data)

# Save updated log file
with open(self.log_file, 'w') as f:
json.dump(log_data, f, indent=2)
@@ -86,14 +86,14 @@ async def research(self) -> dict:
"""Conduct research and return paths to generated files"""
await self.researcher.conduct_research()
report = await self.researcher.write_report()

# Generate the files
sanitized_filename = sanitize_filename(f"task_{int(time.time())}_{self.query}")
file_paths = await generate_report_files(report, sanitized_filename)

# Get the JSON log path that was created by CustomLogsHandler
json_relative_path = os.path.relpath(self.logs_handler.log_file)

return {
"output": {
**file_paths, # Include PDF, DOCX, and MD paths
@@ -105,14 +105,14 @@ def sanitize_filename(filename: str) -> str:
# Split into components
prefix, timestamp, *task_parts = filename.split('_')
task = '_'.join(task_parts)

# Calculate max length for task portion
# 255 - len(os.getcwd()) - len("\\gpt-researcher\\outputs\\") - len("task_") - len(timestamp) - len("_.json") - safety_margin
max_task_length = 255 - len(os.getcwd()) - 24 - 5 - 10 - 6 - 5 # ~189 chars for task

# Truncate task if needed
truncated_task = task[:max_task_length] if len(task) > max_task_length else task

# Reassemble and clean the filename
sanitized = f"{prefix}_{timestamp}_{truncated_task}"
return re.sub(r"[^\w\s-]", "", sanitized).strip()
@@ -189,8 +189,8 @@ async def send_file_paths(websocket, file_paths: Dict[str, str]):
def get_config_dict(
langchain_api_key: str, openai_api_key: str, tavily_api_key: str,
google_api_key: str, google_cx_key: str, bing_api_key: str,
searchapi_api_key: str, serpapi_api_key: str, serper_api_key: str, searx_url: str
) -> Dict[str, str]:
searchapi_api_key: str, serpapi_api_key: str, serper_api_key: str, searx_url: str,
) -> Dict[str, str | bool]:
return {
"LANGCHAIN_API_KEY": langchain_api_key or os.getenv("LANGCHAIN_API_KEY", ""),
"OPENAI_API_KEY": openai_api_key or os.getenv("OPENAI_API_KEY", ""),
@@ -205,7 +205,7 @@ def get_config_dict(
"LANGCHAIN_TRACING_V2": os.getenv("LANGCHAIN_TRACING_V2", "true"),
"DOC_PATH": os.getenv("DOC_PATH", "./my-docs"),
"RETRIEVER": os.getenv("RETRIEVER", ""),
"EMBEDDING_MODEL": os.getenv("OPENAI_EMBEDDING_MODEL", "")
"EMBEDDING_MODEL": os.getenv("OPENAI_EMBEDDING_MODEL", ""),
}


@@ -214,13 +214,19 @@ def update_environment_variables(config: Dict[str, str]):
os.environ[key] = value


async def handle_file_upload(file, DOC_PATH: str) -> Dict[str, str]:
async def handle_file_upload(
file, DOC_PATH: str, CONVERT_WITH_DOCLING: bool, DOCLING_VLM: str,
) -> Dict[str, str]:
file_path = os.path.join(DOC_PATH, os.path.basename(file.filename))
with open(file_path, "wb") as buffer:
shutil.copyfileobj(file.file, buffer)
print(f"File uploaded to {file_path}")

document_loader = DocumentLoader(DOC_PATH)
document_loader = DocumentLoader(
path=DOC_PATH,
use_docling=CONVERT_WITH_DOCLING,
docling_vlm=DOCLING_VLM,
)
await document_loader.load()

return {"filename": file.filename, "path": file_path}
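The reworked helper can also be exercised directly. A minimal sketch, assuming a real, parseable PDF on disk; the `_Upload` class is a hypothetical stand-in for `fastapi.UploadFile`, not part of the PR:

```python
import asyncio

from backend.server.server_utils import handle_file_upload

class _Upload:
    """Hypothetical stand-in for fastapi.UploadFile, for illustration only."""
    def __init__(self, path: str):
        self.filename = path
        self.file = open(path, "rb")

result = asyncio.run(
    handle_file_upload(
        _Upload("report.pdf"),      # assumes report.pdf exists locally
        DOC_PATH="./my-docs",
        CONVERT_WITH_DOCLING=True,  # route supported formats through DoclingLoader
        DOCLING_VLM="",             # falsy: plain docling conversion, no VLM
    )
)
print(result)  # {"filename": "report.pdf", "path": "./my-docs/report.pdf"}
```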
2 changes: 2 additions & 0 deletions docs/docs/gpt-researcher/gptr/config.md
@@ -44,6 +44,8 @@ Below is a list of current supported options:
- **`EMBEDDING_KWARGS`**: Json formatted dict of additional keyword args to be passed to the embedding provider class when instantiating it.
- **`USER_AGENT`**: Custom User-Agent string for web crawling and web requests.
- **`MEMORY_BACKEND`**: Backend used for memory operations, such as local storage of temporary data. Defaults to `local`.
- **`CONVERT_WITH_DOCLING`**: Use [docling](https://docling-project.github.io/docling/) for document conversion (NOTE: this requires installing the `docling` extra, e.g. `pip install gpt-researcher[docling]`).
- **`DOCLING_VLM`**: If `CONVERT_WITH_DOCLING` is set, this selects docling's Vision Language Model (VLM) pipeline for parsing PDFs and images.

To change the default configurations, you can simply add env variables to your `.env` file as named above or export manually in your local project directory.

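As with the other options, the new variables can be set in `.env` or programmatically before the researcher starts. A minimal sketch; the VLM identifier shown is an assumption mirroring docling's model names:

```python
import os

# Equivalent to adding these lines to your .env file
os.environ["CONVERT_WITH_DOCLING"] = "true"   # parsed case-insensitively by the server
os.environ["DOCLING_VLM"] = "granite_vision"  # optional; leave unset to skip the VLM pipeline
```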
6 changes: 6 additions & 0 deletions docs/docs/gpt-researcher/gptr/pip-package.md
@@ -15,6 +15,12 @@ Follow these easy steps to get started:
pip install gpt-researcher
```

To install optional dependencies, use the following syntax:

```bash
pip install gpt-researcher[docling]
```
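
Note that some shells (zsh in particular) expand square brackets, so the extra may need quoting: `pip install "gpt-researcher[docling]"`.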

2. **Environment Variables:** Create a .env file with your OpenAI API key or simply export it

```bash
4 changes: 3 additions & 1 deletion gpt_researcher/config/variables/base.py
@@ -28,10 +28,12 @@ class BaseConfig(TypedDict):
MAX_SCRAPER_WORKERS: int
MAX_SUBTOPICS: int
REPORT_SOURCE: Union[str, None]
DOC_PATH: str
PROMPT_FAMILY: str
LLM_KWARGS: dict
EMBEDDING_KWARGS: dict
DEEP_RESEARCH_CONCURRENCY: int
DEEP_RESEARCH_DEPTH: int
DEEP_RESEARCH_BREADTH: int
DOC_PATH: str
CONVERT_WITH_DOCLING: bool
DOCLING_VLM: Union[str, None]
5 changes: 4 additions & 1 deletion gpt_researcher/config/variables/default.py
@@ -26,12 +26,15 @@
"MAX_SUBTOPICS": 3,
"LANGUAGE": "english",
"REPORT_SOURCE": "web",
"DOC_PATH": "./my-docs",
"PROMPT_FAMILY": "default",
"LLM_KWARGS": {},
"EMBEDDING_KWARGS": {},
# Deep research specific settings
"DEEP_RESEARCH_BREADTH": 3,
"DEEP_RESEARCH_DEPTH": 2,
"DEEP_RESEARCH_CONCURRENCY": 4,
# Doc conversion settings
"DOC_PATH": "./my-docs",
"CONVERT_WITH_DOCLING": False,
"DOCLING_VLM": None,
}
136 changes: 111 additions & 25 deletions gpt_researcher/document/document.py
@@ -1,6 +1,7 @@
import asyncio
import os
from typing import List, Union
import sys
from typing import List, Optional, Union
from langchain_community.document_loaders import (
PyMuPDFLoader,
TextLoader,
@@ -11,12 +12,84 @@
UnstructuredWordDocumentLoader
)
from langchain_community.document_loaders import BSHTMLLoader
from langchain_core.documents import Document

try:
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
VlmModelType,
VlmPipelineOptions,
granite_vision_vlm_conversion_options,
granite_vision_vlm_ollama_conversion_options,
smoldocling_vlm_conversion_options,
smoldocling_vlm_mlx_conversion_options,
)
from docling.pipeline.vlm_pipeline import VlmPipeline
HAVE_DOCLING = True
except ImportError:
HAVE_DOCLING = False


class DoclingLoader:
"""Loader for an individual document that matches langchain API"""

def __init__(self, file_path: str, vlm: Optional[str] = None):
if not HAVE_DOCLING:
raise ImportError("Please install docling to use this function.")
self.file_path = file_path
vlm_options = None
if vlm == VlmModelType.GRANITE_VISION:
vlm_options = granite_vision_vlm_conversion_options
elif vlm == VlmModelType.GRANITE_VISION_OLLAMA:
vlm_options = granite_vision_vlm_ollama_conversion_options
elif vlm == VlmModelType.SMOLDOCLING:
vlm_options = smoldocling_vlm_conversion_options
if sys.platform == "darwin":
try:
import mlx_vlm

vlm_options = smoldocling_vlm_mlx_conversion_options
except ImportError:
print("mlx-vlm not installed, falling back to torch")
elif vlm:
raise ValueError(f"Unknown docling vlm option: {vlm}")

format_options = None
if vlm_options is not None:
pipeline_options = VlmPipelineOptions(enable_remote_services=True)
pipeline_options.vlm_options = vlm_options
pdf_format_option = PdfFormatOption(
pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
)
format_options = {
InputFormat.PDF: pdf_format_option,
InputFormat.IMAGE: pdf_format_option,
}

self.converter = DocumentConverter(format_options=format_options)

def load(self) -> list[Document]:
assert HAVE_DOCLING
res = self.converter.convert(self.file_path)
doc = Document(
page_content=res.document.export_to_markdown(),
metadata={"source": self.file_path}
)
return [doc]
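
On its own, the new loader follows the familiar langchain loader shape: construct with a path, call `load()`, get back `Document` objects. A minimal usage sketch, assuming the `docling` extra is installed; the file names are illustrative:

```python
from gpt_researcher.document.document import DoclingLoader

# Plain docling conversion, no VLM
loader = DoclingLoader("./my-docs/scanned-report.pdf")
docs = loader.load()
print(docs[0].metadata)            # {"source": "./my-docs/scanned-report.pdf"}
print(docs[0].page_content[:200])  # markdown exported by docling

# With a VLM pipeline; the value must match docling's VlmModelType, e.g. "smoldocling"
vlm_loader = DoclingLoader("./my-docs/figure.png", vlm="smoldocling")
```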


class DocumentLoader:

def __init__(self, path: Union[str, List[str]]):
def __init__(
self,
path: Union[str, List[str]],
use_docling: bool = False,
docling_vlm: Optional[str] = None,
):
self.path = path
self.use_docling = use_docling
self.docling_vlm = docling_vlm

async def load(self) -> list:
tasks = []
@@ -27,25 +100,18 @@ async def load(self) -> list:
file_name, file_extension_with_dot = os.path.splitext(filename)
file_extension = file_extension_with_dot.strip(".").lower()
tasks.append(self._load_document(file_path, file_extension))

elif isinstance(self.path, (str, bytes, os.PathLike)):
for root, dirs, files in os.walk(self.path):
for file in files:
file_path = os.path.join(root, file)
file_name, file_extension_with_dot = os.path.splitext(file)
file_extension = file_extension_with_dot.strip(".").lower()
tasks.append(self._load_document(file_path, file_extension))

else:
raise ValueError("Invalid type for path. Expected str, bytes, os.PathLike, or list thereof.")

# for root, dirs, files in os.walk(self.path):
# for file in files:
# file_path = os.path.join(root, file)
# file_name, file_extension_with_dot = os.path.splitext(file_path)
# file_extension = file_extension_with_dot.strip(".")
# tasks.append(self._load_document(file_path, file_extension))

docs = []
for pages in await asyncio.gather(*tasks):
for page in pages:
@@ -54,7 +120,7 @@ async def load(self) -> list:
"raw_content": page.page_content,
"url": os.path.basename(page.metadata['source'])
})

if not docs:
raise ValueError("🤷 Failed to load any documents!")

@@ -63,19 +129,39 @@ async def load(self) -> list:
async def _load_document(self, file_path: str, file_extension: str) -> list:
ret_data = []
try:
loader_dict = {
"pdf": PyMuPDFLoader(file_path),
"txt": TextLoader(file_path),
"doc": UnstructuredWordDocumentLoader(file_path),
"docx": UnstructuredWordDocumentLoader(file_path),
"pptx": UnstructuredPowerPointLoader(file_path),
"csv": UnstructuredCSVLoader(file_path, mode="elements"),
"xls": UnstructuredExcelLoader(file_path, mode="elements"),
"xlsx": UnstructuredExcelLoader(file_path, mode="elements"),
"md": UnstructuredMarkdownLoader(file_path),
"html": BSHTMLLoader(file_path),
"htm": BSHTMLLoader(file_path)
}
if self.use_docling and HAVE_DOCLING:
docling_loader = DoclingLoader(file_path, self.docling_vlm)
loader_dict = {
"pdf": docling_loader,
"txt": TextLoader(file_path),
"doc": docling_loader,
"docx": docling_loader,
"pptx": docling_loader,
"csv": docling_loader,
"xls": docling_loader,
"xlsx": docling_loader,
"md": UnstructuredMarkdownLoader(file_path),
"html": BSHTMLLoader(file_path),
"htm": BSHTMLLoader(file_path),
"png": docling_loader,
"jpg": docling_loader,
"jpeg": docling_loader,
}

else:
loader_dict = {
"pdf": PyMuPDFLoader(file_path),
"txt": TextLoader(file_path),
"doc": UnstructuredWordDocumentLoader(file_path),
"docx": UnstructuredWordDocumentLoader(file_path),
"pptx": UnstructuredPowerPointLoader(file_path),
"csv": UnstructuredCSVLoader(file_path, mode="elements"),
"xls": UnstructuredExcelLoader(file_path, mode="elements"),
"xlsx": UnstructuredExcelLoader(file_path, mode="elements"),
"md": UnstructuredMarkdownLoader(file_path),
"html": BSHTMLLoader(file_path),
"htm": BSHTMLLoader(file_path)
}

loader = loader_dict.get(file_extension, None)
if loader:
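End to end, the changed `DocumentLoader` keeps its original behavior unless docling is both requested and importable. A minimal usage sketch; the path is illustrative:

```python
import asyncio

from gpt_researcher.document.document import DocumentLoader

async def main():
    loader = DocumentLoader(
        path="./my-docs",
        use_docling=True,   # silently falls back to the classic loaders if docling is missing
        docling_vlm=None,   # or e.g. "granite_vision" to parse images with a VLM
    )
    docs = await loader.load()  # list of {"raw_content": ..., "url": ...} dicts
    for doc in docs[:3]:
        print(doc["url"], len(doc["raw_content"]))

asyncio.run(main())
```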