From 3eba14e973b800419e24fc29df8173e1fc2550d9 Mon Sep 17 00:00:00 2001
From: deanchanter
Date: Tue, 29 Apr 2025 09:06:46 -0400
Subject: [PATCH] adding pdf support

---
 mcpdoc/main.py | 22 ++++++++++++++++++++--
 pyproject.toml |  1 +
 uv.lock        | 14 ++++++++++++++
 3 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/mcpdoc/main.py b/mcpdoc/main.py
index 278f99e..6d32815 100644
--- a/mcpdoc/main.py
+++ b/mcpdoc/main.py
@@ -7,7 +7,7 @@
 from markdownify import markdownify
 from mcp.server.fastmcp import FastMCP
 from typing_extensions import NotRequired, TypedDict
-
+from pypdf import PdfReader
 
 class DocSource(TypedDict):
     """A source of documentation for a library or a package."""
@@ -21,6 +21,17 @@ class DocSource(TypedDict):
     description: NotRequired[str]
     """Description of the documentation source (optional)."""
 
+def extract_text_from_pdf(pdf_file) -> str:
+    """Return the stripped, newline-joined page text of ``pdf_file``.
+
+    ``pdf_file`` is handed to ``PdfReader`` unchanged; any failure while
+    reading or extracting is re-raised as a chained ``ValueError``.
+    """
+    try:
+        return "\n".join(p.extract_text() for p in PdfReader(pdf_file).pages).strip()
+    except Exception as e:
+        raise ValueError(f"Failed to extract text from PDF: {str(e)}") from e
+
 
 def extract_domain(url: str) -> str:
     """Extract domain from URL.
@@ -213,7 +224,14 @@ async def fetch_docs(url: str) -> str: try: response = await httpx_client.get(url, timeout=timeout) response.raise_for_status() - return markdownify(response.text) + if url.endswith(".txt"): + return response.text + elif url.endswith(".md"): + return markdownify(response.text) + elif url.endswith(".pdf"): + return extract_text_from_pdf(io.BytesIO(response.content)) + else: + return markdownify(response.text) except (httpx.HTTPStatusError, httpx.RequestError) as e: return f"Encountered an HTTP error: {str(e)}" diff --git a/pyproject.toml b/pyproject.toml index 7e6f899..939a39e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ dependencies = [ "httpx>=0.28.1", "markdownify>=1.1.0", "mcp[cli]>=1.4.1", + "pypdf>=5.4.0", "pyyaml>=6.0.1", ] diff --git a/uv.lock b/uv.lock index 9dd00bb..80b1189 100644 --- a/uv.lock +++ b/uv.lock @@ -265,6 +265,7 @@ dependencies = [ { name = "httpx" }, { name = "markdownify" }, { name = "mcp", extra = ["cli"] }, + { name = "pypdf" }, { name = "pyyaml" }, ] @@ -284,6 +285,7 @@ requires-dist = [ { name = "httpx", specifier = ">=0.28.1" }, { name = "markdownify", specifier = ">=1.1.0" }, { name = "mcp", extras = ["cli"], specifier = ">=1.4.1" }, + { name = "pypdf", specifier = ">=5.4.0" }, { name = "pyyaml", specifier = ">=6.0.1" }, ] @@ -436,6 +438,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", size = 1225293 }, ] +[[package]] +name = "pypdf" +version = "5.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f9/43/4026f6ee056306d0e0eb04fcb9f2122a0f1a5c57ad9dc5e0d67399e47194/pypdf-5.4.0.tar.gz", hash = 
"sha256:9af476a9dc30fcb137659b0dec747ea94aa954933c52cf02ee33e39a16fe9175", size = 5012492 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0b/27/d83f8f2a03ca5408dc2cc84b49c0bf3fbf059398a6a2ea7c10acfe28859f/pypdf-5.4.0-py3-none-any.whl", hash = "sha256:db994ab47cadc81057ea1591b90e5b543e2b7ef2d0e31ef41a9bfe763c119dab", size = 302306 }, +] + [[package]] name = "pytest" version = "8.3.5"