
Commit a51531c

Merge pull request #2 from spa5k/chunking
Chunking
2 parents f8fc3fd + 55ace1c commit a51531c


7 files changed: +1083 -152 lines changed


Dockerfile

Lines changed: 28 additions & 15 deletions
@@ -8,16 +8,10 @@ RUN apt-get update && \
     apt-get install -y --no-install-recommends libgl1 libglib2.0-0 && \
     rm -rf /var/lib/apt/lists/*
 
-# Enable bytecode compilation and set proper link mode for cache mounting
-ENV UV_COMPILE_BYTECODE=1 \
-    UV_LINK_MODE=copy \
-    HF_HOME=/app/.cache/huggingface \
-    TORCH_HOME=/app/.cache/torch \
-    PYTHONPATH=/app \
-    OMP_NUM_THREADS=4
-
-# Copy dependency files and README
-COPY pyproject.toml uv.lock README.md ./
+# Copy only dependency files and create a dummy README
+COPY pyproject.toml uv.lock ./
+# Create a dummy README.md file to satisfy package requirements
+RUN echo "# Placeholder README" > README.md
 
 # Install dependencies but not the project itself
 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -46,9 +40,12 @@ RUN ARCH=$(uname -m) && \
         uv pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121; \
     fi
 
-# Install the project in non-editable mode
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv sync --frozen --no-editable
+# Download models
+RUN . /app/.venv/bin/activate && \
+    mkdir -p /app/.cache && \
+    python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True);' && \
+    python -c 'import easyocr; reader = easyocr.Reader(["fr", "de", "es", "en", "it", "pt"], gpu=True); print("EasyOCR models downloaded successfully")' && \
+    python -c 'from chonkie import SDPMChunker; chunker = SDPMChunker(embedding_model="minishlab/potion-base-8M"); print("Chonkie models downloaded successfully")'
 
 # Download models for the pipeline
 RUN uv run python -c "from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; artifacts_path = StandardPdfPipeline.download_models_hf(force=True)"
@@ -62,6 +59,8 @@ RUN ARCH=$(uname -m) && \
         echo "Downloading EasyOCR models with GPU support" && \
         uv run python -c "import easyocr; reader = easyocr.Reader(['fr', 'de', 'es', 'en', 'it', 'pt'], gpu=True); print('EasyOCR GPU models downloaded successfully')"; \
     fi
+
+RUN uv run python -c 'from chonkie import SDPMChunker; chunker = SDPMChunker(embedding_model="minishlab/potion-base-8M"); print("Chonkie models downloaded successfully")'
 
 # Production stage
 FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
@@ -72,8 +71,22 @@ RUN apt-get update && \
     apt-get install -y --no-install-recommends redis-server libgl1 libglib2.0-0 curl && \
     rm -rf /var/lib/apt/lists/*
 
-# Set environment variables
-ENV HF_HOME=/app/.cache/huggingface \
+# Copy model cache from builder - this rarely changes
+COPY --from=builder --chown=app:app /app/.cache /app/.cache/
+COPY --from=builder --chown=app:app /app/.venv /app/.venv/
+
+# Create dummy README and copy dependency files
+RUN echo "# Placeholder README" > README.md
+COPY --chown=app:app pyproject.toml uv.lock ./
+
+# Copy project files from disk
+COPY --chown=app:app document_converter/ ./document_converter/
+COPY --chown=app:app worker/ ./worker/
+COPY --chown=app:app main.py ./
+
+# Set up Python environment
+ENV PYTHONPATH=/app \
+    HF_HOME=/app/.cache/huggingface \
     TORCH_HOME=/app/.cache/torch \
     PYTHONPATH=/app \
     OMP_NUM_THREADS=4 \
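
Taken together, the builder-stage change swaps the `uv sync --no-editable` step for a build-time model warm-up, and the production stage then copies `/app/.cache` and `/app/.venv` from the builder. For readability, the three chained `python -c` calls amount to running a small script like the one below inside the builder's virtualenv (a sketch only; the commit keeps them as inline RUN commands, and the script name here is made up):

```python
# warm_model_caches.py - hypothetical name; mirrors the builder-stage RUN commands.
# Populates /app/.cache (HF_HOME / TORCH_HOME) so the production image never
# has to download models at runtime.

import easyocr
from chonkie import SDPMChunker
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

# Docling layout/table models go into the Hugging Face cache.
artifacts_path = StandardPdfPipeline.download_models_hf(force=True)
print(f"Docling models downloaded to {artifacts_path}")

# EasyOCR fetches detector/recognizer weights for the supported languages.
reader = easyocr.Reader(["fr", "de", "es", "en", "it", "pt"], gpu=True)
print("EasyOCR models downloaded successfully")

# Instantiating the SDPM chunker pulls its embedding model.
chunker = SDPMChunker(embedding_model="minishlab/potion-base-8M")
print("Chonkie models downloaded successfully")
```

Because this layer depends only on the dependency files copied earlier, changes to the application code no longer appear to invalidate the model-download layers, which seems to be the point of the reorganization.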

README.md

Lines changed: 72 additions & 0 deletions
@@ -34,12 +34,14 @@
 - Image extraction and processing
 - Multi-language OCR support (French, German, Spanish, English, Italian, Portuguese etc)
 - Configurable image resolution scaling
+- Document chunking for LLM processing and RAG applications
 
 - **API Endpoints**:
   - Synchronous single document conversion
   - Synchronous batch document conversion
   - Asynchronous single document conversion with job tracking
   - Asynchronous batch conversion with job tracking
+  - Document chunking for completed conversion jobs
 
 - **Processing Modes**:
   - CPU-only processing for standard deployments
@@ -236,6 +238,76 @@ curl -X POST "http://localhost:8080/batch-conversion-jobs" \
   -F "documents=@/path/to/document2.pdf"
 ```
 
+### Document Chunking
+
+After converting documents, you can generate text chunks optimized for LLM processing:
+
+1. Chunk a single converted document:
+
+```bash
+curl -X GET "http://localhost:8080/conversion-jobs/{job_id}/chunks?max_tokens=512&merge_peers=true&include_page_numbers=true" \
+  -H "accept: application/json"
+```
+
+2. Chunk all documents from a batch conversion:
+
+```bash
+curl -X GET "http://localhost:8080/batch-conversion-jobs/{job_id}/chunks?max_tokens=512&merge_peers=true&include_page_numbers=true" \
+  -H "accept: application/json"
+```
+
+3. Chunk text directly (without requiring a conversion job):
+
+```bash
+curl -X POST "http://localhost:8080/text/chunk" \
+  -H "accept: application/json" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "text": "This is the text content that needs to be chunked. It can be as long as needed.",
+    "filename": "example.txt",
+    "max_tokens": 512,
+    "merge_peers": true,
+    "include_page_numbers": false
+  }'
+```
+
+Chunking parameters:
+- `max_tokens`: Maximum number of tokens per chunk (range: 64-2048, default: 512)
+- `merge_peers`: Whether to merge undersized peer chunks (default: true)
+- `include_page_numbers`: Whether to include page number references in chunk metadata (default: false)
+
+#### Chunking Implementation
+
+The API uses the Semantic Double-Pass Merging (SDPM) algorithm from the Chonkie library to produce high-quality chunks with improved context preservation. This chunker:
+
+1. Groups content by semantic similarity
+2. Merges similar groups within a skip window
+3. Connects related content that may not be consecutive in the text
+4. Preserves contextual relationships between different parts of the document
+
+The chunker is particularly effective for documents with recurring themes or concepts spread throughout the text.
+
+The response includes:
+```json
+{
+  "job_id": "the-job-id",
+  "filename": "document-name",
+  "chunks": [
+    {
+      "text": "Plain text content of the chunk without additional context",
+      "metadata": {
+        "token_count": 123,
+        "start_index": 0,
+        "end_index": 512,
+        "sentence_count": 5,
+        "page_number": 1
+      }
+    }
+  ],
+  "error": null // Error message if chunking failed
+}
+```
+
 ## Configuration Options
 
 - `image_resolution_scale`: Control the resolution of extracted images (1-4)
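
The "Chunking Implementation" subsection added above describes behaviour that can also be exercised directly against the Chonkie library, using the same embedding model the Dockerfile pre-downloads. Below is a minimal sketch; the `embedding_model` value comes from this commit, while the other keyword arguments and the chunk attributes are assumptions about Chonkie's public interface rather than something the commit defines:

```python
# Standalone SDPM chunking sketch, outside the HTTP API.
from chonkie import SDPMChunker

chunker = SDPMChunker(
    embedding_model="minishlab/potion-base-8M",  # same model the image pre-downloads
    chunk_size=512,                              # roughly the API's max_tokens default (assumed kwarg)
    skip_window=1,                               # merge similar groups across one gap (assumed kwarg)
)

text = (
    "This is the text content that needs to be chunked. "
    "It can be as long as needed."
)

# Each chunk is expected to expose text and token_count, mirroring the
# metadata fields shown in the API response above.
for chunk in chunker.chunk(text):
    print(chunk.token_count, repr(chunk.text[:60]))
```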
