Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 24 additions & 40 deletions Dockerfile
100755 → 100644
Original file line number Diff line number Diff line change
@@ -1,54 +1,38 @@
# Base image: the tag selects the PyTorch 2.4.0 runtime variant with CUDA 11.8 and cuDNN 9.
FROM pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime
# Bring the uv and uvx executables in from the upstream uv image and place them in /bin.
# NOTE(review): `latest` is a floating tag, so a rebuild may pull a different uv release;
# consider pinning a version tag or digest.
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/

# NOTE(review): this listing looks like a rendered diff, and the lines in this group appear
# to be the pre-change side that the single combined install further down replaces — confirm.
# Refresh the apt package index in a layer of its own.
# NOTE(review): with `apt-get update` in a separate RUN, a cached (possibly stale) index
# layer can be reused by the install layers that follow.
RUN apt-get update
# System libraries plus build and document-conversion tools, without recommended extras.
RUN apt-get install --fix-missing -y -q --no-install-recommends libgomp1 ffmpeg libsm6 pdftohtml libxext6 git ninja-build g++ qpdf pandoc curl


# OCR tooling: ocrmypdf, then one tesseract-ocr-* package per RUN (one image layer each).
RUN apt-get install -y ocrmypdf
RUN apt-get install -y tesseract-ocr-fra
RUN apt-get install -y tesseract-ocr-spa
RUN apt-get install -y tesseract-ocr-deu
RUN apt-get install -y tesseract-ocr-ara
RUN apt-get install -y tesseract-ocr-mya
RUN apt-get install -y tesseract-ocr-hin
RUN apt-get install -y tesseract-ocr-tam
RUN apt-get install -y tesseract-ocr-tha
RUN apt-get install -y tesseract-ocr-chi-sim
RUN apt-get install -y tesseract-ocr-tur
RUN apt-get install -y tesseract-ocr-ukr
RUN apt-get install -y tesseract-ocr-ell
RUN apt-get install -y tesseract-ocr-rus
RUN apt-get install -y tesseract-ocr-kor
RUN apt-get install -y tesseract-ocr-kor-vert
# uv and uvx executables are taken from the upstream uv image and placed in /bin.
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/

# System libraries, build tooling, ocrmypdf and the tesseract-ocr-* packages, installed
# in one layer. Packages are listed one per line in alphabetical order, and the apt index
# is removed in the same layer so it does not persist in the image.
RUN apt-get update \
    && apt-get install --fix-missing -y -q --no-install-recommends \
        curl \
        ffmpeg \
        g++ \
        git \
        libgomp1 \
        libsm6 \
        libxext6 \
        ninja-build \
        ocrmypdf \
        pandoc \
        pdftohtml \
        qpdf \
        tesseract-ocr-ara \
        tesseract-ocr-chi-sim \
        tesseract-ocr-deu \
        tesseract-ocr-ell \
        tesseract-ocr-fra \
        tesseract-ocr-hin \
        tesseract-ocr-kor \
        tesseract-ocr-kor-vert \
        tesseract-ocr-mya \
        tesseract-ocr-rus \
        tesseract-ocr-spa \
        tesseract-ocr-tam \
        tesseract-ocr-tha \
        tesseract-ocr-tur \
        tesseract-ocr-ukr \
    && rm -rf /var/lib/apt/lists/*

# Create the source and model directories in a single layer (one RUN instead of two),
# then make /app the working directory for the instructions that follow.
RUN mkdir -p /app/src /app/models
WORKDIR /app

# NOTE(review): this listing looks like a rendered diff; the three user-setup lines below
# may belong to the pre-change side — confirm which revision they come from.
# Create a system group and a system user, both named "python".
RUN addgroup --system python && adduser --system --group python
# Hand ownership of everything under /app to that user.
RUN chown -R python:python /app
# Execute the following instructions as the "python" user.
USER python
# Copy only the project metadata and the uv lockfile, ahead of the application source.
COPY pyproject.toml uv.lock ./

# Location of the virtual environment inside the image.
ENV VIRTUAL_ENV=/app/.venv
# Create the virtual environment with Python's built-in venv module.
RUN python -m venv $VIRTUAL_ENV
# Put the environment's bin directory first on PATH so its executables take precedence.
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
# Install the project's dependencies with uv; --frozen uses uv.lock as it is, without updating it.
RUN uv sync --frozen

# Copy the requirements list into the working directory.
COPY requirements.txt requirements.txt
# Upgrade pip through uv's pip-compatible interface.
RUN uv pip install --upgrade pip
# Install everything listed in requirements.txt.
RUN uv pip install -r requirements.txt
# Copy the application sources, the models directory and the start script into the working directory.
COPY ./src/. ./src
COPY ./models/. ./models/
COPY ./start.sh ./start.sh

WORKDIR /app
# Set up detectron2 at a fixed commit. --no-build-isolation lets its build use the torch
# that is already installed. The clone target and the editable-install path are given
# explicitly (and `git -C` is used for the checkout) so no `cd` is needed inside RUN.
RUN git clone https://github.com/facebookresearch/detectron2 src/detectron2 \
    && git -C src/detectron2 checkout 70f454304e1a38378200459dd2dbca0f0f4a5ab4 \
    && uv pip install --no-build-isolation -e src/detectron2

# NOTE(review): this listing looks like a rendered diff; the clone/build lines below appear
# to be the pre-change way of installing detectron2 — confirm.
# Clone detectron2 inside the src directory.
RUN cd src; git clone https://github.com/facebookresearch/detectron2;
# Check out a fixed commit and build it in development mode through setup.py.
# NOTE(review): the commands are joined with `;`, so a failing step does not stop the
# ones after it; only the last command's exit status decides whether the RUN fails.
RUN cd src/detectron2; git checkout 70f454304e1a38378200459dd2dbca0f0f4a5ab4; python setup.py build develop
# Install pycocotools at a fixed version.
RUN uv pip install pycocotools==2.0.8
# Run the model download script through uv.
RUN uv run python src/download_models.py

# Copy the start script, the sources and the models directory into the working directory.
COPY ./start.sh ./start.sh
COPY ./src/. ./src
COPY ./models/. ./models/
# Run the model download script with the `python` found on PATH.
RUN python src/download_models.py
# Import detectron2 and print the path it was loaded from; the build stops here if the import fails.
RUN uv run python -c "import detectron2; print(f'detectron2 installed at: {detectron2.__file__}')"

# Append /app/src to PYTHONPATH (legacy space-separated ENV form).
ENV PYTHONPATH "${PYTHONPATH}:/app/src"
# The same assignment written in key=value form.
# NOTE(review): if PYTHONPATH is unset at this point, the resulting value begins with an
# empty entry before the colon — confirm that is intended.
ENV PYTHONPATH="${PYTHONPATH}:/app/src"
# Presumably read by the Hugging Face transformers library to limit log output to errors
# and to turn off advisory warnings — verify against the application code.
ENV TRANSFORMERS_VERBOSITY=error
ENV TRANSFORMERS_NO_ADVISORY_WARNINGS=1
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ dependencies = [
"requests==2.32.3",
"torch==2.4.0",
"torchvision==0.19.0",
"prometheus-fastapi-instrumentator>=7.1.0,<8",
"timm==1.0.8",
"Pillow==10.4.0",
"pdf-annotate==0.12.0",
Expand All @@ -36,4 +37,4 @@ dependencies = [
HURIDOCS = "https://huridocs.org"
GitHub = "https://github.com/huridocs/pdf-document-layout-analysis"
HuggingFace = "https://huggingface.co/HURIDOCS/pdf-document-layout-analysis"
DockerHub = "https://hub.docker.com/r/huridocs/pdf-document-layout-analysis"
DockerHub = "https://hub.docker.com/r/huridocs/pdf-document-layout-analysis"
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,5 @@ latex2mathml==3.78.0
PyMuPDF==1.25.5
ollama==0.6.0
cachetools==6.2.1
git+https://github.com/huridocs/[email protected]
prometheus-fastapi-instrumentator>=7.1.0,<8
git+https://github.com/huridocs/[email protected]
3 changes: 2 additions & 1 deletion src/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from catch_exceptions import catch_exceptions
from typing import Optional, Union
from starlette.concurrency import run_in_threadpool
from prometheus_fastapi_instrumentator import Instrumentator
import torch
import sys
import subprocess
Expand All @@ -21,7 +22,7 @@
controllers = setup_dependencies()

app = FastAPI()

Instrumentator().instrument(app).expose(app)

@app.get("/")
async def root():
Expand Down
Loading