-
Couldn't load subscription status.
- Fork 98
Open
Description
We are trying to inline the OCR results into the Markdown export.
from pathlib import Path
from typing import Optional, Union

from docling.datamodel.accelerator_options import AcceleratorOptions, AcceleratorDevice
from docling.datamodel.base_models import InputFormat, DocumentStream
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import (
    CsvFormatOption,
    DocumentConverter,
    ExcelFormatOption,
    HTMLFormatOption,
    MarkdownFormatOption,
    PdfFormatOption,
    PowerpointFormatOption,
    WordFormatOption,
)
from docling_core.types.doc import TextItem
from docling_core.types.doc.base import ImageRefMode
class DoclingWorker:
    """Shared wrapper around a docling ``DocumentConverter``.

    ``__new__`` always hands back the same object, and ``__init__`` only
    configures it the first time it runs. As a consequence, a
    ``pipeline_options`` argument passed to any later construction is
    ignored.
    """

    # The one shared instance; created lazily by __new__.
    _instance = None

    def __new__(cls, *args, **kwargs):
        """Return the shared instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self, pipeline_options: Optional[PdfPipelineOptions] = None):
        """Build the converter once; later calls return immediately.

        Args:
            pipeline_options: Options handed to every format option below.
                When omitted, a default is built with OCR disabled, table
                structure enabled and a single-threaded CUDA accelerator.
        """
        # __init__ runs on every DoclingWorker(...) call even though __new__
        # returns the existing object, so guard against re-configuration.
        if getattr(self, "_initialized", False):
            return

        if pipeline_options is None:
            accelerator_options = AcceleratorOptions(
                num_threads=1,
                device=AcceleratorDevice.CUDA,
                cuda_use_flash_attention2=True,
            )
            pipeline_options = PdfPipelineOptions()
            pipeline_options.do_ocr = False
            pipeline_options.do_table_structure = True
            pipeline_options.accelerator_options = accelerator_options

        # The same options object is shared by every input format.
        self.format_options = {
            # NOTE(review): generate_picture_images looks like a pipeline
            # option rather than a PdfFormatOption argument - confirm that it
            # actually takes effect when passed here.
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options, generate_picture_images=True),
            InputFormat.DOCX: WordFormatOption(pipeline_options=pipeline_options),
            InputFormat.PPTX: PowerpointFormatOption(pipeline_options=pipeline_options),
            InputFormat.HTML: HTMLFormatOption(pipeline_options=pipeline_options),
            # InputFormat.ASCIIDOC: AsciiDocFormatOption(pipeline_options=pipeline_options),
            InputFormat.MD: MarkdownFormatOption(pipeline_options=pipeline_options),
            InputFormat.CSV: CsvFormatOption(pipeline_options=pipeline_options),
            InputFormat.XLSX: ExcelFormatOption(pipeline_options=pipeline_options),
        }
        self.converter = DocumentConverter(
            format_options=self.format_options
        )
        self._initialized = True

    def convert_to_markdown(self, document: Union[Path, str, DocumentStream]) -> str:
        """Convert ``document`` and return its Markdown export.

        Args:
            document: A path, a string, or a ``DocumentStream``; it is passed
                unchanged to ``DocumentConverter.convert``.

        Returns:
            The result of ``export_to_markdown`` on the converted document.
        """
        result = self.converter.convert(document)
        # NOTE(review): confirm that the installed docling-core defines
        # ImageRefMode.INLINE.
        return result.document.export_to_markdown(image_mode=ImageRefMode.INLINE)

    @property
    def supported_formats(self):
        """List the ``.value`` of every configured ``InputFormat`` key."""
        return [input_format.value for input_format in self.format_options]
if __name__ == "__main__":
    from docling.datamodel.pipeline_options import EasyOcrOptions

    # Demo run: convert two sample PDFs with OCR switched on and print the
    # Markdown produced for each.
    ocr_pipeline = PdfPipelineOptions()
    ocr_pipeline.do_ocr = True
    ocr_pipeline.do_table_structure = True
    ocr_pipeline.force_backend_text = True
    ocr_pipeline.do_picture_description = False
    ocr_pipeline.accelerator_options = AcceleratorOptions(
        num_threads=1, device=AcceleratorDevice.CUDA, cuda_use_flash_attention2=True
    )
    # EasyOCR for French and English with both thresholds at 0.0 -
    # presumably so that no bitmap or OCR result is filtered out; confirm
    # against the EasyOcrOptions documentation.
    ocr_pipeline.ocr_options = EasyOcrOptions(
        lang=['fr', 'en'],
        bitmap_area_threshold=0.0,
        confidence_threshold=0.0
    )

    worker = DoclingWorker(pipeline_options=ocr_pipeline)

    for file in ["./pdftest.pdf", "Coucou text du haut.pdf"]:
        print(f"--- Converting {file} ---")
        markdown = worker.convert_to_markdown(file)
        print(markdown)
        print("--- End of Conversion ---\n\n")

So I tried it with two PDFs:
Coucou text du haut.pdf
pdftest.pdf
pdftest.pdf looks like this:
But it returns:
Hey this is upper text
Hey this is bottom text
## THIS I5 AN IMAGE
Please note the image is placed at bottom
And with the second PDF it returns:
Coucou text du haut
<!-- image -->
Coucou texte du bas
When I apply PR #390,
the second PDF turns into:
Coucou text du haut
AEXTINCTION NON CONTROLÉE DE LA LUMIERE
RISQUES
DE CHUTES
UTILISATION D'APPAREIL DE VISION NOCTURNE
FORTEMENT CONSEILLÉ
Coucou texte du bas
This is exactly (or at least nearly) what we expect.
- The OCR Markdown export doesn't work on all PDFs/images.
- The exported OCR text isn't placed correctly (unless my fix is applied, and the fix only applies to non-accepted images).
Metadata
Metadata
Assignees
Labels
No labels