Skip to content

Weird OCR markdown text inlining #391

@ExtReMLapin

Description

@ExtReMLapin

So we're trying to inline the OCR result inside the markdown export.

from docling.datamodel.accelerator_options import AcceleratorOptions, AcceleratorDevice
from docling.datamodel.base_models import InputFormat, DocumentStream
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption, PowerpointFormatOption, HTMLFormatOption, CsvFormatOption, ExcelFormatOption, MarkdownFormatOption, WordFormatOption
from pathlib import Path
from typing import Union
from docling_core.types.doc.base import ImageRefMode
from docling_core.types.doc import TextItem
class DoclingWorker:
    """Singleton wrapper around a docling ``DocumentConverter``.

    ``__new__`` always hands back the same instance and ``__init__`` only
    configures it once, so the ``pipeline_options`` given to the first
    construction are the ones that stick; later constructions return the
    already-configured worker and ignore their argument.
    """

    # The single shared instance, created on first construction.
    _instance = None

    def __new__(cls, *args, **kwargs):
        """Create the shared instance on first use, then keep returning it."""
        if cls._instance is not None:
            return cls._instance
        cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self, pipeline_options: PdfPipelineOptions = None):
        """Build the converter once, for every registered input format.

        Args:
            pipeline_options: Options shared by all format options below.
                When ``None``, the defaults from
                ``_default_pipeline_options`` are used.
        """
        # __init__ runs on every DoclingWorker(...) call even though __new__
        # returns the same object, so bail out once configured.
        if getattr(self, "_initialized", False):
            return

        if pipeline_options is None:
            pipeline_options = self._default_pipeline_options()

        # NOTE(review): generate_picture_images is passed to PdfFormatOption
        # here rather than set on pipeline_options -- confirm it takes effect.
        format_options = {
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
                generate_picture_images=True,
            ),
        }
        # Remaining formats all receive the same pipeline options. Insertion
        # order matters: it is the order reported by ``supported_formats``.
        # InputFormat.ASCIIDOC is not registered (its entry was commented out).
        other_formats = (
            (InputFormat.DOCX, WordFormatOption),
            (InputFormat.PPTX, PowerpointFormatOption),
            (InputFormat.HTML, HTMLFormatOption),
            (InputFormat.MD, MarkdownFormatOption),
            (InputFormat.CSV, CsvFormatOption),
            (InputFormat.XLSX, ExcelFormatOption),
        )
        for input_format, option_cls in other_formats:
            format_options[input_format] = option_cls(
                pipeline_options=pipeline_options
            )

        self.format_options = format_options
        self.converter = DocumentConverter(format_options=self.format_options)
        self._initialized = True

    @staticmethod
    def _default_pipeline_options():
        """Return the fallback options: no OCR, table structure on, CUDA."""
        defaults = PdfPipelineOptions()
        defaults.do_ocr = False
        defaults.do_table_structure = True
        defaults.accelerator_options = AcceleratorOptions(
            num_threads=1,
            device=AcceleratorDevice.CUDA,
            cuda_use_flash_attention2=True,
        )
        return defaults

    def convert_to_markdown(self, document: Union[Path, str, DocumentStream]) -> str:
        """Convert ``document`` and return its markdown export.

        The export is requested with ``ImageRefMode.INLINE``.
        """
        conversion = self.converter.convert(document)
        return conversion.document.export_to_markdown(image_mode=ImageRefMode.INLINE)

    @property
    def supported_formats(self):
        """List the ``.value`` of every registered input format."""
        return [input_format.value for input_format in self.format_options]



if __name__ == "__main__":
    # Reproduction script: convert two sample PDFs with OCR enabled and print
    # the markdown produced for each.
    from docling.datamodel.pipeline_options import EasyOcrOptions

    ocr_pipeline = PdfPipelineOptions()

    # Core switches: OCR and table structure on, picture description off.
    ocr_pipeline.do_ocr = True
    ocr_pipeline.do_table_structure = True
    ocr_pipeline.accelerator_options = AcceleratorOptions(
        num_threads=1,
        device=AcceleratorDevice.CUDA,
        cuda_use_flash_attention2=True,
    )
    ocr_pipeline.force_backend_text = True

    # French + English EasyOCR with both thresholds set to zero.
    ocr_pipeline.ocr_options = EasyOcrOptions(
        lang=["fr", "en"],
        bitmap_area_threshold=0.0,
        confidence_threshold=0.0,
    )
    ocr_pipeline.do_picture_description = False

    worker = DoclingWorker(pipeline_options=ocr_pipeline)

    for file in ("./pdftest.pdf", "Coucou text du haut.pdf"):
        print(f"--- Converting {file} ---")
        print(worker.convert_to_markdown(file))
        print("--- End of Conversion ---\n\n")

So I gave it a try with two PDFs:

Coucou text du haut.pdf
pdftest.pdf

pdftest looks like this :

Image

But returns :

Hey this is upper text

Hey this is bottom text

## THIS I5 AN IMAGE

Please note the image is placed at bottom


Image

And with the second PDF it returns:

Coucou text du haut

<!-- image -->

Coucou texte du bas

When I apply the PR : #390

Second PDF turns into :

Coucou text du haut

AEXTINCTION NON CONTROLÉE DE LA LUMIERE
RISQUES
DE CHUTES
UTILISATION D'APPAREIL DE VISION NOCTURNE
FORTEMENT CONSEILLÉ

Coucou texte du bas

Which is exactly (or kind of) what we expect.

  1. OCR markdown export doesn't work on all PDFs/images
  2. The exported OCR text isn't correctly placed (unless my fix is applied, but it only applies to non-accepted images)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions