Skip to content

Commit 2859eb3

Browse files
authored
TALIE-1176 each page text layer correctness (#527)
1 parent 3d1bd92 commit 2859eb3

File tree

18 files changed

+407
-188
lines changed

18 files changed

+407
-188
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ Relevant documentation of dedoc is available [here](https://dedoc.readthedocs.io
9494
* Article [ISPRAS@FinTOC-2022 shared task: Two-stage TOC generation model](https://aclanthology.org/2022.fnp-1.13.pdf) for the [FinTOC 2022 Shared Task](https://wp.lancs.ac.uk/cfie/fintoc2022/). We are the winners :smiley: :trophy:!
9595
* Article on habr.com [Dedoc: как автоматически извлечь из текстового документа всё и даже немного больше](https://habr.com/ru/companies/isp_ras/articles/779390/) in Russian (2023)
9696
* Article [Dedoc: A Universal System for Extracting Content and Logical Structure From Textual Documents](https://ieeexplore.ieee.org/abstract/document/10508151/) in English (2023)
97+
* Article [Automatic verification of the text layer correctness in PDF documents](https://ieeexplore.ieee.org/abstract/document/10659388/) in English (2024)
9798

9899
# Join Our Community
99100

dedoc/api/api_args.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ class QueryParameters:
2828
description="Extract text from a text layer of PDF or using OCR methods for image-like documents")
2929
fast_textual_layer_detection: str = Form("false", enum=["true", "false"],
3030
description="Use non-ML solution to detect textual layer. Much faster but less accurate.")
31+
each_page_textual_layer_detection: str = Form("false", enum=["true", "false"], description="Detect textual layer on each page. Slower but more accurate.")
3132
language: str = Form("rus+eng", description="Recognition language ('rus+eng', 'rus', 'eng', 'fra', 'spa')")
3233
pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right')
3334
is_one_column_document: str = Form("auto", enum=["auto", "true", "false"],

dedoc/api/web/index.html

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ <h4>Attachments handling</h4>
100100

101101
<div class="parameters">
102102
<h4>PDF handling</h4>
103-
<details><summary>pdf_with_text_layer, need_pdf_table_analysis, fast_textual_layer_detection, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization, need_gost_frame_analysis</summary>
103+
<details><summary>pdf_with_text_layer, need_pdf_table_analysis, fast_textual_layer_detection, each_page_textual_layer_detection, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization, need_gost_frame_analysis</summary>
104104
<br>
105105
<p>
106106
<label>
@@ -117,7 +117,11 @@ <h4>PDF handling</h4>
117117
<p>
118118
<label><input name="fast_textual_layer_detection" type="checkbox" value="true"> fast_textual_layer_detection</label>
119119
</p>
120-
120+
121+
<p>
122+
<label><input name="each_page_textual_layer_detection" type="checkbox" value="true"> each_page_textual_layer_detection</label>
123+
</p>
124+
121125
<p>
122126
<label> language
123127
<input name="language" list="language" size="8" placeholder="rus+eng">
Lines changed: 71 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
from typing import Optional
1+
from typing import List, Optional, Tuple
22

33
from dedoc.data_structures.unstructured_document import UnstructuredDocument
44
from dedoc.readers.base_reader import BaseReader
5+
from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_result import TxtLayerResult
56

67

78
class PdfAutoReader(BaseReader):
@@ -54,100 +55,91 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
5455
"""
5556
parameters = {} if parameters is None else parameters
5657
warnings = []
57-
txtlayer_parameters = self.txtlayer_detector.detect_txtlayer(path=file_path, parameters=parameters)
58+
txtlayer_result = self.txtlayer_detector.detect_txtlayer(path=file_path, parameters=parameters)
5859

59-
if txtlayer_parameters.is_correct_text_layer:
60-
result = self.__handle_correct_text_layer(is_first_page_correct=txtlayer_parameters.is_first_page_correct,
61-
parameters=parameters,
62-
path=file_path,
63-
warnings=warnings)
64-
else:
65-
result = self.__handle_incorrect_text_layer(parameters, file_path, warnings)
60+
documents = []
61+
for txtlayer_result_chunk in txtlayer_result:
62+
document = self.__parse_document(txtlayer_result=txtlayer_result_chunk, parameters=parameters, path=file_path, warnings=warnings)
63+
documents.append(document)
6664

67-
result.warnings.extend(warnings)
68-
return result
65+
result_document = self.__merge_documents(documents)
66+
result_document.warnings.extend(warnings)
67+
return result_document
6968

70-
def __handle_incorrect_text_layer(self, parameters_copy: dict, path: str, warnings: list) -> UnstructuredDocument:
69+
def __parse_document(self, txtlayer_result: TxtLayerResult, parameters: dict, path: str, warnings: list) -> UnstructuredDocument:
7170
import os
7271

73-
self.logger.info(f"Assume document {os.path.basename(path)} has incorrect textual layer")
74-
warnings.append("Assume document has incorrect textual layer")
75-
result = self.pdf_image_reader.read(file_path=path, parameters=parameters_copy)
76-
return result
72+
end = "" if txtlayer_result.end is None else txtlayer_result.end
73+
correct_text = "correct" if txtlayer_result.correct else "incorrect"
74+
log_text = f"Assume document {os.path.basename(path)} has {correct_text} textual layer on pages [{txtlayer_result.start}:{end}]"
75+
self.logger.info(log_text)
76+
warnings.append(log_text)
77+
if txtlayer_result.document:
78+
return txtlayer_result.document
7779

78-
def __handle_correct_text_layer(self, is_first_page_correct: bool, parameters: dict, path: str, warnings: list) -> UnstructuredDocument:
79-
import os
80+
import copy
8081
from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer
8182

82-
self.logger.info(f"Assume document {os.path.basename(path)} has a correct textual layer")
83-
warnings.append("Assume document has a correct textual layer")
84-
recognized_first_page = None
85-
86-
if not is_first_page_correct:
87-
message = "Assume the first page hasn't a textual layer"
88-
warnings.append(message)
89-
self.logger.info(message)
90-
91-
# GET THE FIRST PAGE: recognize the first page like a scanned page
92-
scan_parameters = self.__preparing_first_page_parameters(parameters)
93-
recognized_first_page = self.pdf_image_reader.read(file_path=path, parameters=scan_parameters)
94-
95-
# PREPARE PARAMETERS: from the second page we recognize the content like PDF with a textual layer
96-
parameters = self.__preparing_other_pages_parameters(parameters)
83+
if txtlayer_result.correct:
84+
pdf_with_txt_layer = get_param_pdf_with_txt_layer(parameters)
85+
reader = self.pdf_txtlayer_reader if pdf_with_txt_layer == "auto" else self.pdf_tabby_reader
86+
else:
87+
reader = self.pdf_image_reader
9788

98-
pdf_with_txt_layer = get_param_pdf_with_txt_layer(parameters)
99-
reader = self.pdf_txtlayer_reader if pdf_with_txt_layer == "auto" else self.pdf_tabby_reader
100-
result = reader.read(file_path=path, parameters=parameters)
101-
result = self.__merge_documents(recognized_first_page, result) if recognized_first_page is not None else result
89+
copy_parameters = copy.deepcopy(parameters)
90+
copy_parameters["pages"] = f"{txtlayer_result.start}:{end}"
91+
result = reader.read(file_path=path, parameters=copy_parameters)
10292
return result
10393

104-
def __preparing_first_page_parameters(self, parameters: dict) -> dict:
105-
import copy
106-
from dedoc.utils.parameter_utils import get_param_page_slice
107-
108-
first_page, last_page = get_param_page_slice(parameters)
109-
# calculate indexes for the first page parsing
110-
first_page_index = 0 if first_page is None else first_page
111-
last_page_index = 0
112-
scan_parameters = copy.deepcopy(parameters)
113-
114-
# page numeration in parameters starts with 1, both ends are included
115-
scan_parameters["pages"] = f"{first_page_index + 1}:{last_page_index + 1}"
116-
# if the first page != 0 then we won't read it (because first_page_index > last_page_index)
117-
return scan_parameters
94+
def __merge_documents(self, documents: List[UnstructuredDocument]) -> UnstructuredDocument:
95+
if len(documents) == 0:
96+
raise ValueError("No documents to merge")
11897

119-
def __preparing_other_pages_parameters(self, parameters: dict) -> dict:
120-
from dedoc.utils.parameter_utils import get_param_page_slice
98+
if len(documents) == 1:
99+
return documents[0]
121100

122-
first_page, last_page = get_param_page_slice(parameters)
123-
# parameters for reading pages from the second page
124-
first_page_index = 1 if first_page is None else first_page
125-
last_page_index = "" if last_page is None else last_page
126-
parameters["pages"] = f"{first_page_index + 1}:{last_page_index}"
127-
128-
return parameters
129-
130-
def __merge_documents(self, first: UnstructuredDocument, second: UnstructuredDocument) -> UnstructuredDocument:
131101
from itertools import chain
102+
from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation
132103
from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation
133104
from dedoc.data_structures.line_with_meta import LineWithMeta
134105

135-
tables = first.tables
136-
dropped_tables = set()
137-
for table in second.tables:
138-
if table.metadata.page_id != 0:
139-
tables.append(table)
140-
else:
141-
dropped_tables.add(table.metadata.uid)
142-
143-
lines = []
144-
line_id = 0
145-
for line in chain(first.lines, second.lines):
106+
tables, attachments = self.__prepare_tables_attachments(documents)
107+
warnings = list(set(chain.from_iterable([document.warnings for document in documents])))
108+
table_uids = set([table.metadata.uid for table in tables])
109+
attachment_uids = set([attachment.uid for attachment in attachments])
110+
lines, line_id = [], 0
111+
112+
for line in chain.from_iterable([document.lines for document in documents]):
146113
line.metadata.line_id = line_id
147114
line_id += 1
148-
annotations = [
149-
annotation for annotation in line.annotations if not (isinstance(annotation, TableAnnotation) and annotation.value in dropped_tables)
150-
]
151-
new_line = LineWithMeta(line=line.line, metadata=line.metadata, annotations=annotations, uid=line.uid)
152-
lines.append(new_line)
153-
return UnstructuredDocument(tables=tables, lines=lines, attachments=first.attachments + second.attachments, metadata=second.metadata)
115+
annotations = []
116+
for annotation in line.annotations:
117+
if isinstance(annotation, TableAnnotation) and annotation.value not in table_uids:
118+
continue
119+
if isinstance(annotation, AttachAnnotation) and annotation.value not in attachment_uids:
120+
continue
121+
annotations.append(annotation)
122+
lines.append(LineWithMeta(line=line.line, metadata=line.metadata, annotations=annotations, uid=line.uid))
123+
124+
return UnstructuredDocument(tables=tables, lines=lines, attachments=attachments, metadata=documents[0].metadata, warnings=warnings)
125+
126+
def __prepare_tables_attachments(self, documents: List[UnstructuredDocument]) -> Tuple[list, list]:
    """
    Collect tables and attachments of the given documents, keeping only those that belong to the pages
    actually covered by each document's lines.

    Documents without lines are skipped entirely (their tables and attachments are not collected).

    :param documents: documents obtained for separate page ranges of one file.
    :returns: tuple (tables, attachments) gathered from all documents in the given order.
    """
    from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment

    tables, attachments, attachment_uids = [], [], set()
    for document in documents:
        if not document.lines:
            continue

        # the page range of the document is defined by the pages of its lines;
        # min/max give the same bounds as sorting the lines, without building a sorted copy
        page_ids = [line.metadata.page_id for line in document.lines]
        min_page, max_page = min(page_ids), max(page_ids)

        tables.extend(table for table in document.tables if min_page <= table.metadata.page_id <= max_page)

        for attachment in document.attachments:
            if isinstance(attachment, PdfImageAttachment):
                # image attachments are kept only if they are located on the pages of this document
                if min_page <= attachment.location.page_number <= max_page:
                    attachments.append(attachment)
            elif attachment.uid not in attachment_uids:
                # other attachments are added once, duplicates are detected by uid
                attachment_uids.add(attachment.uid)
                attachments.append(attachment)

    return tables, attachments

dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py

Lines changed: 0 additions & 61 deletions
This file was deleted.

dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier/__init__.py

Whitespace-only changes.
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import logging
2+
from abc import ABC, abstractmethod
3+
from typing import List
4+
5+
import numpy as np
6+
7+
from dedoc.data_structures.line_with_meta import LineWithMeta
8+
9+
10+
class AbstractTxtlayerClassifier(ABC):
    """
    Base class for classifiers of the PDF textual layer correctness.
    """

    def __init__(self, *, config: dict) -> None:
        """
        :param config: configuration dictionary, the value of its "logger" key is used as a logger
            (the root logger is used if the key is absent).
        """
        self.config = config
        default_logger = logging.getLogger()
        self.logger = config.get("logger", default_logger)

    @abstractmethod
    def predict(self, lines: List[List[LineWithMeta]]) -> np.ndarray:
        """
        Decide for each list of textual lines whether the textual layer they were taken from is correct.

        :param lines: list of lists with document textual lines.
        :returns: array of bool values - True if the textual layer is correct, False otherwise.
        """
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
import os
2+
from typing import List
3+
4+
import numpy as np
5+
from xgboost import XGBClassifier
6+
7+
from dedoc.config import get_config
8+
from dedoc.data_structures.line_with_meta import LineWithMeta
9+
from dedoc.download_models import download_from_hub
10+
from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_classifier.abstract_txtlayer_classifier import AbstractTxtlayerClassifier
11+
from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_classifier.txtlayer_feature_extractor import TxtlayerFeatureExtractor
12+
from dedoc.utils.parameter_utils import get_param_gpu_available
13+
14+
15+
class MlTxtlayerClassifier(AbstractTxtlayerClassifier):
    """
    The MlTxtlayerClassifier class is used for classifying the correctness of the textual layer in a PDF document
    using XGBClassifier (only for languages based on cyrillic- or latin-based alphabets).
    """

    def __init__(self, *, config: dict) -> None:
        """
        :param config: configuration dictionary passed to the base class.
        """
        super().__init__(config=config)
        self.feature_extractor = TxtlayerFeatureExtractor()
        self.path = os.path.join(get_config()["resources_path"], "txtlayer_classifier.json")
        # the model is loaded lazily on the first prediction
        self.__model = None

    @property
    def __get_model(self) -> XGBClassifier:
        """
        Return the classifier, loading it from `self.path` on the first access.
        If the model file is absent, it is downloaded beforehand.
        """
        if self.__model is not None:
            return self.__model

        if not os.path.isfile(self.path):
            out_dir, out_name = os.path.split(self.path)
            download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="txtlayer_classifier", hub_name="model.json")

        assert os.path.isfile(self.path)
        self.__model = XGBClassifier()
        self.__model.load_model(self.path)

        if get_param_gpu_available(self.config, self.logger):
            gpu_params = dict(predictor="gpu_predictor", tree_method="auto", gpu_id=0)
            self.__model.set_params(**gpu_params)
            self.__model.get_booster().set_param(gpu_params)

        return self.__model

    def predict(self, lines: List[List[LineWithMeta]]) -> np.ndarray:
        """
        Classify the correctness of the textual layer for each list of lines.
        Lists with empty text are not passed to the model and are classified as incorrect (False).

        :param lines: list of lists with document textual lines.
        :returns: array of bool values - True if the textual layer is correct, False otherwise.
        """
        # bool buffer: the result has the same dtype on every return path
        result = np.zeros(len(lines), dtype=bool)

        idx_list = []
        text_for_inference = []
        for i, line_list in enumerate(lines):
            text_layer = "".join([line.line for line in line_list])
            if not text_layer:
                continue

            if len(text_layer) < 150:
                # short texts are repeated; presumably to reach the text size the features were designed for - TODO confirm
                # NOTE(review): for texts of 76-149 characters the multiplier is 1, so only a leading "\n" is added
                text_layer = f"\n{text_layer}" * (150 // len(text_layer))
            text_for_inference.append(text_layer)
            idx_list.append(i)

        if not text_for_inference:
            return result

        features = self.feature_extractor.transform(text_for_inference)
        predictions = self.__get_model.predict(features)
        result[idx_list] = np.asarray(predictions).astype(bool)
        return result
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from typing import List
2+
3+
import numpy as np
4+
5+
from dedoc.data_structures.line_with_meta import LineWithMeta
6+
from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_classifier.abstract_txtlayer_classifier import AbstractTxtlayerClassifier
7+
8+
9+
class SimpleTxtlayerClassifier(AbstractTxtlayerClassifier):
    """
    Simple textual layer correctness classification.
    The textual layer is considered correct if it isn't empty.
    """

    def predict(self, lines: List[List[LineWithMeta]]) -> np.ndarray:
        """
        Mark each list of lines as correct if at least one of its lines contains non-whitespace characters.

        :param lines: list of lists with document textual lines.
        :returns: array of bool values - True if the textual layer is correct, False otherwise.
        """
        has_text = [any(line.line.strip() for line in line_list) for line_list in lines]
        # explicit dtype keeps the result boolean even for an empty input list
        return np.array(has_text, dtype=bool)
File renamed without changes.

0 commit comments

Comments
 (0)