Skip to content

Commit d6330ef

Browse files
sinkudo and NastyBoget authored
pdf broken encoding reader (#522)
Co-authored-by: Zykina (Bogatenkova) Anastasiya <[email protected]>
1 parent 2859eb3 commit d6330ef

File tree

25 files changed

+1533871
-7
lines changed

25 files changed

+1533871
-7
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,4 +148,4 @@ crashlytics-build.properties
148148
fabric.properties
149149

150150
# Mac OS extensions
151-
*.DS_Store
151+
*.DS_Store

Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ ENV RESOURCES_PATH "/dedoc_root/resources"
88

99
COPY requirements.txt .
1010
RUN pip3 install --no-cache-dir -r requirements.txt
11+
# Install fontforge (presumably needed by the PDF broken-encoding reader's fontforge wrapper — confirm).
# The apt package lists are removed in the same layer so they do not bloat the image.
RUN apt-get update && \
    apt-get install -y --fix-missing --no-install-recommends fontforge && \
    rm -rf /var/lib/apt/lists/*
1112

1213
RUN mkdir /dedoc_root
1314
RUN mkdir /dedoc_root/dedoc

dedoc/api/api_args.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ class QueryParameters:
2424
table_type: str = Form("", description="Pipeline mode for table recognition")
2525

2626
# pdf handling
27-
pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"],
27+
pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby", "bad_encoding_reader"],
2828
description="Extract text from a text layer of PDF or using OCR methods for image-like documents")
2929
fast_textual_layer_detection: str = Form("false", enum=["true", "false"],
3030
description="Use non-ML solution to detect textual layer. Much faster but less accurate.")

dedoc/api/web/index.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ <h4>PDF handling</h4>
110110
<option value="auto">auto</option>
111111
<option value="auto_tabby" selected>auto_tabby</option>
112112
<option value="tabby">tabby</option>
113+
<option value="bad_encoding_reader">bad_encoding_reader</option>
113114
</select> pdf_with_text_layer
114115
</label>
115116
</p>

dedoc/manager_config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ def _get_manager_config(config: dict) -> dict:
3434
from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader
3535
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader
3636
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader
37+
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_broken_encoding_reader.pdf_broken_encoding_reader import PdfBrokenEncodingReader
3738
from dedoc.readers.pptx_reader.pptx_reader import PptxReader
3839
from dedoc.readers.reader_composition import ReaderComposition
3940
from dedoc.readers.txt_reader.raw_text_reader import RawTextReader
@@ -73,6 +74,7 @@ def _get_manager_config(config: dict) -> dict:
7374
PdfAutoReader(config=config),
7475
PdfTabbyReader(config=config),
7576
PdfTxtlayerReader(config=config),
77+
PdfBrokenEncodingReader(config=config),
7678
PdfImageReader(config=config),
7779
EmailReader(config=config),
7880
MhtmlReader(config=config)

dedoc/readers/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from .pdf_reader.pdf_auto_reader.pdf_auto_reader import PdfAutoReader
1313
from .pdf_reader.pdf_base_reader import PdfBaseReader
1414
from .pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader
15+
from .pdf_reader.pdf_txtlayer_reader.pdf_broken_encoding_reader.pdf_broken_encoding_reader import PdfBrokenEncodingReader
1516
from .pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader
1617
from .pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader
1718
from .pptx_reader.pptx_reader import PptxReader
@@ -20,4 +21,4 @@
2021

2122
__all__ = ['ArchiveReader', 'ArticleReader', 'BaseReader', 'CSVReader', 'DocxReader', 'EmailReader', 'ExcelReader', 'HtmlReader', 'JsonReader', 'MhtmlReader',
2223
'NoteReader', 'PptxReader', 'ReaderComposition', 'RawTextReader', 'PdfBaseReader', 'PdfImageReader', 'PdfTabbyReader', 'PdfTxtlayerReader',
23-
'PdfAutoReader']
24+
'PdfAutoReader', 'PdfBrokenEncodingReader']

dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_broken_encoding_reader/__init__.py

Whitespace-only changes.
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
import enum
2+
from functools import cached_property
3+
from pathlib import Path
4+
from typing import Dict, List, Type
5+
6+
# Character pools keyed by alphabet name.
# - "rus_eng", "rus", "eng" contain both lowercase and uppercase letters;
#   the "*_no_reg_diff" variants contain lowercase letters only.
# - Every pool also contains the digits 0-9, ASCII punctuation, "©" and "™".
# - The Cyrillic ranges do not include "ё"/"Ё".
# NOTE(review): element order is kept exactly as written (e.g. "rus_eng" lists ".", "," while the
# other mixed-case pools list ",", "."); presumably list positions are meaningful to the consumers
# of these pools — confirm before reordering or deduplicating.
char_pool = dict(
    # Latin + Cyrillic, case-sensitive
    rus_eng=[
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u",
        "v", "w", "x", "y", "z", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P",
        "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "а", "б", "в", "г", "д", "е", "ж", "з", "и", "й", "к",
        "л", "м", "н", "о", "п", "р", "с", "т", "у", "ф", "х", "ц", "ч", "ш", "щ", "ъ", "ы", "ь", "э", "ю", "я",
        "А", "Б", "В", "Г", "Д", "Е", "Ж", "З", "И", "Й", "К", "Л", "М", "Н", "О", "П", "Р", "С", "Т", "У", "Ф",
        "Х", "Ц", "Ч", "Ш", "Щ", "Ъ", "Ы", "Ь", "Э", "Ю", "Я", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
        "!", '"', "#", "$", "%", "&", "'", "(", ")", "*", "+", "-", ".", ",", "/", ":", ";", "<", "=", ">", "?",
        "@", "[", "\\", "]", "^", "_", "`", "{", "|", "}", "~", "©", "™"
    ],
    # Latin + Cyrillic, lowercase only
    rus_eng_no_reg_diff=[
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s",
        "t", "u", "v", "w", "x", "y", "z", "а", "б", "в", "г", "д", "е", "ж", "з", "и", "й", "к",
        "л", "м", "н", "о", "п", "р", "с", "т", "у", "ф", "х", "ц", "ч", "ш", "щ", "ъ", "ы", "ь", "э",
        "ю", "я", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "!", '"', "#", "$", "%", "&", "'",
        "(", ")", "*", "+", "-", ".", ",", "/", ":", ";", "<", "=", ">", "?", "@", "[", "\\", "]", "^",
        "_", "`", "{", "|", "}", "~", "©", "™"
    ],
    # Cyrillic only, case-sensitive
    rus=[
        "а", "б", "в", "г", "д", "е", "ж", "з", "и", "й", "к", "л", "м", "н", "о", "п", "р", "с", "т", "у", "ф",
        "х", "ц", "ч", "ш", "щ", "ъ", "ы", "ь", "э", "ю", "я", "А", "Б", "В", "Г", "Д", "Е", "Ж", "З", "И", "Й",
        "К", "Л", "М", "Н", "О", "П", "Р", "С", "Т", "У", "Ф", "Х", "Ц", "Ч", "Ш", "Щ", "Ъ", "Ы", "Ь", "Э", "Ю",
        "Я", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "!", '"', "#", "$", "%", "&", "'", "(", ")", "*",
        "+", "-", ",", ".", "/", ":", ";", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "_", "`", "{", "|",
        "}", "~", "©", "™"
    ],
    # Cyrillic only, lowercase only
    rus_no_reg_diff=[
        "а", "б", "в", "г", "д", "е", "ж", "з", "и", "й", "к", "л", "м", "н", "о", "п", "р", "с", "т", "у",
        "ф", "х", "ц", "ч", "ш", "щ", "ъ", "ы", "ь", "э", "ю", "я", "0", "1", "2", "3", "4", "5", "6", "7",
        "8", "9", "!", '"', "#", "$", "%", "&", "'", "(", ")", "*", "+", "-", ",", ".", "/", ":", ";", "<",
        "=", ">", "?", "@", "[", "\\", "]", "^", "_", "`", "{", "|", "}", "~", "©", "™"
    ],
    # Latin only, case-sensitive
    eng=[
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u",
        "v", "w", "x", "y", "z", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P",
        "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "!",
        '"', "#", "$", "%", "&", "'", "(", ")", "*", "+", "-", ",", ".", "/", ":", ";", "<", "=", ">", "?", "@",
        "[", "\\", "]", "^", "_", "`", "{", "|", "}", "~", "©", "™"
    ],
    # Latin only, lowercase only
    eng_no_reg_diff=[
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t",
        "u", "v", "w", "x", "y", "z", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "!", '"', "#", "$",
        "%", "&", "'", "(", ")", "*", "+", "-", ",", ".", "/", ":", ";", "<", "=", ">", "?", "@", "[",
        "\\", "]", "^", "_", "`", "{", "|", "}", "~", "©", "™"
    ]
)
53+
54+
# Auxiliary groups of characters / glyph names.
other = dict(
    # Characters grouped under "bottom_align" (presumably drawn at the bottom of a glyph image —
    # confirm with the code that reads this key).
    bottom_align=[",", ".", "_"],
    # Entries grouped under "dont_aug" (judging by the key, excluded from augmentation — confirm).
    # The list mixes literal characters with names such as "dot", "quotedbl", "colon" and "backslash"
    # (presumably font glyph names — TODO confirm).
    dont_aug=[
        ",", "dot", "\\", "`", "_", "-", "=", ";", ":", "quotedbl", "colon", "backslash", ")", "(", "[", "]", "<",
        ">", "~", "+", "'"
    ]
)
61+
62+
# Character substitution tables.
convert = dict(
    # Maps a lowercase Latin letter to a lowercase Cyrillic letter
    # (presumably visually similar glyphs that can be confused with each other — confirm against the caller).
    convert_chars_to_rus={
        "a": "а", "b": "в", "c": "с", "d": "д", "e": "е", "h": "н", "k": "к", "m": "м", "o": "о", "p": "р", "r": "г",
        "y": "у", "t": "т", "u": "и", "x": "х"
    }
)
68+
69+
70+
class FolderPaths:
    """Lazily built table of filesystem locations used by the broken-encoding PDF reader."""

    @cached_property
    def paths(self) -> Dict[str, Path]:
        """
        Build the mapping from a logical resource name to its path under the project root.

        The project root comes from ``get_project_root``, which is imported inside the method
        instead of at module level (presumably to avoid a circular import — confirm).
        The mapping is computed on first access and then cached on the instance.

        :return: dictionary ``name -> Path``
        """
        from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_broken_encoding_reader.functions import get_project_root

        root = Path(get_project_root())
        data = root / "data"
        datasets = data / "datasets"
        pdf_data = data / "pdfdata"
        models = data / "models"

        return {
            "fonts_folders": data / "fonts_folders",
            "images_folder": datasets / "test2",
            "output_train": datasets / "images" / "output",
            "last_prepared_data": datasets / "last_prepared",
            "extracted_data_folder": pdf_data,
            "extracted_fonts_folder": pdf_data / "extracted_fonts",
            "extracted_glyphs_folder": pdf_data / "glyph_images",
            "default_models_folder": models / "default_models",
            "custom_models_folder": models / "custom_models",
            "datasets_folder": datasets,
            # despite the "_folder" suffix this entry points at a Python file
            "ffwraper_folder": root / "ffwrapper" / "fontforge_wrapper.py",
        }
88+
89+
90+
# Module-level path table, built once at import time by evaluating FolderPaths().paths
# (this runs the import of get_project_root that lives inside the property).
folders = FolderPaths().paths
91+
92+
93+
def get_default_models() -> List[str]:
    """
    List the names of the default models found on disk.

    Looks for ``*.pt`` files directly inside ``folders["default_models_folder"]`` and returns their
    stems (file names without the extension). The names are sorted because ``Path.glob`` yields
    entries in an arbitrary, filesystem-dependent order, which would otherwise make the result
    differ between machines.

    :return: sorted list of model names; empty if the folder contains no ``*.pt`` files
    """
    # index instead of .get(): a missing key should fail as a clear KeyError, not as Path(None)
    models_folder = Path(folders["default_models_folder"])
    return sorted(model_file.stem for model_file in models_folder.glob("*.pt"))
96+
97+
98+
# Names of the default models, collected once at import time;
# model files added to the folder afterwards are not reflected here.
default_models = get_default_models()
99+
100+
101+
def chars_to_code(char_list: List[str]) -> List[int]:
    """
    Convert single-character strings to their Unicode code points, keeping the input order.

    :param char_list: characters to convert, each a string of length one
    :return: list with ``ord`` of every character
    """
    return list(map(ord, char_list))
103+
104+
105+
class Language(enum.Enum):
    """
    Character pools selectable by language.

    The value of every member is the corresponding list of characters from ``char_pool``.
    """
    Russian_and_English_no_reg_diff = char_pool["rus_eng_no_reg_diff"]
    Russian_no_reg_diff = char_pool["rus_no_reg_diff"]
    English_no_reg_diff = char_pool["eng_no_reg_diff"]
    Russian_and_English = char_pool["rus_eng"]
    Russian = char_pool["rus"]
    English = char_pool["eng"]

    @classmethod
    def from_string(cls: Type["Language"], model_name: str) -> "Language":
        """
        Resolve a model name to the matching case-sensitive language member.

        :param model_name: one of "rus", "eng", "ruseng" (compared case-insensitively)
        :return: ``Language.Russian``, ``Language.English`` or ``Language.Russian_and_English``
        :raises ValueError: if the name is not one of the supported values
        """
        mapping = {
            "ruseng": cls.Russian_and_English,
            "rus": cls.Russian,
            "eng": cls.English
        }
        try:
            return mapping[model_name.lower()]
        except KeyError:
            # "from None": the internal dictionary lookup failure is an implementation detail,
            # do not attach it to the error the caller sees
            raise ValueError("Incorrect model_name (rus, eng, ruseng)") from None

0 commit comments

Comments
 (0)