-
Notifications
You must be signed in to change notification settings - Fork 677
Closed
Labels
not a bug — not a bug / user error / unable to reproduce
Description
Description of the bug
The problem occurs with this PDF:
file_preview3.pdf
When I use the text-block bboxes to crop the rendered page, I get four black dots instead of the expected text regions; the cropped images do not match the text. Please fix this bug.
How to reproduce the bug
code:
def crop(pix1, bbox):
    """Copy the ``bbox`` region of ``pix1`` into a new RGB pixmap.

    Args:
        pix1: Source pixmap to copy pixels from.
        bbox: Sequence of four values ``(x0, y0, x1, y1)``; it is passed to
            ``fitz.IRect`` to size the new pixmap and used as the copy region.

    Returns:
        A new ``fitz.Pixmap`` in ``fitz.csRGB`` covering ``bbox``, filled
        with 255 (white) before the region is copied from ``pix1``.
    """
    x0, y0, x1, y1 = bbox
    # Create an all-white image with the same extent as the bounding box.
    pix2 = fitz.Pixmap(fitz.csRGB, fitz.IRect(bbox))
    pix2.clear_with(255)  # fill with white
    pix2.copy(pix1, (x0, y0, x1, y1))
    return pix2
# Walk every page of the PDF held in ``pdf_bytes``: collect its text in
# several formats, then save one cropped PNG per text block.
with fitz.open('pdf', pdf_bytes) as doc:
    print(doc.is_repaired)
    for page in doc:
        text = page.get_text("text", sort=True)
        page_text.append(text)
        # (x0, y0, x1, y1, "lines in the block", block_no, block_type)
        this_anns = page.get_text("blocks", sort=True)
        all_annotations[page.number] = []
        ans.append(this_anns)
        dict_t = page.get_text("dict", sort=True)
        dict_lists.append(dict_t)

        # The output directory depends only on the page, so build it once.
        page_dir = data_dir + "/pages/page-%i/" % page.number
        # The page image is the same for every block; render it lazily so it
        # is produced at most once per page, and never for pages without
        # text blocks.
        pix = None

        for ann in this_anns:
            line_result = {
                "path": path,
                "page_no": page.number,
                "language": '',
                "image": None,
                "image_upload_info": None,
                "caption": None,
                "bbox": None,
                "type": 'line',
                "question": "OCR: ",
            }
            # block_type -- 0:text, 1:image
            if ann[6] != 0:
                continue
            x0, y0, x1, y1 = ann[:4]
            text = ann[4]
            block_num = ann[5]
            this_ann = {
                "bbox": [x0 * zoom_x, y0 * zoom_y, x1 * zoom_x, y1 * zoom_y],
                "text": text.strip(),
                "block_num": block_num,
            }
            line_result["caption"] = text.strip()

            if pix is None:
                pix = page.get_pixmap()
            # Round outward to whole numbers so the crop covers the full block.
            bbox = (int(x0), int(y0), math.ceil(x1), math.ceil(y1))
            cropped_pix = crop(pix, bbox)
            os.makedirs(page_dir, exist_ok=True)
            cropped_pix.save(page_dir + str(block_num) + '.png')
PyMuPDF version
1.24.9
Operating system
Linux
Python version
3.10
Metadata
Metadata
Assignees
Labels
not a bug — not a bug / user error / unable to reproduce

