Skip to content

Commit 5afa99e

Browse files
dolfim-ibm and ceberam authored
fix: referenced artifacts relative to the document location (#361)
* fix relative references

Signed-off-by: Michele Dolfi <[email protected]>

* Apply suggestions from code review

Co-authored-by: Cesar Berrospi Ramis <[email protected]>
Signed-off-by: Michele Dolfi <[email protected]>

---------

Signed-off-by: Michele Dolfi <[email protected]>
Signed-off-by: Michele Dolfi <[email protected]>
Co-authored-by: Cesar Berrospi Ramis <[email protected]>
1 parent 6be0c8b commit 5afa99e

File tree

2 files changed

+28
-25
lines changed

2 files changed

+28
-25
lines changed

docling_core/types/doc/document.py

Lines changed: 14 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1373,11 +1373,12 @@ def _image_to_base64(self, pil_image, format="PNG"):
13731373
) # Encode to Base64 and decode to string
13741374
return img_base64
13751375

1376-
def _image_to_hexhash(self) -> Optional[str]:
1376+
@staticmethod
1377+
def _image_to_hexhash(img: Optional[PILImage.Image]) -> Optional[str]:
13771378
"""Hexash from the image."""
1378-
if self.image is not None and self.image._pil is not None:
1379+
if img is not None:
13791380
# Convert the image to raw bytes
1380-
image_bytes = self.image._pil.tobytes()
1381+
image_bytes = img.tobytes()
13811382

13821383
# Create a hash object (e.g., SHA-256)
13831384
hasher = hashlib.sha256(usedforsecurity=False)
@@ -4116,16 +4117,10 @@ def _with_pictures_refs(
41164117
if image_dir.is_dir():
41174118
for item, level in result.iterate_items(page_no=page_no, with_groups=False):
41184119
if isinstance(item, PictureItem):
4120+
img = item.get_image(doc=self)
4121+
if img is not None:
41194122

4120-
if (
4121-
item.image is not None
4122-
and isinstance(item.image.uri, AnyUrl)
4123-
and item.image.uri.scheme == "data"
4124-
and item.image.pil_image is not None
4125-
):
4126-
img = item.image.pil_image
4127-
4128-
hexhash = item._image_to_hexhash()
4123+
hexhash = PictureItem._image_to_hexhash(img)
41294124

41304125
# loc_path = image_dir / f"image_{img_count:06}.png"
41314126
if hexhash is not None:
@@ -4140,6 +4135,11 @@ def _with_pictures_refs(
41404135
else:
41414136
obj_path = loc_path
41424137

4138+
if item.image is None:
4139+
scale = img.size[0] / item.prov[0].bbox.width
4140+
item.image = ImageRef.from_pil(
4141+
image=img, dpi=round(72 * scale)
4142+
)
41434143
item.image.uri = Path(obj_path)
41444144

41454145
# if item.image._pil is not None:
@@ -4539,6 +4539,8 @@ def _get_output_paths(
45394539
reference_path = None
45404540
else:
45414541
reference_path = filename.parent
4542+
artifacts_dir = reference_path / artifacts_dir
4543+
45424544
return artifacts_dir, reference_path
45434545

45444546
def _make_copy_with_refmode(

test/test_docling_doc.py

Lines changed: 14 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1442,10 +1442,11 @@ def test_save_to_disk():
14421442

14431443
doc: DoclingDocument = _construct_doc()
14441444

1445-
image_dir = Path("./test/data/doc/constructed_images/")
1445+
test_dir = Path("./test/data/doc")
1446+
image_dir = Path("constructed_images/") # will be relative to test_dir
14461447

14471448
doc_with_references = doc._with_pictures_refs(
1448-
image_dir=image_dir, # Path("./test/data/constructed_images/")
1449+
image_dir=(test_dir / image_dir),
14491450
page_no=None,
14501451
)
14511452

@@ -1455,53 +1456,53 @@ def test_save_to_disk():
14551456

14561457
### MarkDown
14571458

1458-
filename = Path("test/data/doc/constructed_doc.placeholder.md")
1459+
filename = test_dir / "constructed_doc.placeholder.md"
14591460
doc.save_as_markdown(
14601461
filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.PLACEHOLDER
14611462
)
14621463
_verify_saved_output(filename=filename, paths=paths)
14631464

1464-
filename = Path("test/data/doc/constructed_doc.embedded.md")
1465+
filename = test_dir / "constructed_doc.embedded.md"
14651466
doc.save_as_markdown(
14661467
filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.EMBEDDED
14671468
)
14681469
_verify_saved_output(filename=filename, paths=paths)
14691470

1470-
filename = Path("test/data/doc/constructed_doc.referenced.md")
1471+
filename = test_dir / "constructed_doc.referenced.md"
14711472
doc.save_as_markdown(
14721473
filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.REFERENCED
14731474
)
14741475
_verify_saved_output(filename=filename, paths=paths)
14751476

14761477
### HTML
14771478

1478-
filename = Path("test/data/doc/constructed_doc.placeholder.html")
1479+
filename = test_dir / "constructed_doc.placeholder.html"
14791480
doc.save_as_html(
14801481
filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.PLACEHOLDER
14811482
)
14821483
_verify_saved_output(filename=filename, paths=paths)
14831484

1484-
filename = Path("test/data/doc/constructed_doc.embedded.html")
1485+
filename = test_dir / "constructed_doc.embedded.html"
14851486
doc.save_as_html(
14861487
filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.EMBEDDED
14871488
)
14881489
_verify_saved_output(filename=filename, paths=paths)
14891490

1490-
filename = Path("test/data/doc/constructed_doc.referenced.html")
1491+
filename = test_dir / "constructed_doc.referenced.html"
14911492
doc.save_as_html(
14921493
filename=filename, artifacts_dir=image_dir, image_mode=ImageRefMode.REFERENCED
14931494
)
14941495
_verify_saved_output(filename=filename, paths=paths)
14951496

14961497
### Document Tokens
14971498

1498-
filename = Path("test/data/doc/constructed_doc.dt")
1499+
filename = test_dir / "constructed_doc.dt"
14991500
doc.save_as_doctags(filename=filename)
15001501
_verify_saved_output(filename=filename, paths=paths)
15011502

15021503
### JSON
15031504

1504-
filename = Path("test/data/doc/constructed_doc.embedded.json")
1505+
filename = test_dir / "constructed_doc.embedded.json"
15051506
doc.save_as_json(
15061507
filename=filename,
15071508
artifacts_dir=image_dir,
@@ -1512,7 +1513,7 @@ def test_save_to_disk():
15121513
doc_emb_loaded = DoclingDocument.load_from_json(filename)
15131514
_verify_loaded_output(filename=filename, pred=doc_emb_loaded)
15141515

1515-
filename = Path("test/data/doc/constructed_doc.referenced.json")
1516+
filename = test_dir / "constructed_doc.referenced.json"
15161517
doc.save_as_json(
15171518
filename=filename,
15181519
artifacts_dir=image_dir,
@@ -1525,15 +1526,15 @@ def test_save_to_disk():
15251526

15261527
### YAML
15271528

1528-
filename = Path("test/data/doc/constructed_doc.embedded.yaml")
1529+
filename = test_dir / "constructed_doc.embedded.yaml"
15291530
doc.save_as_yaml(
15301531
filename=filename,
15311532
artifacts_dir=image_dir,
15321533
image_mode=ImageRefMode.EMBEDDED,
15331534
)
15341535
_verify_saved_output(filename=filename, paths=paths)
15351536

1536-
filename = Path("test/data/doc/constructed_doc.referenced.yaml")
1537+
filename = test_dir / "constructed_doc.referenced.yaml"
15371538
doc.save_as_yaml(
15381539
filename=filename,
15391540
artifacts_dir=image_dir,

0 commit comments

Comments
 (0)