fix(html): Parse rowspan and colspan when they include non-numerical values (#2048)

touma-I · dolfim-ibm · web-flow · commit ed56f2de5d58 · 2025-08-11T13:53:29.000+02:00
* use re to stop at first non-digit

Signed-off-by: Maroun Touma <touma@us.ibm.com>

* Allow a digit in first place followed by non-numerical characters

Signed-off-by: Maroun Touma <touma@us.ibm.com>

* refactor to match type checker

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Maroun Touma <touma@us.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
@@ -511,9 +511,17 @@ def _get_cell_spans(cell: Tag) -> tuple[int, int]:
             str(cell.get("colspan", "1")),
             str(cell.get("rowspan", "1")),
         )
+
+        def _extract_num(s: str) -> int:
+            if s and s[0].isnumeric():
+                match = re.search(r"\d+", s)
+                if match:
+                    return int(match.group())
+            return 1
+
         int_spans: tuple[int, int] = (
-            int(raw_spans[0]) if raw_spans[0].isnumeric() else 1,
-            int(raw_spans[1]) if raw_spans[0].isnumeric() else 1,
+            _extract_num(raw_spans[0]),
+            _extract_num(raw_spans[1]),
         )
 
         return int_spans