Skip to content

Commit 5d79779

Browse files
committed
Address #4716
Words must not be allowed to start with the Unicode character ZERO WIDTH JOINER (U+200D).
1 parent 264747c commit 5d79779

File tree

4 files changed

+24
-1
lines changed

4 files changed

+24
-1
lines changed

src/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16607,6 +16607,10 @@ def extractWORDS(self, delimiters=None):
1660716607
and not mupdf.fz_is_infinite_rect(tp_rect)
1660816608
):
1660916609
continue
16610+
16611+
if buflen == 0 and ch.m_internal.c == 0x200d:
16612+
# ZERO WIDTH JOINER cannot start a word
16613+
continue
1661016614
word_delimiter = JM_is_word_delimiter(ch.m_internal.c, delimiters)
1661116615
this_char_rtl = JM_is_rtl_char(ch.m_internal.c)
1661216616
if word_delimiter or this_char_rtl != last_char_rtl:

src/extra.i

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3295,7 +3295,11 @@ PyObject* extractWORDS(mupdf::FzStextPage& this_tpage, PyObject *delimiters)
32953295
{
32963296
continue;
32973297
}
3298-
3298+
// prevent Unicode ZWJ 0x200d from starting a word
3299+
if (buflen == 0 && ch.m_internal->c == 0x200d)
3300+
{
3301+
continue;
3302+
}
32993303
int word_delimiter = JM_is_word_delimiter(ch.m_internal->c, delimiters);
33003304
int this_char_rtl = JM_is_rtl_char(ch.m_internal->c);
33013305
if (word_delimiter || this_char_rtl != last_char_rtl)

tests/resources/test_4716.pdf

309 KB
Binary file not shown.

tests/test_4716.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import pymupdf
2+
import os
3+
4+
def test_4716():
    """Confirm that ZERO WIDTH JOINER will never start a word.

    Collects the distinct word strings returned by ``get_text("words")``
    for every page of ``resources/test_4716.pdf`` (located next to this
    test module) and asserts that they equal the expected set.
    """
    script_dir = os.path.dirname(__file__)
    filename = os.path.join(script_dir, "resources", "test_4716.pdf")
    expected = {"+25.00", "Любимый", "-10.00"}
    # Use the document as a context manager so it is closed once extraction
    # is done, including when extraction raises.
    with pymupdf.open(filename) as doc:
        # Item 4 of each "words" entry is what gets compared as the word text.
        word_text = {w[4] for page in doc for w in page.get_text("words")}
    assert word_text == expected

0 commit comments

Comments
 (0)