Skip to content

Commit 5e89363

Browse files
committed
pdf: Support multi-character glyphs when subsetting
For ligatures or complex shapings, multiple characters may map to a single glyph. In this case, we still want to output a single character code for the string using the font subset, but the `ToUnicode` map should give back all the characters.
1 parent 5309322 commit 5e89363

File tree

2 files changed

+45
-29
lines changed

2 files changed

+45
-29
lines changed

lib/matplotlib/backends/_backend_pdf_ps.py

Lines changed: 38 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -126,9 +126,10 @@ class CharacterTracker:
126126
If *subset_size* is not set, then there will only be one subset per font
127127
filename.
128128
glyph_map : dict
129-
A dictionary of font files to glyph maps. The glyph map is from (character code,
130-
glyph index)-pairs to (subset index, subset character code)-pairs. You probably
131-
will want to use the `.subset_to_unicode` method instead of this attribute.
129+
A dictionary of font files to glyph maps. The glyph map is from (character
130+
code(s), glyph index)-pairs to (subset index, subset character code)-pairs. You
131+
probably will want to use the `.subset_to_unicode` method instead of this
132+
attribute.
132133
"""
133134

134135
def __init__(self, subset_size: int = 0):
@@ -141,7 +142,7 @@ def __init__(self, subset_size: int = 0):
141142
"""
142143
self.used: dict[str, list[dict[CharacterCodeType, GlyphIndexType]]] = {}
143144
self.glyph_map: dict[str,
144-
dict[tuple[CharacterCodeType, GlyphIndexType],
145+
dict[tuple[tuple[CharacterCodeType, ...], GlyphIndexType],
145146
tuple[int, CharacterCodeType]]] = {}
146147
self.subset_size = subset_size
147148

@@ -170,18 +171,18 @@ def track(self, font: FT2Font, s: str) -> list[tuple[int, CharacterCodeType]]:
170171
for _c, _f in font._get_fontmap(s).items()
171172
]
172173

173-
def track_glyph(
174-
self, font: FT2Font, charcode: CharacterCodeType,
175-
glyph: GlyphIndexType) -> tuple[int, CharacterCodeType]:
174+
def track_glyph(self, font: FT2Font, chars: str | CharacterCodeType,
175+
glyph: GlyphIndexType) -> tuple[int, CharacterCodeType]:
176176
"""
177177
Record the character(s) *chars* at glyph index *glyph* as using font *font*.
178178
179179
Parameters
180180
----------
181181
font : FT2Font
182182
A font that is being used for the provided string.
183-
charcode : CharacterCodeType
184-
The character code to record.
183+
chars : str or CharacterCodeType
184+
The character(s) to record. This may be a single character code, or multiple
185+
characters in a string, if the glyph maps to several characters.
185186
glyph : GlyphIndexType
186187
The corresponding glyph index to record.
187188
@@ -194,25 +195,38 @@ def track_glyph(
194195
The character code within the above subset. If *subset_size* was not
195196
specified on this instance, then a single character code is returned unmodified.
196197
"""
198+
# Normalize for the key.
199+
if isinstance(chars, str):
200+
charcode = tuple(ord(c) for c in chars)
201+
elif not isinstance(chars, tuple):
202+
charcode = (chars, )
203+
else:
204+
charcode = chars
205+
197206
glyph_map = self.glyph_map.setdefault(font.fname, {})
198207
key = (charcode, glyph)
199208
if key in glyph_map:
200209
return glyph_map[key]
201210

202211
subset_maps = self.used.setdefault(font.fname, [{}])
203-
# Default to preserving the character code as it was.
204-
subset = 0
205-
subset_charcode = charcode
206212
use_next_charmap = False
207-
if self.subset_size != 0:
208-
# But start filling a new subset if outside the first block; this preserves
209-
# ASCII (for Type 3) or the Basic Multilingual Plane (for Type 42).
210-
if charcode >= self.subset_size:
211-
use_next_charmap = True
212-
# Or, use a new subset if the character code is already mapped for the first
213-
# block. This means it's using an alternate glyph.
214-
elif charcode in subset_maps[0]:
215-
use_next_charmap = True
213+
if len(charcode) > 1:
214+
# Multi-character glyphs always go in the non-0 subset.
215+
use_next_charmap = True
216+
else:
217+
# Default to preserving the character code as it was.
218+
subset = 0
219+
subset_charcode = charcode[0]
220+
if self.subset_size != 0:
221+
# But start filling a new subset if outside the first block; this
222+
# preserves ASCII (for Type 3) or the Basic Multilingual Plane
223+
# (for Type 42).
224+
if charcode[0] >= self.subset_size:
225+
use_next_charmap = True
226+
# Or, use a new subset if the character code is already mapped for the
227+
# first block. This means it's using an alternate glyph.
228+
elif charcode[0] in subset_maps[0]:
229+
use_next_charmap = True
216230
if use_next_charmap:
217231
if len(subset_maps) == 1 or len(subset_maps[-1]) == self.subset_size:
218232
subset_maps.append({})
@@ -223,7 +237,7 @@ def track_glyph(
223237
return (subset, subset_charcode)
224238

225239
def subset_to_unicode(self, fontname: str, index: int,
226-
charcode: CharacterCodeType) -> CharacterCodeType:
240+
charcode: CharacterCodeType) -> tuple[CharacterCodeType, ...]:
227241
"""
228242
Map a subset index and character code to the Unicode character code(s).
229243
@@ -238,8 +252,8 @@ def subset_to_unicode(self, fontname: str, index: int,
238252
239253
Returns
240254
-------
241-
CharacterCodeType
242-
The Unicode character code corresponding to the subsetted one.
255+
tuple of CharacterCodeType
256+
The Unicode character code(s) corresponding to the subsetted one.
243257
"""
244258
search = (index, charcode)
245259
for orig_info, subset_info in self.glyph_map[fontname].items():

lib/matplotlib/backends/backend_pdf.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -996,7 +996,8 @@ def _embedTeXFont(self, dvifont):
996996
# for that subset, and compute various properties based on the encoding.
997997
charmap = self._character_tracker.used[dvifont.fname][0]
998998
chars = {
999-
self._character_tracker.subset_to_unicode(dvifont.fname, 0, ccode)
999+
# DVI fonts always map 1-to-1.
1000+
self._character_tracker.subset_to_unicode(dvifont.fname, 0, ccode)[0]
10001001
for ccode in charmap
10011002
}
10021003
t1font = t1font.subset(chars, self._get_subset_prefix(charmap.values()))
@@ -1150,10 +1151,11 @@ def generate_unicode_cmap(subset_index, charmap):
11501151
last_ccode = ccode
11511152

11521153
def _to_unicode(ccode):
1153-
real_ccode = self._character_tracker.subset_to_unicode(
1154+
chars = self._character_tracker.subset_to_unicode(
11541155
filename, subset_index, ccode)
1155-
unicodestr = chr(real_ccode).encode('utf-16be').hex()
1156-
return f'<{unicodestr}>'
1156+
unicodestr = ''.join(chr(c) for c in chars)
1157+
hexstr = unicodestr.encode('utf-16be').hex()
1158+
return f'<{hexstr}>'
11571159

11581160
width = 2 if fonttype == 3 else 4
11591161
unicode_bfrange = []
@@ -2332,7 +2334,7 @@ def output_singlebyte_chunk(kerns_or_chars):
23322334
for item in _text_helpers.layout(s, font, kern_mode=Kerning.UNFITTED,
23332335
language=language):
23342336
subset, charcode = self.file._character_tracker.track_glyph(
2335-
item.ft_object, ord(item.char), item.glyph_index)
2337+
item.ft_object, item.char, item.glyph_index)
23362338
if (item.ft_object, subset) != prev_font:
23372339
if singlebyte_chunk:
23382340
output_singlebyte_chunk(singlebyte_chunk)

0 commit comments

Comments
 (0)