Skip to content

Commit a101730

Browse files
committed
pdf: Correct Unicode mapping for out-of-range font chunks
For Type 3 fonts, add a `ToUnicode` mapping (a feature introduced in PDF 1.2). For Type 42 fonts, correct the encoding of the Unicode values in the mapping, which should be UTF-16BE, not UCS-2.
1 parent 770ab8e commit a101730

File tree

2 files changed

+64
-31
lines changed

2 files changed

+64
-31
lines changed

lib/matplotlib/backends/_backend_pdf_ps.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,25 @@ def track_glyph(
205205
self.used.setdefault((font.fname, subset), {})[subset_charcode] = glyph
206206
return (subset, subset_charcode)
207207

208+
def subset_to_unicode(self, index: int,
209+
charcode: CharacterCodeType) -> CharacterCodeType:
210+
"""
211+
Map a subset index and character code to a Unicode character code.
212+
213+
Parameters
214+
----------
215+
index : int
216+
The subset index within a font.
217+
charcode : CharacterCodeType
218+
The character code within a subset to map back.
219+
220+
Returns
221+
-------
222+
CharacterCodeType
223+
The Unicode character code corresponding to the subsetted one.
224+
"""
225+
return index * self.subset_size + charcode
226+
208227

209228
class RendererPDFPSBase(RendererBase):
210229
# The following attributes must be defined by the subclasses:

lib/matplotlib/backends/backend_pdf.py

Lines changed: 45 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -950,7 +950,7 @@ def writeFonts(self):
950950
_log.debug('Writing TrueType font.')
951951
charmap = self._character_tracker.used.get((filename, subset))
952952
if charmap:
953-
fonts[Fx] = self.embedTTF(filename, charmap)
953+
fonts[Fx] = self.embedTTF(filename, subset, charmap)
954954
self.writeObject(self.fontObject, fonts)
955955

956956
def _write_afm_font(self, filename):
@@ -1118,7 +1118,7 @@ def createType1Descriptor(self, t1font, fontfile=None):
11181118
end
11191119
end"""
11201120

1121-
def embedTTF(self, filename, charmap):
1121+
def embedTTF(self, filename, subset_index, charmap):
11221122
"""Embed the TTF font from the named file into the document."""
11231123
font = get_font(filename)
11241124
fonttype = mpl.rcParams['pdf.fonttype']
@@ -1134,12 +1134,40 @@ def cvt(length, upe=font.units_per_EM, nearest=True):
11341134
else:
11351135
return math.ceil(value)
11361136

1137-
def embedTTFType3(font, charmap, descriptor):
1137+
def generate_unicode_cmap(subset_index, charmap):
1138+
# Make the ToUnicode CMap.
1139+
last_ccode = -2
1140+
unicode_groups = []
1141+
for ccode in sorted(charmap.keys()):
1142+
if ccode != last_ccode + 1:
1143+
unicode_groups.append([ccode, ccode])
1144+
else:
1145+
unicode_groups[-1][1] = ccode
1146+
last_ccode = ccode
1147+
1148+
width = 2 if fonttype == 3 else 4
1149+
unicode_bfrange = []
1150+
for start, end in unicode_groups:
1151+
real_start = self._character_tracker.subset_to_unicode(subset_index,
1152+
start)
1153+
real_end = self._character_tracker.subset_to_unicode(subset_index, end)
1154+
real_values = ' '.join('<%s>' % chr(x).encode('utf-16be').hex()
1155+
for x in range(real_start, real_end+1))
1156+
unicode_bfrange.append(
1157+
f'<{start:0{width}x}> <{end:0{width}x}> [{real_values}]')
1158+
unicode_cmap = (self._identityToUnicodeCMap %
1159+
(len(unicode_groups),
1160+
'\n'.join(unicode_bfrange).encode('ascii')))
1161+
1162+
return unicode_cmap
1163+
1164+
def embedTTFType3(font, subset_index, charmap, descriptor):
11381165
"""The Type 3-specific part of embedding a Truetype font"""
11391166
widthsObject = self.reserveObject('font widths')
11401167
fontdescObject = self.reserveObject('font descriptor')
11411168
fontdictObject = self.reserveObject('font dictionary')
11421169
charprocsObject = self.reserveObject('character procs')
1170+
toUnicodeMapObject = self.reserveObject('ToUnicode map')
11431171
differencesArray = []
11441172
firstchar, lastchar = min(charmap), max(charmap)
11451173
bbox = [cvt(x, nearest=False) for x in font.bbox]
@@ -1158,8 +1186,9 @@ def embedTTFType3(font, charmap, descriptor):
11581186
'Encoding': {
11591187
'Type': Name('Encoding'),
11601188
'Differences': differencesArray},
1161-
'Widths': widthsObject
1162-
}
1189+
'Widths': widthsObject,
1190+
'ToUnicode': toUnicodeMapObject,
1191+
}
11631192

11641193
# Make the "Widths" array
11651194
def get_char_width(charcode):
@@ -1192,15 +1221,18 @@ def get_char_width(charcode):
11921221
self.outputStream(charprocObject, stream)
11931222
charprocs[charname] = charprocObject
11941223

1224+
unicode_cmap = generate_unicode_cmap(subset_index, charmap)
1225+
11951226
# Write everything out
11961227
self.writeObject(fontdictObject, fontdict)
11971228
self.writeObject(fontdescObject, descriptor)
11981229
self.writeObject(widthsObject, widths)
11991230
self.writeObject(charprocsObject, charprocs)
1231+
self.outputStream(toUnicodeMapObject, unicode_cmap)
12001232

12011233
return fontdictObject
12021234

1203-
def embedTTFType42(font, charmap, descriptor):
1235+
def embedTTFType42(font, subset_index, charmap, descriptor):
12041236
"""The Type 42-specific part of embedding a Truetype font"""
12051237
fontdescObject = self.reserveObject('font descriptor')
12061238
cidFontDictObject = self.reserveObject('CID font dictionary')
@@ -1210,12 +1242,12 @@ def embedTTFType42(font, charmap, descriptor):
12101242
wObject = self.reserveObject('Type 0 widths')
12111243
toUnicodeMapObject = self.reserveObject('ToUnicode map')
12121244

1213-
_log.debug("SUBSET %s characters: %s", filename, charmap)
1245+
_log.debug("SUBSET %s:%d characters: %s", filename, subset_index, charmap)
12141246
with _backend_pdf_ps.get_glyphs_subset(filename,
12151247
charmap.values()) as subset:
12161248
fontdata = _backend_pdf_ps.font_as_file(subset)
12171249
_log.debug(
1218-
"SUBSET %s %d -> %d", filename,
1250+
"SUBSET %s:%d %d -> %d", filename, subset_index,
12191251
os.stat(filename).st_size, fontdata.getbuffer().nbytes
12201252
)
12211253

@@ -1252,55 +1284,37 @@ def embedTTFType42(font, charmap, descriptor):
12521284
fontfileObject, fontdata.getvalue(),
12531285
extra={'Length1': fontdata.getbuffer().nbytes})
12541286

1255-
# Make the 'W' (Widths) array, CidToGidMap and ToUnicode CMap
1256-
# at the same time
1287+
# Make the 'W' (Widths) array and CidToGidMap at the same time.
12571288
cid_to_gid_map = ['\0'] * 65536
12581289
widths = []
12591290
max_ccode = 0
12601291
for ccode, gind in charmap.items():
12611292
glyph = font.load_glyph(gind,
12621293
flags=LoadFlags.NO_SCALE | LoadFlags.NO_HINTING)
12631294
widths.append((ccode, cvt(glyph.horiAdvance)))
1264-
if ccode < 65536:
1265-
cid_to_gid_map[ccode] = chr(gind)
1295+
cid_to_gid_map[ccode] = chr(gind)
12661296
max_ccode = max(ccode, max_ccode)
12671297
widths.sort()
12681298
cid_to_gid_map = cid_to_gid_map[:max_ccode + 1]
12691299

12701300
last_ccode = -2
12711301
w = []
12721302
max_width = 0
1273-
unicode_groups = []
12741303
for ccode, width in widths:
12751304
if ccode != last_ccode + 1:
12761305
w.append(ccode)
12771306
w.append([width])
1278-
unicode_groups.append([ccode, ccode])
12791307
else:
12801308
w[-1].append(width)
1281-
unicode_groups[-1][1] = ccode
12821309
max_width = max(max_width, width)
12831310
last_ccode = ccode
12841311

1285-
unicode_bfrange = []
1286-
for start, end in unicode_groups:
1287-
# Ensure the CID map contains only chars from BMP
1288-
if start > 65535:
1289-
continue
1290-
end = min(65535, end)
1291-
1292-
unicode_bfrange.append(
1293-
b"<%04x> <%04x> [%s]" %
1294-
(start, end,
1295-
b" ".join(b"<%04x>" % x for x in range(start, end+1))))
1296-
unicode_cmap = (self._identityToUnicodeCMap %
1297-
(len(unicode_groups), b"\n".join(unicode_bfrange)))
1298-
12991312
# CIDToGIDMap stream
13001313
cid_to_gid_map = "".join(cid_to_gid_map).encode("utf-16be")
13011314
self.outputStream(cidToGidMapObject, cid_to_gid_map)
13021315

13031316
# ToUnicode CMap
1317+
unicode_cmap = generate_unicode_cmap(subset_index, charmap)
13041318
self.outputStream(toUnicodeMapObject, unicode_cmap)
13051319

13061320
descriptor['MaxWidth'] = max_width
@@ -1356,9 +1370,9 @@ def embedTTFType42(font, charmap, descriptor):
13561370
}
13571371

13581372
if fonttype == 3:
1359-
return embedTTFType3(font, charmap, descriptor)
1373+
return embedTTFType3(font, subset_index, charmap, descriptor)
13601374
elif fonttype == 42:
1361-
return embedTTFType42(font, charmap, descriptor)
1375+
return embedTTFType42(font, subset_index, charmap, descriptor)
13621376

13631377
def alphaState(self, alpha):
13641378
"""Return name of an ExtGState that sets alpha to the given value."""

0 commit comments

Comments
 (0)