Skip to content

Commit 558890f

Browse files
Support Unicode 17
Add support for Unicode 17, including VS3 variation sequences for Mongolian quotation marks (https://www.unicode.org/L2/L2025/25028-vs3-sibe-quotation-marks.pdf).
1 parent 7a7fcdc commit 558890f

File tree

4 files changed

+203
-91
lines changed

4 files changed

+203
-91
lines changed

scripts/unicode.py

+26-22
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
from itertools import batched
4444
from typing import Callable, Iterable
4545

46-
UNICODE_VERSION = "16.0.0"
46+
UNICODE_VERSION = "17.0.0"
4747
"""The version of the Unicode data files to download."""
4848

4949
NUM_CODEPOINTS = 0x110000
@@ -178,7 +178,9 @@ class WidthState(enum.IntEnum):
178178
(if set, should also set 3rd and 4th)
179179
- 6th bit: if 4th is set but this one is not, then this is a ZWJ ligature state
180180
where no ZWJ has been encountered yet; encountering one flips this on
181-
- Seventh bit: is VS1 (if CJK) or is VS2 (not CJK)
181+
- Seventh bit:
182+
- CJK mode: is VS1 or VS3
183+
- Not CJK: is VS2
182184
"""
183185

184186
# BASIC WIDTHS
@@ -275,8 +277,8 @@ class WidthState(enum.IntEnum):
275277

276278
# VARIATION SELECTORS
277279

278-
VARIATION_SELECTOR_1_OR_2 = 0b0000_0010_0000_0000
279-
"\\uFE00 if CJK, or \\uFE01 otherwise"
280+
VARIATION_SELECTOR_1_2_OR_3 = 0b0000_0010_0000_0000
281+
"\\uFE00 or \\uFE02 if CJK, or \\uFE01 otherwise"
280282

281283
# Text presentation sequences (not CJK)
282284
VARIATION_SELECTOR_15 = 0b0100_0000_0000_0000
@@ -373,7 +375,7 @@ def width_alone(self) -> int:
373375
| WidthState.COMBINING_LONG_SOLIDUS_OVERLAY
374376
| WidthState.VARIATION_SELECTOR_15
375377
| WidthState.VARIATION_SELECTOR_16
376-
| WidthState.VARIATION_SELECTOR_1_OR_2
378+
| WidthState.VARIATION_SELECTOR_1_2_OR_3
377379
):
378380
return 0
379381
case (
@@ -657,11 +659,12 @@ def load_width_maps() -> tuple[list[WidthState], list[WidthState]]:
657659
ea[cp] = width
658660

659661
# East-Asian only
660-
ea[0xFE00] = WidthState.VARIATION_SELECTOR_1_OR_2
661662
ea[0x0338] = WidthState.COMBINING_LONG_SOLIDUS_OVERLAY
663+
ea[0xFE00] = WidthState.VARIATION_SELECTOR_1_2_OR_3
664+
ea[0xFE02] = WidthState.VARIATION_SELECTOR_1_2_OR_3
662665

663666
# Not East Asian only
664-
not_ea[0xFE01] = WidthState.VARIATION_SELECTOR_1_OR_2
667+
not_ea[0xFE01] = WidthState.VARIATION_SELECTOR_1_2_OR_3
665668
not_ea[0xFE0E] = WidthState.VARIATION_SELECTOR_15
666669

667670
return (not_ea, ea)
@@ -759,7 +762,7 @@ def load_solidus_transparent(
759762
num_chars = len(ccc_above_1)
760763

761764
for cp in ccc_above_1:
762-
if cp not in [0xFE00, 0xFE0F]:
765+
if cp not in [0xFE00, 0xFE02, 0xFE0F]:
763766
assert (
764767
cjk_width_map[cp].table_width() != CharWidthInTable.SPECIAL
765768
), f"U+{cp:X}"
@@ -1317,14 +1320,14 @@ def lookup_fns(
13171320

13181321
if is_cjk:
13191322
s += """
1320-
if c == '\\u{FE00}' {
1321-
return (0, next_info.set_vs1_2());
1323+
if matches!(c, '\\u{FE00}' | '\\u{FE02}') {
1324+
return (0, next_info.set_vs1_2_3());
13221325
}
13231326
"""
13241327
else:
13251328
s += """
13261329
if c == '\\u{FE01}' {
1327-
return (0, next_info.set_vs1_2());
1330+
return (0, next_info.set_vs1_2_3());
13281331
}
13291332
if c == '\\u{FE0E}' {
13301333
return (0, next_info.set_text_presentation());
@@ -1337,15 +1340,15 @@ def lookup_fns(
13371340
}
13381341
} else """
13391342

1340-
s += """if next_info.is_vs1_2() {
1343+
s += """if next_info.is_vs1_2_3() {
13411344
if matches!(c, '\\u{2018}' | '\\u{2019}' | '\\u{201C}' | '\\u{201D}') {
13421345
return ("""
13431346

13441347
s += str(2 - is_cjk)
13451348

13461349
s += """, WidthInfo::DEFAULT);
13471350
} else {
1348-
next_info = next_info.unset_vs1_2();
1351+
next_info = next_info.unset_vs1_2_3();
13491352
}
13501353
}
13511354
if next_info.is_ligature_transparent() {
@@ -1655,7 +1658,7 @@ def emit_module(
16551658
self.0
16561659
| WidthInfo::VARIATION_SELECTOR_16.0
16571660
& !WidthInfo::VARIATION_SELECTOR_15.0
1658-
& !WidthInfo::VARIATION_SELECTOR_1_OR_2.0,
1661+
& !WidthInfo::VARIATION_SELECTOR_1_2_OR_3.0,
16591662
)
16601663
}} else {{
16611664
Self::VARIATION_SELECTOR_16
@@ -1683,7 +1686,7 @@ def emit_module(
16831686
self.0
16841687
| WidthInfo::VARIATION_SELECTOR_15.0
16851688
& !WidthInfo::VARIATION_SELECTOR_16.0
1686-
& !WidthInfo::VARIATION_SELECTOR_1_OR_2.0,
1689+
& !WidthInfo::VARIATION_SELECTOR_1_2_OR_3.0,
16871690
)
16881691
}} else {{
16891692
Self(WidthInfo::VARIATION_SELECTOR_15.0)
@@ -1696,27 +1699,28 @@ def emit_module(
16961699
}}
16971700
16981701
/// Has 7th bit set
1699-
fn is_vs1_2(self) -> bool {{
1700-
(self.0 & WidthInfo::VARIATION_SELECTOR_1_OR_2.0) == WidthInfo::VARIATION_SELECTOR_1_OR_2.0
1702+
fn is_vs1_2_3(self) -> bool {{
1703+
(self.0 & WidthInfo::VARIATION_SELECTOR_1_2_OR_3.0)
1704+
== WidthInfo::VARIATION_SELECTOR_1_2_OR_3.0
17011705
}}
17021706
17031707
/// Set 7th bit
1704-
fn set_vs1_2(self) -> Self {{
1708+
fn set_vs1_2_3(self) -> Self {{
17051709
if (self.0 & LIGATURE_TRANSPARENT_MASK) == LIGATURE_TRANSPARENT_MASK {{
17061710
Self(
17071711
self.0
1708-
| WidthInfo::VARIATION_SELECTOR_1_OR_2.0
1712+
| WidthInfo::VARIATION_SELECTOR_1_2_OR_3.0
17091713
& !WidthInfo::VARIATION_SELECTOR_15.0
17101714
& !WidthInfo::VARIATION_SELECTOR_16.0,
17111715
)
17121716
}} else {{
1713-
Self(WidthInfo::VARIATION_SELECTOR_1_OR_2.0)
1717+
Self(WidthInfo::VARIATION_SELECTOR_1_2_OR_3.0)
17141718
}}
17151719
}}
17161720
17171721
/// Clear 7th bit
1718-
fn unset_vs1_2(self) -> Self {{
1719-
Self(self.0 & !WidthInfo::VARIATION_SELECTOR_1_OR_2.0)
1722+
fn unset_vs1_2_3(self) -> Self {{
1723+
Self(self.0 & !WidthInfo::VARIATION_SELECTOR_1_2_OR_3.0)
17201724
}}
17211725
}}
17221726

src/lib.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,8 @@
6262
//! - Outside of an East Asian context, [text presentation sequences] have width 1 if their base character:
6363
//! - Has the [`Emoji_Presentation`] property, and
6464
//! - Is not in the [Enclosed Ideographic Supplement] block.
65-
//! - [`'\u{2018}'`, `'\u{2019}'`, `'\u{201C}'`, and `'\u{201D}'`][General Punctuation] always have width 1 when followed by '\u{FE00}',
66-
//! and width 2 when followed by '\u{FE01}'.
65+
//! - [`'\u{2018}'`, `'\u{2019}'`, `'\u{201C}'`, and `'\u{201D}'`][General Punctuation] always have width 1
66+
//! when followed by '\u{FE00}' or '\u{FE02}', and width 2 when followed by '\u{FE01}'.
6767
//! - Script-specific ligatures:
6868
//! - For all the following ligatures, the insertion of any number of [default-ignorable][`Default_Ignorable_Code_Point`]
6969
//! [combining marks] anywhere in the sequence will not change the total width. In addition, for all non-Arabic

0 commit comments

Comments
 (0)