1
1
# script to generate grapheme cluster boundary data.
2
- # https://www.unicode.org/reports/tr41/
3
2
from enum import IntEnum
4
3
5
4
from FileGenerator import Regenerate
8
7
9
8
# Unicode Text Segmentation
10
9
# https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
10
+ # https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.html
11
11
class GraphemeBreakProperty(IntEnum):
    """Grapheme cluster break property values assigned to each code point.

    The integer value of a member is used both as a row index into
    ``graphemeClusterBoundary`` and as a bit position inside each row, so the
    values must stay small, unique and contiguous.

    NOTE(review): member order appears to be significant. The generator
    emits enum members only up to the largest value present in the break
    table and masks each boundary row to 16 bits, so members that follow
    ``ExtendConjunctLinker`` are presumably not expected in the final table.
    Confirm before renumbering.
    """

    Other = 0
    Control = 1
    Extend = 2
    RegionalIndicator = 3
    Prepend = 4
    HangulL = 5
    HangulV = 6
    HangulT = 7
    HangulLV = 8
    HangulLVT = 9
    ExtendedPictographic = 10
    ZeroWidthJoiner = 11
    # Indic_Conjunct_Break
    ConjunctLinker = 12
    LinkingConsonant = 13
    ExtendConjunctLinker = 14
    # merged property
    SpacingMark = 15
    CR = 16
    LF = 17
32
+
33
+ # https://www.unicode.org/reports/tr35/tr35-general.html#segmentations
34
+ # https://github.com/unicode-org/cldr/blob/main/common/segments/root.xml
35
+ # https://www.unicode.org/reports/tr51/#Emoji_Properties
28
36
GraphemeBreakPropertyMap = GraphemeBreakProperty .__members__ | {
29
37
'Regional_Indicator' : GraphemeBreakProperty .RegionalIndicator ,
30
38
'RI' : GraphemeBreakProperty .RegionalIndicator ,
@@ -36,28 +44,51 @@ class GraphemeBreakProperty(IntEnum):
36
44
'Extended_Pictographic' : GraphemeBreakProperty .ExtendedPictographic ,
37
45
'ExtPict' : GraphemeBreakProperty .ExtendedPictographic ,
38
46
'ZWJ' : GraphemeBreakProperty .ZeroWidthJoiner ,
47
+ 'Consonant' : GraphemeBreakProperty .LinkingConsonant ,
48
+ 'Virama' : GraphemeBreakProperty .ConjunctLinker ,
49
+ 'ExtendLinker' : GraphemeBreakProperty .ExtendConjunctLinker ,
39
50
}
40
51
41
- graphemeClusterBoundary = [0xffff ] * (max (GraphemeBreakProperty .__members__ .values ()) + 1 )
52
+ # https://www.unicode.org/reports/tr44/#Indic_Conjunct_Break
53
def updateIndicConjunctBreak(graphemeBreakTable):
    """Refine *graphemeBreakTable* in place using Indic syllabic categories.

    For every code point index:

    * an entry that is still ``Other`` is replaced by the value that
      ``GraphemeBreakPropertyMap`` holds for the code point's category name,
      or left unchanged when the name is not a key of that map;
    * an entry that is ``Extend`` becomes ``ExtendConjunctLinker`` when the
      category name is ``'Virama'``;
    * every other entry is left untouched.

    Args:
        graphemeBreakTable: mutable sequence of property values indexed by
            code point; it must have at least ``UnicodeCharacterCount``
            entries because it is indexed in lock-step with the category list.

    Returns:
        None. The table is modified in place.
    """
    # https://www.unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
    indicConjunctBreak = ['Other'] * UnicodeCharacterCount
    # The first element of the result (bound to `version` in the original) is not needed here.
    _, propertyList = readUnicodePropertyFile('IndicSyllabicCategory.txt')
    # Presumably fills indicConjunctBreak with one category name per code
    # point; the helper is defined elsewhere -- TODO confirm.
    flattenUnicodePropertyTable(indicConjunctBreak, propertyList)

    defaultValue = int(GraphemeBreakProperty.Other)
    extend = int(GraphemeBreakProperty.Extend)
    extendLinker = int(GraphemeBreakProperty.ExtendConjunctLinker)

    for index, conjunct in enumerate(indicConjunctBreak):
        grapheme = graphemeBreakTable[index]
        if grapheme == defaultValue:
            # Only unclassified code points take the mapped category value.
            grapheme = GraphemeBreakPropertyMap.get(conjunct, grapheme)
        elif grapheme == extend and conjunct == 'Virama':
            grapheme = extendLinker
        graphemeBreakTable[index] = grapheme
69
+
70
# One row per GraphemeBreakProperty value. Bit N of a row is looked up as the
# break opportunity before a character whose property value is N: a set bit
# selects the "allow break" symbol in testGraphemeBreak. Every bit starts set;
# the mask width is derived from the enum size (0x3ffff for 18 properties)
# so it cannot drift out of sync when members are added.
_propertyCount = max(GraphemeBreakProperty.__members__.values()) + 1
graphemeClusterBoundary = [(1 << _propertyCount) - 1] * _propertyCount
71
+ # https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
42
72
def buildGraphemeClusterBoundary ():
43
73
table = graphemeClusterBoundary
44
74
45
- # https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.html
46
75
notBreak = {
47
- 'Other' : ['Extend' , 'SpacingMark' , 'ZWJ' ],
76
+ 'Other' : ['Extend' , 'SpacingMark' , 'ZWJ' , 'ExtendLinker' ],
48
77
'CR' : ['LF' ],
49
- 'Extend' : ['Extend' , 'SpacingMark' , 'ZWJ' ],
50
- 'RI' : ['Extend' , 'RI' , 'SpacingMark' , 'ZWJ' ],
51
- 'Prepend' : ['Other' , 'Extend' , 'RI' , 'Prepend' , 'SpacingMark' , 'L' , 'V' , 'T' , 'LV' , 'LVT' , 'ExtPict' , 'ZWJ' ],
52
- 'SpacingMark' : ['Extend' , 'SpacingMark' , 'ZWJ' ],
53
- 'L' : ['Extend' , 'SpacingMark' , 'L' , 'V' , 'LV' , 'LVT' , 'ZWJ' ],
54
- 'V' : ['Extend' , 'SpacingMark' , 'V' , 'T' , 'ZWJ' ],
55
- 'T' : ['Extend' , 'SpacingMark' , 'T' , 'ZWJ' ],
56
- 'LV' : ['Extend' , 'SpacingMark' , 'V' , 'T' , 'ZWJ' ],
57
- 'LVT' : ['Extend' , 'SpacingMark' , 'T' , 'ZWJ' ],
58
- 'ExtPict' : ['Extend' , 'SpacingMark' , 'ZWJ' ],
59
- #'ZWJ': ['Extend', 'SpacingMark', 'ZWJ'],
60
- 'ZWJ' : ['Extend' , 'SpacingMark' , 'ExtPict' , 'ZWJ' ],
78
+ 'Extend' : ['Extend' , 'SpacingMark' , 'ZWJ' , 'ExtendLinker' ],
79
+ 'RI' : ['Extend' , 'SpacingMark' , 'ZWJ' , 'ExtendLinker' , 'RI' ],
80
+ 'Prepend' : ['Other' , 'Extend' , 'SpacingMark' , 'ZWJ' , 'ExtendLinker' , 'RI' , 'Prepend' , 'L' , 'V' , 'T' , 'LV' , 'LVT' , 'ExtPict' , 'ConjunctLinker' , 'Consonant' ],
81
+ 'SpacingMark' : ['Extend' , 'SpacingMark' , 'ZWJ' , 'ExtendLinker' ],
82
+ 'L' : ['Extend' , 'SpacingMark' , 'ZWJ' , 'ExtendLinker' , 'L' , 'V' , 'LV' , 'LVT' ],
83
+ 'V' : ['Extend' , 'SpacingMark' , 'ZWJ' , 'ExtendLinker' , 'V' , 'T' ],
84
+ 'T' : ['Extend' , 'SpacingMark' , 'ZWJ' , 'ExtendLinker' , 'T' ],
85
+ 'LV' : ['Extend' , 'SpacingMark' , 'ZWJ' , 'ExtendLinker' , 'V' , 'T' ],
86
+ 'LVT' : ['Extend' , 'SpacingMark' , 'ZWJ' , 'ExtendLinker' , 'T' ],
87
+ 'ExtPict' : ['Extend' , 'SpacingMark' , 'ZWJ' , 'ExtendLinker' ],
88
+ 'ZWJ' : ['Extend' , 'SpacingMark' , 'ZWJ' , 'ExtendLinker' , 'ExtPict' , 'Consonant' ],
89
+ 'ConjunctLinker' : ['Extend' , 'SpacingMark' , 'ZWJ' , 'ExtendLinker' , 'Consonant' ],
90
+ 'Consonant' : ['Extend' , 'SpacingMark' , 'ZWJ' , 'ExtendLinker' , 'ConjunctLinker' ],
91
+ 'ExtendLinker' : ['Extend' , 'SpacingMark' , 'ZWJ' , 'ExtendLinker' , 'Consonant' ],
61
92
}
62
93
63
94
for key , row in notBreak .items ():
@@ -93,10 +124,12 @@ def findLongestCharacterSequence(path):
93
124
def testGraphemeBreak (path , graphemeBreakTable ):
94
125
opportunity = '×÷'
95
126
allow = opportunity [1 ]
96
- totalCount = 0
97
- failCount = 0
127
+ total = 0
128
+ fail = [0 , 0 ]
129
+ ignore = 0
98
130
with open (path , encoding = 'utf-8' ) as fd :
99
131
lineno = 0
132
+ indent = ' ' * 4
100
133
for line in fd .readlines ():
101
134
lineno += 1
102
135
line = line .strip ()
@@ -111,14 +144,19 @@ def testGraphemeBreak(path, graphemeBreakTable):
111
144
ch = sequence [index ]
112
145
official = sequence [index + 1 ]
113
146
chNext = sequence [index + 2 ]
147
+ if ch == '000D' and chNext == '000A' :
148
+ ignore += 1
149
+ continue
114
150
prop = GraphemeBreakProperty (graphemeBreakTable [int (ch , 16 )])
115
151
propNext = GraphemeBreakProperty (graphemeBreakTable [int (chNext , 16 )])
116
- value = opportunity [(graphemeClusterBoundary [prop ] >> propNext ) & 1 ]
117
- totalCount += 1
152
+ result = (graphemeClusterBoundary [prop ] >> propNext ) & 1
153
+ value = opportunity [result ]
154
+ total += 1
118
155
if value != official :
119
- failCount += 1
156
+ fail [ result ^ 1 ] += 1
120
157
print (f'test fail on line { lineno } : { ch } { official } { chNext } => { prop .name } { value } { propNext .name } ' )
121
- print (f'{ path } total test: { totalCount } , failed test: { failCount } ' )
158
+ print (f'{ indent } { line } ' )
159
+ print (f'{ path } total test: { total } , failed: { opportunity [0 ]} { fail [0 ]} , { opportunity [1 ]} { fail [1 ]} , ignored: { ignore } ' )
122
160
123
161
def updateGraphemeBreakTable (headerFile , sourceFile ):
124
162
defaultValue = int (GraphemeBreakProperty .Other )
@@ -129,6 +167,7 @@ def updateGraphemeBreakTable(headerFile, sourceFile):
129
167
# https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
130
168
version , propertyList = readUnicodePropertyFile ('GraphemeBreakProperty.txt' )
131
169
updateUnicodePropertyTable (graphemeBreakTable , GraphemeBreakPropertyMap , propertyList )
170
+ updateIndicConjunctBreak (graphemeBreakTable )
132
171
133
172
tableSize = getMinTableSize (graphemeBreakTable , defaultValue )
134
173
print (f'Grapheme Break table size: { tableSize } , last value: { GraphemeBreakProperty (graphemeBreakTable [tableSize - 1 ]).name } ' )
@@ -150,8 +189,11 @@ def updateGraphemeBreakTable(headerFile, sourceFile):
150
189
151
190
output = []
152
191
output .append ('enum class GraphemeBreakProperty {' )
192
+ propNext = GraphemeBreakProperty (max (graphemeBreakTable ))
153
193
for prop in GraphemeBreakProperty .__members__ .values ():
154
194
output .append (f'\t { prop .name } = { prop .value } ,' )
195
+ if prop == propNext :
196
+ break
155
197
output .append ('\t Sentinel = Prepend,' )
156
198
output .append ('};' )
157
199
@@ -162,7 +204,8 @@ def updateGraphemeBreakTable(headerFile, sourceFile):
162
204
163
205
output .append ('' )
164
206
output .append ('constexpr uint16_t graphemeClusterBoundary[] = {' )
165
- output .extend (bitValue (value ) + ', // ' + GraphemeBreakProperty (index ).name for index , value in enumerate (graphemeClusterBoundary ))
207
+ table = graphemeClusterBoundary [:int (propNext ) + 1 ]
208
+ output .extend (bitValue (value & 0xffff ) + ', // ' + GraphemeBreakProperty (index ).name for index , value in enumerate (table ))
166
209
output .append ('};' )
167
210
168
211
output .append ("""
@@ -192,4 +235,5 @@ def updateGraphemeBreakTable(headerFile, sourceFile):
192
235
Regenerate (sourceFile , "//grapheme table" , table )
193
236
194
237
if __name__ == '__main__':
    # Script entry point: regenerate the grapheme break tables in the header
    # and source files given by the two relative paths below.
    # parseSegmentationChart('Grapheme Break Chart.html')
    updateGraphemeBreakTable('../src/CharClassify.h', '../src/CharClassify.cxx')
0 commit comments