
Commit 4b8a188

Update Unicode data to 16.0, improve grapheme cluster boundary for Indic
scripts, see https://sourceforge.net/p/scintilla/feature-requests/1417/
1 parent 6e1dd89 commit 4b8a188

13 files changed: +976 -746 lines

scintilla/lexlib/CharacterCategory.cxx (+292 -276)

Large diffs are not rendered by default.

scintilla/lexlib/StyleContext.cxx (+6)

@@ -44,6 +44,9 @@ bool StyleContext::MatchIgnoreCase(const char *s) const noexcept {
 		return false;
 	}
 	s++;
+	if (!*s) {
+		return true;
+	}
 	if (MakeLowerCase(chNext) != static_cast<unsigned char>(*s)) {
 		return false;
 	}
@@ -61,6 +64,9 @@ bool StyleContext::MatchLowerCase(const char *s) const noexcept {
 		return false;
 	}
 	s++;
+	if (!*s) {
+		return true;
+	}
 	if (UnsafeLower(chNext) != static_cast<unsigned char>(*s)) {
 		return false;
 	}
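
The added check returns as soon as the pattern string is exhausted, so a one-character pattern now matches on the current character alone instead of being compared against the pattern's terminating NUL. A minimal sketch of the fixed control flow, using a hypothetical standalone helper rather than the real StyleContext members (text[0] and text[1] stand in for ch and chNext):

#include <cctype>

// Hypothetical helper (not the Scintilla API) mirroring MatchLowerCase after the fix.
bool MatchLowerCaseSketch(const char *pattern, const char *text) {
	if (std::tolower(static_cast<unsigned char>(text[0])) != static_cast<unsigned char>(*pattern)) {
		return false;
	}
	pattern++;
	if (!*pattern) {
		return true;	// one-character pattern: matched on text[0] alone
	}
	if (std::tolower(static_cast<unsigned char>(text[1])) != static_cast<unsigned char>(*pattern)) {
		return false;
	}
	pattern++;
	for (const char *t = text + 2; *pattern; ++pattern, ++t) {	// remaining characters
		if (std::tolower(static_cast<unsigned char>(*t)) != static_cast<unsigned char>(*pattern)) {
			return false;
		}
	}
	return true;
}

With the early return, MatchLowerCaseSketch("f", "foo") is true; without it, the second comparison would test 'o' against the pattern's NUL terminator and wrongly reject the match.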

scintilla/scripts/GenerateGraphemeBreak.py (+82 -38)

@@ -1,5 +1,4 @@
 # script to generate grapheme cluster boundary data.
-# https://www.unicode.org/reports/tr41/
 from enum import IntEnum
 
 from FileGenerator import Regenerate
@@ -8,23 +7,32 @@
 
 # Unicode Text Segmentation
 # https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
+# https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.html
 class GraphemeBreakProperty(IntEnum):
 	Other = 0
-	CR = 1
-	LF = 2
-	Control = 3
-	Extend = 4
-	RegionalIndicator = 5
-	Prepend = 6
-	SpacingMark = 7
-	HangulL = 8
-	HangulV = 9
-	HangulT = 10
-	HangulLV = 11
-	HangulLVT = 12
-	ExtendedPictographic = 13
-	ZeroWidthJoiner = 14
-
+	Control = 1
+	Extend = 2
+	RegionalIndicator = 3
+	Prepend = 4
+	HangulL = 5
+	HangulV = 6
+	HangulT = 7
+	HangulLV = 8
+	HangulLVT = 9
+	ExtendedPictographic = 10
+	ZeroWidthJoiner = 11
+	# Indic_Conjunct_Break
+	ConjunctLinker = 12
+	LinkingConsonant = 13
+	ExtendConjunctLinker = 14
+	# merged property
+	SpacingMark = 15
+	CR = 16
+	LF = 17
+
+# https://www.unicode.org/reports/tr35/tr35-general.html#segmentations
+# https://github.com/unicode-org/cldr/blob/main/common/segments/root.xml
+# https://www.unicode.org/reports/tr51/#Emoji_Properties
 GraphemeBreakPropertyMap = GraphemeBreakProperty.__members__ | {
 	'Regional_Indicator': GraphemeBreakProperty.RegionalIndicator,
 	'RI': GraphemeBreakProperty.RegionalIndicator,
@@ -36,28 +44,51 @@ class GraphemeBreakProperty(IntEnum):
 	'Extended_Pictographic': GraphemeBreakProperty.ExtendedPictographic,
 	'ExtPict': GraphemeBreakProperty.ExtendedPictographic,
 	'ZWJ': GraphemeBreakProperty.ZeroWidthJoiner,
+	'Consonant': GraphemeBreakProperty.LinkingConsonant,
+	'Virama': GraphemeBreakProperty.ConjunctLinker,
+	'ExtendLinker': GraphemeBreakProperty.ExtendConjunctLinker,
 }
 
-graphemeClusterBoundary = [0xffff] * (max(GraphemeBreakProperty.__members__.values()) + 1)
+# https://www.unicode.org/reports/tr44/#Indic_Conjunct_Break
+def updateIndicConjunctBreak(graphemeBreakTable):
+	# https://www.unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
+	indicConjunctBreak = ['Other'] * UnicodeCharacterCount
+	version, propertyList = readUnicodePropertyFile('IndicSyllabicCategory.txt')
+	flattenUnicodePropertyTable(indicConjunctBreak, propertyList)
+	defaultValue = int(GraphemeBreakProperty.Other)
+	extend = int(GraphemeBreakProperty.Extend)
+	extendLinker = int(GraphemeBreakProperty.ExtendConjunctLinker)
+	for index, conjunct in enumerate(indicConjunctBreak):
+		grapheme = graphemeBreakTable[index]
+		if grapheme == defaultValue:
+			grapheme = GraphemeBreakPropertyMap.get(conjunct, grapheme)
+		elif grapheme == extend:
+			if conjunct == 'Virama':
+				grapheme = extendLinker
+		graphemeBreakTable[index] = grapheme
+
+graphemeClusterBoundary = [0x3ffff] * (max(GraphemeBreakProperty.__members__.values()) + 1)
+# https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
 def buildGraphemeClusterBoundary():
 	table = graphemeClusterBoundary
 
-	# https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.html
 	notBreak = {
-		'Other': ['Extend', 'SpacingMark', 'ZWJ'],
+		'Other': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker'],
 		'CR': ['LF'],
-		'Extend': ['Extend', 'SpacingMark', 'ZWJ'],
-		'RI': ['Extend', 'RI', 'SpacingMark', 'ZWJ'],
-		'Prepend': ['Other', 'Extend', 'RI', 'Prepend', 'SpacingMark', 'L', 'V', 'T', 'LV', 'LVT', 'ExtPict', 'ZWJ'],
-		'SpacingMark': ['Extend', 'SpacingMark', 'ZWJ'],
-		'L': ['Extend', 'SpacingMark', 'L', 'V', 'LV', 'LVT', 'ZWJ'],
-		'V': ['Extend', 'SpacingMark', 'V', 'T', 'ZWJ'],
-		'T': ['Extend', 'SpacingMark', 'T', 'ZWJ'],
-		'LV': ['Extend', 'SpacingMark', 'V', 'T', 'ZWJ'],
-		'LVT': ['Extend', 'SpacingMark', 'T', 'ZWJ'],
-		'ExtPict': ['Extend', 'SpacingMark', 'ZWJ'],
-		#'ZWJ': ['Extend', 'SpacingMark', 'ZWJ'],
-		'ZWJ': ['Extend', 'SpacingMark', 'ExtPict', 'ZWJ'],
+		'Extend': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker'],
+		'RI': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'RI'],
+		'Prepend': ['Other', 'Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'RI', 'Prepend', 'L', 'V', 'T', 'LV', 'LVT', 'ExtPict', 'ConjunctLinker', 'Consonant'],
+		'SpacingMark': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker'],
+		'L': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'L', 'V', 'LV', 'LVT'],
+		'V': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'V', 'T'],
+		'T': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'T'],
+		'LV': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'V', 'T'],
+		'LVT': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'T'],
+		'ExtPict': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker'],
+		'ZWJ': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'ExtPict', 'Consonant'],
+		'ConjunctLinker': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'Consonant'],
+		'Consonant': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'ConjunctLinker'],
+		'ExtendLinker': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'Consonant'],
 	}
 
 	for key, row in notBreak.items():
@@ -93,10 +124,12 @@ def findLongestCharacterSequence(path):
 def testGraphemeBreak(path, graphemeBreakTable):
 	opportunity = '×÷'
 	allow = opportunity[1]
-	totalCount = 0
-	failCount = 0
+	total = 0
+	fail = [0, 0]
+	ignore = 0
 	with open(path, encoding='utf-8') as fd:
 		lineno = 0
+		indent = ' '*4
 		for line in fd.readlines():
 			lineno += 1
 			line = line.strip()
@@ -111,14 +144,19 @@ def testGraphemeBreak(path, graphemeBreakTable):
 				ch = sequence[index]
 				official = sequence[index + 1]
 				chNext = sequence[index + 2]
+				if ch == '000D' and chNext == '000A':
+					ignore += 1
+					continue
 				prop = GraphemeBreakProperty(graphemeBreakTable[int(ch, 16)])
 				propNext = GraphemeBreakProperty(graphemeBreakTable[int(chNext, 16)])
-				value = opportunity[(graphemeClusterBoundary[prop] >> propNext) & 1]
-				totalCount += 1
+				result = (graphemeClusterBoundary[prop] >> propNext) & 1
+				value = opportunity[result]
+				total += 1
 				if value != official:
-					failCount += 1
+					fail[result ^ 1] += 1
 					print(f'test fail on line {lineno}: {ch} {official} {chNext} => {prop.name} {value} {propNext.name}')
-	print(f'{path} total test: {totalCount}, failed test: {failCount}')
+					print(f'{indent}{line}')
+	print(f'{path} total test: {total}, failed: {opportunity[0]} {fail[0]}, {opportunity[1]} {fail[1]}, ignored: {ignore}')
 
 def updateGraphemeBreakTable(headerFile, sourceFile):
 	defaultValue = int(GraphemeBreakProperty.Other)
@@ -129,6 +167,7 @@ def updateGraphemeBreakTable(headerFile, sourceFile):
 	# https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
 	version, propertyList = readUnicodePropertyFile('GraphemeBreakProperty.txt')
 	updateUnicodePropertyTable(graphemeBreakTable, GraphemeBreakPropertyMap, propertyList)
+	updateIndicConjunctBreak(graphemeBreakTable)
 
 	tableSize = getMinTableSize(graphemeBreakTable, defaultValue)
 	print(f'Grapheme Break table size: {tableSize}, last value: {GraphemeBreakProperty(graphemeBreakTable[tableSize - 1]).name}')
@@ -150,8 +189,11 @@ def updateGraphemeBreakTable(headerFile, sourceFile):
 
 	output = []
 	output.append('enum class GraphemeBreakProperty {')
+	propNext = GraphemeBreakProperty(max(graphemeBreakTable))
 	for prop in GraphemeBreakProperty.__members__.values():
 		output.append(f'\t{prop.name} = {prop.value},')
+		if prop == propNext:
+			break
 	output.append('\tSentinel = Prepend,')
 	output.append('};')
 
@@ -162,7 +204,8 @@ def updateGraphemeBreakTable(headerFile, sourceFile):
 
 	output.append('')
 	output.append('constexpr uint16_t graphemeClusterBoundary[] = {')
-	output.extend(bitValue(value) + ', // ' + GraphemeBreakProperty(index).name for index, value in enumerate(graphemeClusterBoundary))
+	table = graphemeClusterBoundary[:int(propNext) + 1]
+	output.extend(bitValue(value & 0xffff) + ', // ' + GraphemeBreakProperty(index).name for index, value in enumerate(table))
 	output.append('};')
 
 	output.append("""
@@ -192,4 +235,5 @@ def updateGraphemeBreakTable(headerFile, sourceFile):
 	Regenerate(sourceFile, "//grapheme table", table)
 
 if __name__ == '__main__':
+	# parseSegmentationChart('Grapheme Break Chart.html')
 	updateGraphemeBreakTable('../src/CharClassify.h', '../src/CharClassify.cxx')
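
buildGraphemeClusterBoundary packs the UAX #29 no-break pairs into one bitmask per property of the preceding character, which is why testGraphemeBreak can decide each GraphemeBreakTest case with a single shift and mask: (graphemeClusterBoundary[prop] >> propNext) & 1. A small C++ sketch of that lookup in the shape of the generated table; the enum subset and mask values below are illustrative, not the Unicode 16.0 data emitted into CharClassify.cxx:

#include <cstdint>

// Illustrative subset of the property enum.
enum class GraphemeBreakProperty { Other = 0, Control = 1, Extend = 2 };

// One mask per property of the previous character; bit n set means a grapheme
// cluster boundary is allowed before a character whose property value is n.
constexpr uint16_t boundarySketch[] = {
	0xfffb,	// Other: Extend bit cleared, so base + combining mark stay in one cluster
	0xffff,	// Control: always break after a control character
	0xfffb,	// Extend: further combining marks keep extending the cluster
};

constexpr bool IsBreakAllowed(GraphemeBreakProperty prev, GraphemeBreakProperty next) noexcept {
	return (boundarySketch[static_cast<int>(prev)] >> static_cast<int>(next)) & 1;
}

static_assert(!IsBreakAllowed(GraphemeBreakProperty::Other, GraphemeBreakProperty::Extend));
static_assert(IsBreakAllowed(GraphemeBreakProperty::Control, GraphemeBreakProperty::Other));

The Python-side mask widened from 0xffff to 0x3ffff because CR and LF now sit at bits 16 and 17 in the reordered enum, while the emitted C++ table is still truncated to the low 16 bits (value & 0xffff) and cut off at the highest property that actually occurs in the character data, so pairs involving the top bits would have to be handled outside the emitted masks.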

scintilla/scripts/GenerateLineBreak.py (+2 -2)

@@ -1,6 +1,4 @@
 # script to generate line breaking data.
-# https://www.unicode.org/reports/tr41/
-# https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/LineBreakTest.html
 
 import platform
 import unicodedata
@@ -13,6 +11,7 @@
 
 # Unicode Line Breaking Algorithm
 # https://www.unicode.org/reports/tr14/
+# https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/LineBreakTest.html
 class LineBreak(IntFlag):
 	NonBreak = 0
 	BreakBefore = 1 # B
@@ -86,6 +85,7 @@ class LineBreak(IntFlag):
 def updateUnicodeLineBreak(filename):
 	lineBreakTable = ['XX'] * UnicodeCharacterCount # @missing
 	# https://www.unicode.org/Public/UCD/latest/ucd/LineBreak.txt
+	# https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedLineBreak.txt
 	version, propertyList = readUnicodePropertyFile('LineBreak.txt')
 	flattenUnicodePropertyTable(lineBreakTable, propertyList)
 
scintilla/scripts/UnicodeData.py (+105)

@@ -2,6 +2,15 @@
 import os.path
 import unicodedata
 
+# Common References for Unicode Standard Annexes
+# https://www.unicode.org/reports/tr41/
+# Unicode Character Database
+# https://www.unicode.org/reports/tr44/
+# Unicode Locale Data Markup Language (LDML)
+# https://www.unicode.org/reports/tr35/
+# Unicode Character Database in XML
+# https://www.unicode.org/reports/tr42/
+
 MaxCharacter = sys.maxunicode
 UnicodeCharacterCount = sys.maxunicode + 1
 BMPCharacterCharacterCount = 0xffff + 1
@@ -88,3 +97,99 @@ def flattenPropertyMap(propertyMap):
 		for value in items:
 			result[value] = key
 	return result
+
+def parseSegmentationChart(path, opportunity=0):
+	from bs4 import BeautifulSoup
+
+	print('parse:', path)
+	with open(path, encoding='utf-8', newline='\n') as fd:
+		doc = fd.read()
+	soup = BeautifulSoup(doc, 'html5lib')
+	title = soup.find('title').get_text()
+	node = soup.find('a', {'name': 'table'})
+	node = node.find_next('table').find('tbody')
+	table = []
+	row_map = {}
+	same_row = {}
+	column_header = []
+	for row in node.find_all('tr'):
+		items = []
+		for column in row.find_all('th'):
+			items.append(column.get_text().strip())
+		if not items:
+			continue
+		if not column_header:
+			assert len(items) > 1 and len(items[0]) == 0
+			column_header = items[1:]
+		else:
+			assert len(items) == len(column_header) + 1
+			header = items[0]
+			items = items[1:]
+			if header in row_map:
+				assert items == row_map[header]
+			else:
+				for name, value in row_map.items():
+					if items == value:
+						same_row[header] = name
+						# print('same row:', name, header)
+						break
+				table.append(items)
+				row_map[header] = items
+
+	row_header = list(row_map.keys())
+	row_count = len(row_header)
+	print(f'{title} table row: {row_count}, column: {len(column_header)}')
+	column_map = {}
+	same_column = {}
+	for index, header in enumerate(column_header):
+		items = [None] * row_count
+		for i in range(row_count):
+			items[i] = table[i][index]
+		if header in column_map:
+			assert items == column_map[header]
+		else:
+			for name, value in column_map.items():
+				if items == value:
+					same_column[header] = name
+					# print('same column:', name, header)
+					break
+			column_map[header] = items
+
+	# find property with same row and same column
+	same_map = {}
+	both = (same_row.keys() | same_row.values()) & (same_column.keys() | same_column.values())
+	if len(both) > 1:
+		count = len(both)
+		both = sorted(both, key=row_header.index)
+		index = 0
+		while index < count:
+			header = both[index]
+			index += 1
+			row = same_row.get(header, header)
+			column = same_column.get(header, header)
+			j = index
+			while j < count:
+				name = both[j]
+				j += 1
+				r = same_row.get(name, name)
+				c = same_column.get(name, name)
+				if row == r and column == c:
+					print('same row & column:', header, name)
+					same_map[name] = header
+					j -= 1
+					count -= 1
+					del both[j]
+
+	opportunity = '×÷'[opportunity]
+	for header, items in row_map.items():
+		if header in same_map:
+			continue
+		row = []
+		for index, symbol in enumerate(items):
+			if symbol == opportunity:
+				value = row_header[index]
+				value = same_map.get(value, value)
+				if value not in row:
+					row.append(value)
+		if row:
+			print(f'{opportunity} {header}:', ', '.join(row))

scintilla/src/CaseConvert.cxx (+6)

@@ -84,6 +84,7 @@ constexpr int symmetricCaseConversionRanges[] = {
 66979,66940,15,1,
 66995,66956,7,1,
 68800,68736,51,1,
+68976,68944,22,1,
 71872,71840,32,1,
 93792,93760,32,1,
 125218,125184,34,1,
@@ -114,6 +115,7 @@ constexpr int symmetricCaseConversions[] = {
 405,502,
 409,408,
 410,573,
+411,42972,
 414,544,
 417,416,
 419,418,
@@ -149,6 +151,7 @@ constexpr int symmetricCaseConversions[] = {
 608,403,
 609,42924,
 611,404,
+612,42955,
 613,42893,
 614,42922,
 616,407,
@@ -195,6 +198,7 @@ constexpr int symmetricCaseConversions[] = {
 4349,7357,
 4350,7358,
 4351,7359,
+7306,7305,
 7545,42877,
 7549,11363,
 7566,42950,
@@ -246,9 +250,11 @@ constexpr int symmetricCaseConversions[] = {
 42900,42948,
 42952,42951,
 42954,42953,
+42957,42956,
 42961,42960,
 42967,42966,
 42969,42968,
+42971,42970,
 42998,42997,
 43859,42931,
 67003,66964,
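
The new entries extend the case-conversion data for characters added in Unicode 16.0. Each symmetricCaseConversions entry appears to pair two code points that are each other's case counterpart (for example 411/U+019B with 42972/U+A7DC), and the four-value symmetricCaseConversionRanges entry 68976,68944,22,1 looks like a run of 22 such pairs with stride 1. A hedged sketch of a two-way lookup over data shaped like the pair list; the table below holds only three pairs taken from this commit, and the function name is hypothetical:

#include <cstddef>

// Three of the pairs added by this commit (assumed to be case counterparts).
constexpr int sampleSymmetricPairs[] = {
	411, 42972,	// U+019B <-> U+A7DC
	612, 42955,	// U+0264 <-> U+A7CB
	7306, 7305,	// U+1C8A <-> U+1C89
};

// Map a code point to its counterpart in either direction; return it unchanged
// when no pair matches.
constexpr int ConvertCaseSketch(int character) noexcept {
	for (size_t i = 0; i < sizeof(sampleSymmetricPairs) / sizeof(sampleSymmetricPairs[0]); i += 2) {
		if (sampleSymmetricPairs[i] == character) {
			return sampleSymmetricPairs[i + 1];
		}
		if (sampleSymmetricPairs[i + 1] == character) {
			return sampleSymmetricPairs[i];
		}
	}
	return character;
}

static_assert(ConvertCaseSketch(411) == 42972);
static_assert(ConvertCaseSketch(42972) == 411);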
