Skip to content

Commit 6599d6f

Browse files
committed
Minor update to Unicode data generation.
1 parent a299130 commit 6599d6f

8 files changed

+180
-146
lines changed

scintilla/lexlib/CharacterCategory.h

+1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
namespace Lexilla {
1010

11+
// General Category Values https://www.unicode.org/reports/tr44/#GC_Values_Table
1112
enum CharacterCategory {
1213
ccLu, ccLl, ccLt, ccLm, ccLo,
1314
ccMn, ccMc, ccMe,

scintilla/scripts/GenerateCaseConvert.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -119,10 +119,13 @@ def updateCaseConvert():
119119
rangeGroups, nonRanges = groupRanges(symmetrics)
120120

121121
print(len(rangeGroups), "ranges")
122-
rangeLines = ["%d,%d,%d,%d," % x for x in rangeGroups]
122+
rangeLines = []
123+
for lower, upper, length, pitch in rangeGroups:
124+
assert length < 256 and pitch < 256
125+
rangeLines.append(f"0x{lower:04X}'{length:02X},0x{upper:04X}'{pitch:02X},")
123126

124127
print(len(nonRanges), "non ranges")
125-
nonRangeLines = ["%d,%d," % x for x in nonRanges]
128+
nonRangeLines = [f"{lower},{upper}," for lower, upper in nonRanges]
126129

127130
print(len(symmetrics), "symmetric")
128131

scintilla/scripts/GenerateCharacterCategory.py

+27-7
Original file line numberDiff line numberDiff line change
@@ -353,7 +353,7 @@ def updateCharClassifyTable(filename, headfile):
353353
return static_cast<CharacterClass>(classifyMap[ch]);
354354
}
355355
if (ch >= maxUnicode) {
356-
return CharacterClass::space; // Cn
356+
return CharacterClass::space; // Co, Cn
357357
}
358358
359359
ch -= sizeof(classifyMap);""".replace('maxUnicode', hex(tableSize)),
@@ -368,16 +368,36 @@ def updateCharClassifyTable(filename, headfile):
368368
Regenerate(filename, "//", output)
369369
Regenerate(headfile, "//", head_output)
370370

371+
def readCharacterCategoryTable(categories):
372+
# https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
373+
version, propertyList = readUnicodePropertyFile('UnicodeData.txt', propertyIndex=2, firstLast=1)
374+
indexTable = ['Cn'] * UnicodeCharacterCount
375+
flattenUnicodePropertyTable(indexTable, propertyList)
376+
diff = {}
377+
for index, cc in enumerate(indexTable):
378+
indexTable[index] = categories.index(cc)
379+
category = unicodedata.category(chr(index))
380+
if cc != category:
381+
key = (cc, category)
382+
if key in diff:
383+
prev = diff[key]
384+
start, end = prev[-1]
385+
if index - end == 1:
386+
prev[-1] = (start, index)
387+
else:
388+
prev.append((index, index))
389+
else:
390+
diff[key] = [(index, index)]
391+
for key, rangeList in diff.items():
392+
line = ', '.join(f'{start:04X}..{end:04X}' for start, end in rangeList)
393+
print(f'{key[0]} => {key[1]}: {line}')
394+
return indexTable
395+
371396
def updateCharacterCategoryTable(filename):
372397
categories = findCategories("../lexlib/CharacterCategory.h")
373398
output = [f"// Created with Python {platform.python_version()}, Unicode {unicodedata.unidata_version}"]
374399

375-
# https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
376-
# version, propertyList = readUnicodePropertyFile('UnicodeData.txt', propertyIndex=2)
377-
# indexTable = ['Cn'] * UnicodeCharacterCount
378-
# flattenUnicodePropertyTable(indexTable, propertyList)
379-
# for index, cc in enumerate(indexTable):
380-
# indexTable[index] = categories.index(cc)
400+
# indexTable = readCharacterCategoryTable(categories)
381401
defaultValue = categories.index('Cn')
382402
indexTable = [defaultValue] * UnicodeCharacterCount
383403
for ch in range(UnicodeCharacterCount):

scintilla/scripts/GenerateGraphemeBreak.py

+25-27
Original file line numberDiff line numberDiff line change
@@ -12,41 +12,39 @@ class GraphemeBreakProperty(IntEnum):
1212
Other = 0
1313
Control = 1
1414
Extend = 2
15-
RegionalIndicator = 3
15+
ZeroWidthJoiner = 3
1616
Prepend = 4
1717
HangulL = 5
1818
HangulV = 6
1919
HangulT = 7
2020
HangulLV = 8
2121
HangulLVT = 9
2222
ExtendedPictographic = 10
23-
ZeroWidthJoiner = 11
23+
RegionalIndicator = 11
2424
# Indic_Conjunct_Break
2525
ConjunctLinker = 12
2626
LinkingConsonant = 13
27-
ExtendConjunctLinker = 14
2827
# merged property
29-
SpacingMark = 15
30-
CR = 16
31-
LF = 17
28+
SpacingMark = 14
29+
CR = 15
30+
LF = 16
3231

3332
# https://www.unicode.org/reports/tr35/tr35-general.html#segmentations
3433
# https://github.com/unicode-org/cldr/blob/main/common/segments/root.xml
3534
# https://www.unicode.org/reports/tr51/#Emoji_Properties
3635
GraphemeBreakPropertyMap = GraphemeBreakProperty.__members__ | {
37-
'Regional_Indicator': GraphemeBreakProperty.RegionalIndicator,
38-
'RI': GraphemeBreakProperty.RegionalIndicator,
36+
'ZWJ': GraphemeBreakProperty.ZeroWidthJoiner,
3937
'L': GraphemeBreakProperty.HangulL,
4038
'V': GraphemeBreakProperty.HangulV,
4139
'T': GraphemeBreakProperty.HangulT,
4240
'LV': GraphemeBreakProperty.HangulLV,
4341
'LVT': GraphemeBreakProperty.HangulLVT,
4442
'Extended_Pictographic': GraphemeBreakProperty.ExtendedPictographic,
4543
'ExtPict': GraphemeBreakProperty.ExtendedPictographic,
46-
'ZWJ': GraphemeBreakProperty.ZeroWidthJoiner,
44+
'Regional_Indicator': GraphemeBreakProperty.RegionalIndicator,
45+
'RI': GraphemeBreakProperty.RegionalIndicator,
4746
'Consonant': GraphemeBreakProperty.LinkingConsonant,
4847
'Virama': GraphemeBreakProperty.ConjunctLinker,
49-
'ExtendLinker': GraphemeBreakProperty.ExtendConjunctLinker,
5048
}
5149

5250
# https://www.unicode.org/reports/tr44/#Indic_Conjunct_Break
@@ -57,14 +55,15 @@ def updateIndicConjunctBreak(graphemeBreakTable):
5755
flattenUnicodePropertyTable(indicConjunctBreak, propertyList)
5856
defaultValue = int(GraphemeBreakProperty.Other)
5957
extend = int(GraphemeBreakProperty.Extend)
60-
extendLinker = int(GraphemeBreakProperty.ExtendConjunctLinker)
58+
linker = int(GraphemeBreakProperty.ConjunctLinker)
6159
for index, conjunct in enumerate(indicConjunctBreak):
6260
grapheme = graphemeBreakTable[index]
6361
if grapheme == defaultValue:
62+
assert conjunct != 'Virama'
6463
grapheme = int(GraphemeBreakPropertyMap.get(conjunct, grapheme))
6564
elif grapheme == extend:
6665
if conjunct == 'Virama':
67-
grapheme = extendLinker
66+
grapheme = linker
6867
graphemeBreakTable[index] = grapheme
6968

7069
graphemeClusterBoundary = [0x3ffff] * (max(GraphemeBreakProperty.__members__.values()) + 1)
@@ -73,22 +72,21 @@ def buildGraphemeClusterBoundary():
7372
table = graphemeClusterBoundary
7473

7574
notBreak = {
76-
'Other': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker'],
75+
'Other': ['Extend', 'SpacingMark', 'ZWJ', 'ConjunctLinker'],
7776
'CR': ['LF'],
78-
'Extend': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker'],
79-
'RI': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'RI'],
80-
'Prepend': ['Other', 'Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'RI', 'Prepend', 'L', 'V', 'T', 'LV', 'LVT', 'ExtPict', 'ConjunctLinker', 'Consonant'],
81-
'SpacingMark': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker'],
82-
'L': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'L', 'V', 'LV', 'LVT'],
83-
'V': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'V', 'T'],
84-
'T': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'T'],
85-
'LV': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'V', 'T'],
86-
'LVT': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'T'],
87-
'ExtPict': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker'],
88-
'ZWJ': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'ExtPict', 'Consonant'],
89-
'ConjunctLinker': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'Consonant'],
90-
'Consonant': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'ConjunctLinker'],
91-
'ExtendLinker': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'Consonant'],
77+
'Extend': ['Extend', 'SpacingMark', 'ZWJ', 'ConjunctLinker'],
78+
'ZWJ': ['Extend', 'SpacingMark', 'ZWJ', 'ConjunctLinker', 'ExtPict', 'Consonant'],
79+
'Prepend': ['Other', 'Extend', 'SpacingMark', 'ZWJ', 'ConjunctLinker', 'RI', 'Prepend', 'L', 'V', 'T', 'LV', 'LVT', 'ExtPict', 'Consonant'],
80+
'SpacingMark': ['Extend', 'SpacingMark', 'ZWJ', 'ConjunctLinker'],
81+
'L': ['Extend', 'SpacingMark', 'ZWJ', 'ConjunctLinker', 'L', 'V', 'LV', 'LVT'],
82+
'V': ['Extend', 'SpacingMark', 'ZWJ', 'ConjunctLinker', 'V', 'T'],
83+
'T': ['Extend', 'SpacingMark', 'ZWJ', 'ConjunctLinker', 'T'],
84+
'LV': ['Extend', 'SpacingMark', 'ZWJ', 'ConjunctLinker', 'V', 'T'],
85+
'LVT': ['Extend', 'SpacingMark', 'ZWJ', 'ConjunctLinker', 'T'],
86+
'ExtPict': ['Extend', 'SpacingMark', 'ZWJ', 'ConjunctLinker'],
87+
'RI': ['Extend', 'SpacingMark', 'ZWJ', 'ConjunctLinker', 'RI'],
88+
'ConjunctLinker': ['Extend', 'SpacingMark', 'ZWJ', 'ConjunctLinker', 'Consonant'],
89+
'Consonant': ['Extend', 'SpacingMark', 'ZWJ', 'ConjunctLinker'],
9290
}
9391

9492
for key, row in notBreak.items():

scintilla/scripts/UnicodeData.py

+17-5
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,12 @@ def getCharacterName(ch):
2222
except ValueError:
2323
return ''
2424

25-
def readUnicodePropertyFile(path, propertyIndex=1):
25+
def readUnicodePropertyFile(path, propertyIndex=1, firstLast=None):
2626
filename, ext = os.path.splitext(os.path.basename(path))
2727
version = ''
2828
propertyList = {}
2929
missingList = {}
30+
prevLine = []
3031
with open(path, encoding='utf-8') as fd:
3132
for line in fd.readlines():
3233
line = line.strip()
@@ -56,10 +57,21 @@ def readUnicodePropertyFile(path, propertyIndex=1):
5657
missingList[propertyName] = (code, value)
5758
else:
5859
propertyName = items[propertyIndex].strip()
59-
if propertyName in propertyList:
60-
propertyList[propertyName].append(code)
61-
else:
62-
propertyList[propertyName] = [code]
60+
if firstLast:
61+
last = items[firstLast].strip()
62+
if last.endswith('Last>'):
63+
first = prevLine[firstLast].strip()
64+
prop = prevLine[propertyIndex].strip()
65+
assert first.endswith('First>')
66+
assert propertyName == prop and len(code) == 1
67+
propertyList[propertyName][-1].append(code[0])
68+
propertyName = None
69+
if propertyName is not None:
70+
if propertyName in propertyList:
71+
propertyList[propertyName].append(code)
72+
else:
73+
propertyList[propertyName] = [code]
74+
prevLine = items
6375

6476
print(path, version, 'property:', ', '.join(sorted(propertyList.keys())))
6577
return version, propertyList

scintilla/src/CaseConvert.cxx

+66-64
Original file line numberDiff line numberDiff line change
@@ -31,63 +31,63 @@ namespace {
3131
// Another pattern (pitch==2) is where each lower case letter is preceded by
3232
// the upper case form. These are also grouped into ranges.
3333

34-
constexpr int symmetricCaseConversionRanges[] = {
35-
//lower, upper, range length, range pitch
34+
constexpr unsigned int symmetricCaseConversionRanges[] = {
35+
//(lower << 8, range length), (upper << 8, range pitch)
3636
//++Autogenerated -- start of section automatically generated
3737
//**\(\*\n\)
38-
97,65,26,1,
39-
224,192,23,1,
40-
248,216,7,1,
41-
257,256,24,2,
42-
314,313,8,2,
43-
331,330,23,2,
44-
462,461,8,2,
45-
479,478,9,2,
46-
505,504,20,2,
47-
547,546,9,2,
48-
583,582,5,2,
49-
945,913,17,1,
50-
963,931,9,1,
51-
985,984,12,2,
52-
1072,1040,32,1,
53-
1104,1024,16,1,
54-
1121,1120,17,2,
55-
1163,1162,27,2,
56-
1218,1217,7,2,
57-
1233,1232,48,2,
58-
1377,1329,38,1,
59-
4304,7312,43,1,
60-
7681,7680,75,2,
61-
7841,7840,48,2,
62-
7936,7944,8,1,
63-
7952,7960,6,1,
64-
7968,7976,8,1,
65-
7984,7992,8,1,
66-
8000,8008,6,1,
67-
8032,8040,8,1,
68-
8560,8544,16,1,
69-
9424,9398,26,1,
70-
11312,11264,48,1,
71-
11393,11392,50,2,
72-
11520,4256,38,1,
73-
42561,42560,23,2,
74-
42625,42624,14,2,
75-
42787,42786,7,2,
76-
42803,42802,31,2,
77-
42879,42878,5,2,
78-
42903,42902,10,2,
79-
42933,42932,8,2,
80-
65345,65313,26,1,
81-
66600,66560,40,1,
82-
66776,66736,36,1,
83-
66967,66928,11,1,
84-
66979,66940,15,1,
85-
66995,66956,7,1,
86-
68800,68736,51,1,
87-
68976,68944,22,1,
88-
71872,71840,32,1,
89-
93792,93760,32,1,
90-
125218,125184,34,1,
38+
0x0061'1A,0x0041'01,
39+
0x00E0'17,0x00C0'01,
40+
0x00F8'07,0x00D8'01,
41+
0x0101'18,0x0100'02,
42+
0x013A'08,0x0139'02,
43+
0x014B'17,0x014A'02,
44+
0x01CE'08,0x01CD'02,
45+
0x01DF'09,0x01DE'02,
46+
0x01F9'14,0x01F8'02,
47+
0x0223'09,0x0222'02,
48+
0x0247'05,0x0246'02,
49+
0x03B1'11,0x0391'01,
50+
0x03C3'09,0x03A3'01,
51+
0x03D9'0C,0x03D8'02,
52+
0x0430'20,0x0410'01,
53+
0x0450'10,0x0400'01,
54+
0x0461'11,0x0460'02,
55+
0x048B'1B,0x048A'02,
56+
0x04C2'07,0x04C1'02,
57+
0x04D1'30,0x04D0'02,
58+
0x0561'26,0x0531'01,
59+
0x10D0'2B,0x1C90'01,
60+
0x1E01'4B,0x1E00'02,
61+
0x1EA1'30,0x1EA0'02,
62+
0x1F00'08,0x1F08'01,
63+
0x1F10'06,0x1F18'01,
64+
0x1F20'08,0x1F28'01,
65+
0x1F30'08,0x1F38'01,
66+
0x1F40'06,0x1F48'01,
67+
0x1F60'08,0x1F68'01,
68+
0x2170'10,0x2160'01,
69+
0x24D0'1A,0x24B6'01,
70+
0x2C30'30,0x2C00'01,
71+
0x2C81'32,0x2C80'02,
72+
0x2D00'26,0x10A0'01,
73+
0xA641'17,0xA640'02,
74+
0xA681'0E,0xA680'02,
75+
0xA723'07,0xA722'02,
76+
0xA733'1F,0xA732'02,
77+
0xA77F'05,0xA77E'02,
78+
0xA797'0A,0xA796'02,
79+
0xA7B5'08,0xA7B4'02,
80+
0xFF41'1A,0xFF21'01,
81+
0x10428'28,0x10400'01,
82+
0x104D8'24,0x104B0'01,
83+
0x10597'0B,0x10570'01,
84+
0x105A3'0F,0x1057C'01,
85+
0x105B3'07,0x1058C'01,
86+
0x10CC0'33,0x10C80'01,
87+
0x10D70'16,0x10D50'01,
88+
0x118C0'20,0x118A0'01,
89+
0x16E60'20,0x16E40'01,
90+
0x1E922'22,0x1E900'01,
9191

9292
//--Autogenerated -- end of section automatically generated
9393
};
@@ -697,19 +697,21 @@ void CaseConverter::AddSymmetric(CaseConversion conversion, int lower, int upper
697697

698698
void CaseConverter::SetupConversions(CaseConversion conversion) {
699699
// First initialize for the symmetric ranges
700-
for (size_t i = 0; i < std::size(symmetricCaseConversionRanges);) {
701-
const int lower = symmetricCaseConversionRanges[i++];
702-
const int upper = symmetricCaseConversionRanges[i++];
703-
const int length = symmetricCaseConversionRanges[i++];
704-
const int pitch = symmetricCaseConversionRanges[i++];
705-
for (int j = 0; j < length*pitch; j += pitch) {
700+
for (size_t i = 0; i < std::size(symmetricCaseConversionRanges); i += 2) {
701+
unsigned int lower = symmetricCaseConversionRanges[i];
702+
unsigned int upper = symmetricCaseConversionRanges[i + 1];
703+
const int pitch = upper & 0xff;
704+
const int length = (lower & 0xff)*pitch;
705+
lower >>= 8;
706+
upper >>= 8;
707+
for (int j = 0; j < length; j += pitch) {
706708
AddSymmetric(conversion, lower + j, upper + j);
707709
}
708710
}
709711
// Add the symmetric singletons
710-
for (size_t i = 0; i < std::size(symmetricCaseConversions);) {
711-
const int lower = symmetricCaseConversions[i++];
712-
const int upper = symmetricCaseConversions[i++];
712+
for (size_t i = 0; i < std::size(symmetricCaseConversions); i += 2) {
713+
const int lower = symmetricCaseConversions[i];
714+
const int upper = symmetricCaseConversions[i + 1];
713715
AddSymmetric(conversion, lower, upper);
714716
}
715717
// Add the complex cases

0 commit comments

Comments
 (0)