zufuliu
diff --git a/‎scintilla/lexlib/CharacterCategory.cxx
+292-276 b/‎scintilla/lexlib/CharacterCategory.cxx
+292-276
diff --git a/‎scintilla/lexlib/StyleContext.cxx
+6 b/‎scintilla/lexlib/StyleContext.cxx
+6
diff --git a/‎scintilla/scripts/GenerateGraphemeBreak.py
+82-38 b/‎scintilla/scripts/GenerateGraphemeBreak.py
+82-38
diff --git a/‎scintilla/scripts/GenerateLineBreak.py
+2-2 b/‎scintilla/scripts/GenerateLineBreak.py
+2-2
diff --git a/‎scintilla/scripts/UnicodeData.py
+105 b/‎scintilla/scripts/UnicodeData.py
+105
diff --git a/‎scintilla/src/CaseConvert.cxx
+6 b/‎scintilla/src/CaseConvert.cxx
+6
@@ -44,6 +44,9 @@ bool StyleContext::MatchIgnoreCase(const char *s) const noexcept {
 		return false;
 	}
 	s++;
+	if (!*s) {
+		return true;
+	}
 	if (MakeLowerCase(chNext) != static_cast<unsigned char>(*s)) {
 		return false;
 	}
@@ -61,6 +64,9 @@ bool StyleContext::MatchLowerCase(const char *s) const noexcept {
 		return false;
 	}
 	s++;
+	if (!*s) {
+		return true;
+	}
 	if (UnsafeLower(chNext) != static_cast<unsigned char>(*s)) {
 		return false;
 	}
 
@@ -1,5 +1,4 @@
 # script to generate grapheme cluster boundary data.
-# https://www.unicode.org/reports/tr41/
 from enum import IntEnum
 
 from FileGenerator import Regenerate
@@ -8,23 +7,32 @@
 
 # Unicode Text Segmentation
 # https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
+# https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.html
 class GraphemeBreakProperty(IntEnum):
 	Other = 0
-	CR = 1
-	LF = 2
-	Control = 3
-	Extend = 4
-	RegionalIndicator = 5
-	Prepend = 6
-	SpacingMark = 7
-	HangulL = 8
-	HangulV = 9
-	HangulT = 10
-	HangulLV = 11
-	HangulLVT = 12
-	ExtendedPictographic = 13
-	ZeroWidthJoiner = 14
-
+	Control = 1
+	Extend = 2
+	RegionalIndicator = 3
+	Prepend = 4
+	HangulL = 5
+	HangulV = 6
+	HangulT = 7
+	HangulLV = 8
+	HangulLVT = 9
+	ExtendedPictographic = 10
+	ZeroWidthJoiner = 11
+	# Indic_Conjunct_Break
+	ConjunctLinker = 12
+	LinkingConsonant = 13
+	ExtendConjunctLinker = 14
+	# merged property
+	SpacingMark = 15
+	CR = 16
+	LF = 17
+
+# https://www.unicode.org/reports/tr35/tr35-general.html#segmentations
+# https://github.com/unicode-org/cldr/blob/main/common/segments/root.xml
+# https://www.unicode.org/reports/tr51/#Emoji_Properties
 GraphemeBreakPropertyMap = GraphemeBreakProperty.__members__ | {
 	'Regional_Indicator': GraphemeBreakProperty.RegionalIndicator,
 	'RI': GraphemeBreakProperty.RegionalIndicator,
@@ -36,28 +44,51 @@ class GraphemeBreakProperty(IntEnum):
 	'Extended_Pictographic': GraphemeBreakProperty.ExtendedPictographic,
 	'ExtPict': GraphemeBreakProperty.ExtendedPictographic,
 	'ZWJ': GraphemeBreakProperty.ZeroWidthJoiner,
+	'Consonant': GraphemeBreakProperty.LinkingConsonant,
+	'Virama': GraphemeBreakProperty.ConjunctLinker,
+	'ExtendLinker': GraphemeBreakProperty.ExtendConjunctLinker,
 }
 
-graphemeClusterBoundary = [0xffff] * (max(GraphemeBreakProperty.__members__.values()) + 1)
+# https://www.unicode.org/reports/tr44/#Indic_Conjunct_Break
+# Refine graphemeBreakTable in place with Indic conjunct information.
+# graphemeBreakTable is indexed by code point and holds GraphemeBreakProperty values;
+# only entries currently equal to Other or Extend are touched. Nothing is returned.
+# NOTE(review): the link above is for Indic_Conjunct_Break, but the data actually read
+# below is IndicSyllabicCategory.txt, so the names tested here ('Consonant', 'Virama')
+# are taken from that file -- confirm this approximation is intended.
+def updateIndicConjunctBreak(graphemeBreakTable):
+	# https://www.unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
+	# one property name per code point, 'Other' where the data file assigns nothing
+	indicConjunctBreak = ['Other'] * UnicodeCharacterCount
+	# version is not used in this function
+	version, propertyList = readUnicodePropertyFile('IndicSyllabicCategory.txt')
+	# presumably expands the range-based propertyList into the flat per-code-point list -- helper not shown here
+	flattenUnicodePropertyTable(indicConjunctBreak, propertyList)
+	defaultValue = int(GraphemeBreakProperty.Other)
+	extend = int(GraphemeBreakProperty.Extend)
+	extendLinker = int(GraphemeBreakProperty.ExtendConjunctLinker)
+	for index, conjunct in enumerate(indicConjunctBreak):
+		grapheme = graphemeBreakTable[index]
+		if grapheme == defaultValue:
+			# code point has no grapheme break property yet: adopt the mapped value when the
+			# name is a key of GraphemeBreakPropertyMap, otherwise keep the current value
+			grapheme = GraphemeBreakPropertyMap.get(conjunct, grapheme)
+		elif grapheme == extend:
+			# an Extend code point that is also a Virama becomes ExtendConjunctLinker
+			if conjunct == 'Virama':
+				grapheme = extendLinker
+		graphemeBreakTable[index] = grapheme
+
+graphemeClusterBoundary = [0x3ffff] * (max(GraphemeBreakProperty.__members__.values()) + 1)
+# https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
 def buildGraphemeClusterBoundary():
 	table = graphemeClusterBoundary
 
-	# https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.html
 	notBreak = {
-		'Other': ['Extend', 'SpacingMark', 'ZWJ'],
+		'Other': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker'],
 		'CR': ['LF'],
-		'Extend': ['Extend', 'SpacingMark', 'ZWJ'],
-		'RI': ['Extend', 'RI', 'SpacingMark', 'ZWJ'],
-		'Prepend': ['Other', 'Extend', 'RI', 'Prepend', 'SpacingMark', 'L', 'V', 'T', 'LV', 'LVT', 'ExtPict', 'ZWJ'],
-		'SpacingMark': ['Extend', 'SpacingMark', 'ZWJ'],
-		'L': ['Extend', 'SpacingMark', 'L', 'V', 'LV', 'LVT', 'ZWJ'],
-		'V': ['Extend', 'SpacingMark', 'V', 'T', 'ZWJ'],
-		'T': ['Extend', 'SpacingMark', 'T', 'ZWJ'],
-		'LV': ['Extend', 'SpacingMark', 'V', 'T', 'ZWJ'],
-		'LVT': ['Extend', 'SpacingMark', 'T', 'ZWJ'],
-		'ExtPict': ['Extend', 'SpacingMark', 'ZWJ'],
-		#'ZWJ': ['Extend', 'SpacingMark', 'ZWJ'],
-		'ZWJ': ['Extend', 'SpacingMark', 'ExtPict', 'ZWJ'],
+		'Extend': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker'],
+		'RI': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'RI'],
+		'Prepend': ['Other', 'Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'RI', 'Prepend', 'L', 'V', 'T', 'LV', 'LVT', 'ExtPict', 'ConjunctLinker', 'Consonant'],
+		'SpacingMark': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker'],
+		'L': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'L', 'V', 'LV', 'LVT'],
+		'V': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'V', 'T'],
+		'T': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'T'],
+		'LV': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'V', 'T'],
+		'LVT': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'T'],
+		'ExtPict': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker'],
+		'ZWJ': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'ExtPict', 'Consonant'],
+		'ConjunctLinker': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'Consonant'],
+		'Consonant': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'ConjunctLinker'],
+		'ExtendLinker': ['Extend', 'SpacingMark', 'ZWJ', 'ExtendLinker', 'Consonant'],
 	}
 
 	for key, row in notBreak.items():
@@ -93,10 +124,12 @@ def findLongestCharacterSequence(path):
 def testGraphemeBreak(path, graphemeBreakTable):
 	opportunity = '×÷'
 	allow = opportunity[1]
-	totalCount = 0
-	failCount = 0
+	total = 0
+	fail = [0, 0]
+	ignore = 0
 	with open(path, encoding='utf-8') as fd:
 		lineno = 0
+		indent = ' '*4
 		for line in fd.readlines():
 			lineno += 1
 			line = line.strip()
@@ -111,14 +144,19 @@ def testGraphemeBreak(path, graphemeBreakTable):
 				ch = sequence[index]
 				official = sequence[index + 1]
 				chNext = sequence[index + 2]
+				if ch == '000D' and chNext == '000A':
+					ignore += 1
+					continue
 				prop = GraphemeBreakProperty(graphemeBreakTable[int(ch, 16)])
 				propNext = GraphemeBreakProperty(graphemeBreakTable[int(chNext, 16)])
-				value = opportunity[(graphemeClusterBoundary[prop] >> propNext) & 1]
-				totalCount += 1
+				result = (graphemeClusterBoundary[prop] >> propNext) & 1
+				value = opportunity[result]
+				total += 1
 				if value != official:
-					failCount += 1
+					fail[result ^ 1] += 1
 					print(f'test fail on line {lineno}: {ch} {official} {chNext} => {prop.name} {value} {propNext.name}')
-	print(f'{path} total test: {totalCount}, failed test: {failCount}')
+					print(f'{indent}{line}')
+	print(f'{path} total test: {total}, failed: {opportunity[0]} {fail[0]}, {opportunity[1]} {fail[1]}, ignored: {ignore}')
 
 def updateGraphemeBreakTable(headerFile, sourceFile):
 	defaultValue = int(GraphemeBreakProperty.Other)
@@ -129,6 +167,7 @@ def updateGraphemeBreakTable(headerFile, sourceFile):
 	# https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
 	version, propertyList = readUnicodePropertyFile('GraphemeBreakProperty.txt')
 	updateUnicodePropertyTable(graphemeBreakTable, GraphemeBreakPropertyMap, propertyList)
+	updateIndicConjunctBreak(graphemeBreakTable)
 
 	tableSize = getMinTableSize(graphemeBreakTable, defaultValue)
 	print(f'Grapheme Break table size: {tableSize}, last value: {GraphemeBreakProperty(graphemeBreakTable[tableSize - 1]).name}')
@@ -150,8 +189,11 @@ def updateGraphemeBreakTable(headerFile, sourceFile):
 
 	output = []
 	output.append('enum class GraphemeBreakProperty {')
+	propNext = GraphemeBreakProperty(max(graphemeBreakTable))
 	for prop in GraphemeBreakProperty.__members__.values():
 		output.append(f'\t{prop.name} = {prop.value},')
+		if prop == propNext:
+			break
 	output.append('\tSentinel = Prepend,')
 	output.append('};')
 
@@ -162,7 +204,8 @@ def updateGraphemeBreakTable(headerFile, sourceFile):
 
 	output.append('')
 	output.append('constexpr uint16_t graphemeClusterBoundary[] = {')
-	output.extend(bitValue(value) + ', // ' + GraphemeBreakProperty(index).name for index, value in enumerate(graphemeClusterBoundary))
+	table = graphemeClusterBoundary[:int(propNext) + 1]
+	output.extend(bitValue(value & 0xffff) + ', // ' + GraphemeBreakProperty(index).name for index, value in enumerate(table))
 	output.append('};')
 
 	output.append("""
@@ -192,4 +235,5 @@ def updateGraphemeBreakTable(headerFile, sourceFile):
 	Regenerate(sourceFile, "//grapheme table", table)
 
 if __name__ == '__main__':
+	# parseSegmentationChart('Grapheme Break Chart.html')
 	updateGraphemeBreakTable('../src/CharClassify.h', '../src/CharClassify.cxx')
@@ -1,6 +1,4 @@
 # script to generate line breaking data.
-# https://www.unicode.org/reports/tr41/
-# https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/LineBreakTest.html
 
 import platform
 import unicodedata
@@ -13,6 +11,7 @@
 
 # Unicode Line Breaking Algorithm
 # https://www.unicode.org/reports/tr14/
+# https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/LineBreakTest.html
 class LineBreak(IntFlag):
 	NonBreak = 0
 	BreakBefore = 1	# B
@@ -86,6 +85,7 @@ class LineBreak(IntFlag):
 def updateUnicodeLineBreak(filename):
 	lineBreakTable = ['XX'] * UnicodeCharacterCount	# @missing
 	# https://www.unicode.org/Public/UCD/latest/ucd/LineBreak.txt
+	# https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedLineBreak.txt
 	version, propertyList = readUnicodePropertyFile('LineBreak.txt')
 	flattenUnicodePropertyTable(lineBreakTable, propertyList)
 
 
@@ -2,6 +2,15 @@
 import os.path
 import unicodedata
 
+# Common References for Unicode Standard Annexes
+# https://www.unicode.org/reports/tr41/
+# Unicode Character Database
+# https://www.unicode.org/reports/tr44/
+# Unicode Locale Data Markup Language (LDML)
+# https://www.unicode.org/reports/tr35/
+# Unicode Character Database in XML
+# https://www.unicode.org/reports/tr42/
+
 MaxCharacter = sys.maxunicode
 UnicodeCharacterCount = sys.maxunicode + 1
 BMPCharacterCharacterCount = 0xffff + 1
@@ -88,3 +97,99 @@ def flattenPropertyMap(propertyMap):
 		for value in items:
 			result[value] = key
 	return result
+
+def parseSegmentationChart(path, opportunity=0):
+	# Parse a saved segmentation chart HTML page and print, for every row property,
+	# the properties whose cell holds the selected break symbol.
+	#   path:        path of the HTML file to read (UTF-8).
+	#   opportunity: index into '×÷' selecting the symbol to report (0 => '×', 1 => '÷').
+	# The function only prints; it returns nothing.
+	# bs4 is imported locally, so it is needed only when this function is called;
+	# the 'html5lib' parser name passed below must also be available to BeautifulSoup.
+	from bs4 import BeautifulSoup
+
+	print('parse:', path)
+	with open(path, encoding='utf-8', newline='\n') as fd:
+		doc = fd.read()
+	soup = BeautifulSoup(doc, 'html5lib')
+	title = soup.find('title').get_text()
+	# the chart is the first <table> following the <a name="table"> anchor
+	node = soup.find('a', {'name': 'table'})
+	node = node.find_next('table').find('tbody')
+	table = []			# cell lists of the distinct rows, in document order
+	row_map = {}		# row header => cell list
+	same_row = {}		# row header => earlier row header with identical cells
+	column_header = []
+	for row in node.find_all('tr'):
+		items = []
+		# only <th> cells are collected; rows without any <th> are skipped
+		for column in row.find_all('th'):
+			items.append(column.get_text().strip())
+		if not items:
+			continue
+		if not column_header:
+			# first non-empty row is the header row: empty corner cell, then column names
+			assert len(items) > 1 and len(items[0]) == 0
+			column_header = items[1:]
+		else:
+			assert len(items) == len(column_header) + 1
+			header = items[0]
+			items = items[1:]
+			if header in row_map:
+				# a repeated row header must carry identical cells
+				assert items == row_map[header]
+			else:
+				# remember the first earlier row with exactly the same cells
+				for name, value in row_map.items():
+					if items == value:
+						same_row[header] = name
+						# print('same row:', name, header)
+						break
+				table.append(items)
+				row_map[header] = items
+
+	row_header = list(row_map.keys())
+	row_count = len(row_header)
+	print(f'{title} table row: {row_count}, column: {len(column_header)}')
+	column_map = {}		# column header => cell list (one cell per distinct row)
+	same_column = {}	# column header => earlier column header with identical cells
+	for index, header in enumerate(column_header):
+		# gather column `index` from every distinct row
+		items = [None] * row_count
+		for i in range(row_count):
+			items[i] = table[i][index]
+		if header in column_map:
+			# a repeated column header must carry identical cells
+			assert items == column_map[header]
+		else:
+			for name, value in column_map.items():
+				if items == value:
+					same_column[header] = name
+					# print('same column:', name, header)
+					break
+			column_map[header] = items
+
+	# find property with same row and same column
+	same_map = {}		# merged property => earlier property it is reported as
+	# candidates: names taking part in both a duplicate-row and a duplicate-column relation
+	both = (same_row.keys() | same_row.values()) & (same_column.keys() | same_column.values())
+	if len(both) > 1:
+		count = len(both)
+		# order candidates as they appear among the row headers
+		both = sorted(both, key=row_header.index)
+		index = 0
+		while index < count:
+			header = both[index]
+			index += 1
+			# representative row/column names (the name itself when it is the first of its kind)
+			row = same_row.get(header, header)
+			column = same_column.get(header, header)
+			j = index
+			while j < count:
+				name = both[j]
+				j += 1
+				r = same_row.get(name, name)
+				c = same_column.get(name, name)
+				if row == r and column == c:
+					print('same row & column:', header, name)
+					same_map[name] = header
+					# drop the merged name so it is not compared again; step j back to
+					# stay on the element that slides into the freed position
+					j -= 1
+					count -= 1
+					del both[j]
+
+	# from here on `opportunity` is the symbol itself rather than its index
+	opportunity = '×÷'[opportunity]
+	for header, items in row_map.items():
+		if header in same_map:
+			# merged into an earlier property, reported under that name only
+			continue
+		row = []
+		for index, symbol in enumerate(items):
+			if symbol == opportunity:
+				# NOTE(review): `index` is a column position, yet the name is taken from
+				# row_header; this is only correct when rows and columns list the same
+				# properties in the same order -- confirm, or use column_header[index]
+				value = row_header[index]
+				value = same_map.get(value, value)
+				if value not in row:
+					row.append(value)
+		if row:
+			print(f'{opportunity} {header}:', ', '.join(row))
@@ -84,6 +84,7 @@ constexpr int symmetricCaseConversionRanges[] = {
 66979,66940,15,1,
 66995,66956,7,1,
 68800,68736,51,1,
+68976,68944,22,1,
 71872,71840,32,1,
 93792,93760,32,1,
 125218,125184,34,1,
@@ -114,6 +115,7 @@ constexpr int symmetricCaseConversions[] = {
 405,502,
 409,408,
 410,573,
+411,42972,
 414,544,
 417,416,
 419,418,
@@ -149,6 +151,7 @@ constexpr int symmetricCaseConversions[] = {
 608,403,
 609,42924,
 611,404,
+612,42955,
 613,42893,
 614,42922,
 616,407,
@@ -195,6 +198,7 @@ constexpr int symmetricCaseConversions[] = {
 4349,7357,
 4350,7358,
 4351,7359,
+7306,7305,
 7545,42877,
 7549,11363,
 7566,42950,
@@ -246,9 +250,11 @@ constexpr int symmetricCaseConversions[] = {
 42900,42948,
 42952,42951,
 42954,42953,
+42957,42956,
 42961,42960,
 42967,42966,
 42969,42968,
+42971,42970,
 42998,42997,
 43859,42931,
 67003,66964,
Original file line number	Diff line number	Diff line change
`@@ -44,6 +44,9 @@ bool StyleContext::MatchIgnoreCase(const char *s) const noexcept {`
`44`	`44`	`return false;`
`45`	`45`	`}`
`46`	`46`	`s++;`
	`47`	`+ if (!*s) {`
	`48`	`+ return true;`
	`49`	`+ }`
`47`	`50`	`if (MakeLowerCase(chNext) != static_cast<unsigned char>(*s)) {`
`48`	`51`	`return false;`
`49`	`52`	`}`
`@@ -61,6 +64,9 @@ bool StyleContext::MatchLowerCase(const char *s) const noexcept {`
`61`	`64`	`return false;`
`62`	`65`	`}`
`63`	`66`	`s++;`
	`67`	`+ if (!*s) {`
	`68`	`+ return true;`
	`69`	`+ }`
`64`	`70`	`if (UnsafeLower(chNext) != static_cast<unsigned char>(*s)) {`
`65`	`71`	`return false;`
`66`	`72`	`}`