Improve grapheme cluster boundary detection for keycap emoji,

zufuliu · zufuliu · commit de56e2fb5eaa · 2025-02-22T18:30:57.000+08:00
see https://sourceforge.net/p/scintilla/feature-requests/1417/
diff --git a/scintilla/scripts/GenerateGraphemeBreak.py b/scintilla/scripts/GenerateGraphemeBreak.py
@@ -61,7 +61,7 @@ def updateIndicConjunctBreak(graphemeBreakTable):
 	for index, conjunct in enumerate(indicConjunctBreak):
 		grapheme = graphemeBreakTable[index]
 		if grapheme == defaultValue:
-			grapheme = GraphemeBreakPropertyMap.get(conjunct, grapheme)
+			grapheme = int(GraphemeBreakPropertyMap.get(conjunct, grapheme))
 		elif grapheme == extend:
 			if conjunct == 'Virama':
 				grapheme = extendLinker
@@ -194,7 +194,8 @@ def updateGraphemeBreakTable(headerFile, sourceFile):
 		output.append(f'\t{prop.name} = {prop.value},')
 		if prop == propNext:
 			break
-	output.append('\tSentinel = Prepend,')
+	output.append('\tForwardSentinel = Prepend,')
+	output.append('\tBackwardSentinel = Extend,')
 	output.append('};')
 
 	output.append('')
diff --git a/scintilla/src/CharClassify.h b/scintilla/src/CharClassify.h
@@ -47,7 +47,8 @@ enum class GraphemeBreakProperty {
 	ConjunctLinker = 12,
 	LinkingConsonant = 13,
 	ExtendConjunctLinker = 14,
-	Sentinel = Prepend,
+	ForwardSentinel = Prepend,
+	BackwardSentinel = Extend,
 };
 
 constexpr int maxUnicodeGraphemeBreakCharacter = 0xe1000;
diff --git a/scintilla/src/Document.cxx b/scintilla/src/Document.cxx
@@ -1168,8 +1168,7 @@ bool Document::IsDBCSDualByteAt(Sci::Position pos) const noexcept {
 //   4) Break after whole character, this may break combining characters
 
 size_t Document::SafeSegment(const char *text, size_t lengthSegment, EncodingFamily encodingFamily) const noexcept {
-	const char * const end = text + lengthSegment;
-	const char *it = end;
+	const char *it = text + lengthSegment;
 	// check space first as most written language use spaces.
 	do {
 		if (IsBreakSpace(*it)) {
@@ -1180,40 +1179,44 @@ size_t Document::SafeSegment(const char *text, size_t lengthSegment, EncodingFam
 
 	if (encodingFamily != EncodingFamily::dbcs) {
 		// backward iterate for UTF-8 and single byte encoding to find word and punctuation boundary.
-		it = end;
+		it = text + lengthSegment;
+		size_t lastPunctuationBreak = lengthSegment;
 		const CharacterClass ccPrev = charClass.GetClass(*it);
 		do {
 			--it;
-			const CharacterClass cc = charClass.GetClass(*it);
+			uint8_t ch = *it;
+			const CharacterClass cc = charClass.GetClass(ch);
 			if (cc != ccPrev) {
-				return it - text + 1;
+				lastPunctuationBreak = it - text + 1;
+				break;
 			}
 		} while (it != text);
 
-		it = end;
-		if (encodingFamily != EncodingFamily::eightBit && ccPrev == CharacterClass::word) {
+		if (ccPrev >= CharacterClass::punctuation && encodingFamily != EncodingFamily::eightBit) {
 			// for UTF-8 go back two code points to detect grapheme cluster boundary.
-			it -= 2*UTF8MaxBytes;
-			for (int tryCount = 0; tryCount < 2; tryCount++) {
+			it = text + lastPunctuationBreak;
+			// only find grapheme cluster boundary within last longest sequence
+			const char * const end = it - std::min<size_t>(lastPunctuationBreak, longestUnicodeCharacterSequenceBytes + UTF8MaxBytes);
+			const char *prev = it;
+			GraphemeBreakProperty next = GraphemeBreakProperty::BackwardSentinel;
+			do {
 				// go back to the start of current character.
-				for (int trail = 0; trail < UTF8MaxBytes - 1 && UTF8IsTrailByte(*it); trail++) {
+				while (UTF8IsTrailByte(*it)) {
 					--it;
 				}
-				GraphemeBreakProperty prev = GraphemeBreakProperty::Sentinel;
-				do {
-					const int character = UnicodeFromUTF8(reinterpret_cast<const unsigned char *>(it));
-					const GraphemeBreakProperty current = CharClassify::GetGraphemeBreakProperty(character);
-					if (IsGraphemeClusterBoundary(prev, current)) {
-						return it - text;
-					}
-					prev = current;
-					it += UTF8BytesOfLead(static_cast<unsigned char>(*it));
-				} while (it < end);
-				// no boundary between last two code points, assume text ends with the longest sequence.
-				it -= longestUnicodeCharacterSequenceBytes + UTF8MaxBytes;
-			}
+				// text is valid UTF-8, invalid UTF-8 are represented with isolated bytes
+				const int character = UnicodeFromUTF8(reinterpret_cast<const unsigned char *>(it));
+				const GraphemeBreakProperty current = CharClassify::GetGraphemeBreakProperty(character);
+				if (IsGraphemeClusterBoundary(current, next)) {
+					lastPunctuationBreak = prev - text;
+					break;
+				}
+				next = current;
+				prev = it;
+				--it;
+			} while (it > end);
 		}
-		return it - text;
+		return lastPunctuationBreak;
 	}
 
 	{
diff --git a/scintilla/src/UniConversion.h b/scintilla/src/UniConversion.h
@@ -54,6 +54,10 @@ constexpr bool UTF8IsTrailByte(unsigned char ch) noexcept {
 	return (ch >= 0x80) && (ch < 0xc0);
 }
 
+constexpr bool UTF8IsFirstByte(unsigned char ch) noexcept {
+	return (ch >= 0xc2) && (ch <= 0xf4);
+}
+
 constexpr bool IsASCIICharacter(unsigned int ch) noexcept {
 	return ch < 0x80;
 }

Original file line number	Diff line number	Diff line change
`@@ -54,6 +54,10 @@ constexpr bool UTF8IsTrailByte(unsigned char ch) noexcept {`
`54`	`54`	`return (ch >= 0x80) && (ch < 0xc0);`
`55`	`55`	`}`
`56`	`56`
	`57`	`+constexpr bool UTF8IsFirstByte(unsigned char ch) noexcept {`
	`58`	`+ return (ch >= 0xc2) && (ch <= 0xf4);`
	`59`	`+}`
	`60`	`+`
`57`	`61`	`constexpr bool IsASCIICharacter(unsigned int ch) noexcept {`
`58`	`62`	`return ch < 0x80;`
`59`	`63`	`}`