Skip to content

Commit de56e2f

Browse files
committed
Improve grapheme cluster boundary detection for keycap emoji,
see https://sourceforge.net/p/scintilla/feature-requests/1417/
1 parent 4b8a188 commit de56e2f

File tree

4 files changed

+36
-27
lines changed

4 files changed

+36
-27
lines changed

scintilla/scripts/GenerateGraphemeBreak.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def updateIndicConjunctBreak(graphemeBreakTable):
6161
for index, conjunct in enumerate(indicConjunctBreak):
6262
grapheme = graphemeBreakTable[index]
6363
if grapheme == defaultValue:
64-
grapheme = GraphemeBreakPropertyMap.get(conjunct, grapheme)
64+
grapheme = int(GraphemeBreakPropertyMap.get(conjunct, grapheme))
6565
elif grapheme == extend:
6666
if conjunct == 'Virama':
6767
grapheme = extendLinker
@@ -194,7 +194,8 @@ def updateGraphemeBreakTable(headerFile, sourceFile):
194194
output.append(f'\t{prop.name} = {prop.value},')
195195
if prop == propNext:
196196
break
197-
output.append('\tSentinel = Prepend,')
197+
output.append('\tForwardSentinel = Prepend,')
198+
output.append('\tBackwardSentinel = Extend,')
198199
output.append('};')
199200

200201
output.append('')

scintilla/src/CharClassify.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,8 @@ enum class GraphemeBreakProperty {
4747
ConjunctLinker = 12,
4848
LinkingConsonant = 13,
4949
ExtendConjunctLinker = 14,
50-
Sentinel = Prepend,
50+
ForwardSentinel = Prepend,
51+
BackwardSentinel = Extend,
5152
};
5253

5354
constexpr int maxUnicodeGraphemeBreakCharacter = 0xe1000;

scintilla/src/Document.cxx

+27-24
Original file line numberDiff line numberDiff line change
@@ -1168,8 +1168,7 @@ bool Document::IsDBCSDualByteAt(Sci::Position pos) const noexcept {
11681168
// 4) Break after whole character, this may break combining characters
11691169

11701170
size_t Document::SafeSegment(const char *text, size_t lengthSegment, EncodingFamily encodingFamily) const noexcept {
1171-
const char * const end = text + lengthSegment;
1172-
const char *it = end;
1171+
const char *it = text + lengthSegment;
11731172
// check space first as most written languages use spaces.
11741173
do {
11751174
if (IsBreakSpace(*it)) {
@@ -1180,40 +1179,44 @@ size_t Document::SafeSegment(const char *text, size_t lengthSegment, EncodingFam
11801179

11811180
if (encodingFamily != EncodingFamily::dbcs) {
11821181
// backward iterate for UTF-8 and single byte encoding to find word and punctuation boundary.
1183-
it = end;
1182+
it = text + lengthSegment;
1183+
size_t lastPunctuationBreak = lengthSegment;
11841184
const CharacterClass ccPrev = charClass.GetClass(*it);
11851185
do {
11861186
--it;
1187-
const CharacterClass cc = charClass.GetClass(*it);
1187+
uint8_t ch = *it;
1188+
const CharacterClass cc = charClass.GetClass(ch);
11881189
if (cc != ccPrev) {
1189-
return it - text + 1;
1190+
lastPunctuationBreak = it - text + 1;
1191+
break;
11901192
}
11911193
} while (it != text);
11921194

1193-
it = end;
1194-
if (encodingFamily != EncodingFamily::eightBit && ccPrev == CharacterClass::word) {
1195+
if (ccPrev >= CharacterClass::punctuation && encodingFamily != EncodingFamily::eightBit) {
11951196
// for UTF-8 go back two code points to detect grapheme cluster boundary.
1196-
it -= 2*UTF8MaxBytes;
1197-
for (int tryCount = 0; tryCount < 2; tryCount++) {
1197+
it = text + lastPunctuationBreak;
1198+
// only search for a grapheme cluster boundary within the last longest sequence
1199+
const char * const end = it - std::min<size_t>(lastPunctuationBreak, longestUnicodeCharacterSequenceBytes + UTF8MaxBytes);
1200+
const char *prev = it;
1201+
GraphemeBreakProperty next = GraphemeBreakProperty::BackwardSentinel;
1202+
do {
11981203
// go back to the start of current character.
1199-
for (int trail = 0; trail < UTF8MaxBytes - 1 && UTF8IsTrailByte(*it); trail++) {
1204+
while (UTF8IsTrailByte(*it)) {
12001205
--it;
12011206
}
1202-
GraphemeBreakProperty prev = GraphemeBreakProperty::Sentinel;
1203-
do {
1204-
const int character = UnicodeFromUTF8(reinterpret_cast<const unsigned char *>(it));
1205-
const GraphemeBreakProperty current = CharClassify::GetGraphemeBreakProperty(character);
1206-
if (IsGraphemeClusterBoundary(prev, current)) {
1207-
return it - text;
1208-
}
1209-
prev = current;
1210-
it += UTF8BytesOfLead(static_cast<unsigned char>(*it));
1211-
} while (it < end);
1212-
// no boundary between last two code points, assume text ends with the longest sequence.
1213-
it -= longestUnicodeCharacterSequenceBytes + UTF8MaxBytes;
1214-
}
1207+
// text is valid UTF-8; invalid UTF-8 sequences are represented as isolated bytes
1208+
const int character = UnicodeFromUTF8(reinterpret_cast<const unsigned char *>(it));
1209+
const GraphemeBreakProperty current = CharClassify::GetGraphemeBreakProperty(character);
1210+
if (IsGraphemeClusterBoundary(current, next)) {
1211+
lastPunctuationBreak = prev - text;
1212+
break;
1213+
}
1214+
next = current;
1215+
prev = it;
1216+
--it;
1217+
} while (it > end);
12151218
}
1216-
return it - text;
1219+
return lastPunctuationBreak;
12171220
}
12181221

12191222
{

scintilla/src/UniConversion.h

+4
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,10 @@ constexpr bool UTF8IsTrailByte(unsigned char ch) noexcept {
5454
return (ch >= 0x80) && (ch < 0xc0);
5555
}
5656

57+
constexpr bool UTF8IsFirstByte(unsigned char ch) noexcept {
58+
return (ch >= 0xc2) && (ch <= 0xf4);
59+
}
60+
5761
constexpr bool IsASCIICharacter(unsigned int ch) noexcept {
5862
return ch < 0x80;
5963
}

0 commit comments

Comments
 (0)