@@ -1168,8 +1168,7 @@ bool Document::IsDBCSDualByteAt(Sci::Position pos) const noexcept {
1168
1168
// 4) Break after whole character, this may break combining characters
1169
1169
1170
1170
size_t Document::SafeSegment (const char *text, size_t lengthSegment, EncodingFamily encodingFamily) const noexcept {
1171
- const char * const end = text + lengthSegment;
1172
- const char *it = end;
1171
+ const char *it = text + lengthSegment;
1173
1172
// check space first as most written languages use spaces.
1174
1173
do {
1175
1174
if (IsBreakSpace (*it)) {
@@ -1180,40 +1179,44 @@ size_t Document::SafeSegment(const char *text, size_t lengthSegment, EncodingFam
1180
1179
1181
1180
if (encodingFamily != EncodingFamily::dbcs) {
1182
1181
// iterate backward for UTF-8 and single-byte encodings to find a word or punctuation boundary.
1183
- it = end;
1182
+ it = text + lengthSegment;
1183
+ size_t lastPunctuationBreak = lengthSegment;
1184
1184
const CharacterClass ccPrev = charClass.GetClass (*it);
1185
1185
do {
1186
1186
--it;
1187
- const CharacterClass cc = charClass.GetClass (*it);
1187
+ uint8_t ch = *it;
1188
+ const CharacterClass cc = charClass.GetClass (ch);
1188
1189
if (cc != ccPrev) {
1189
- return it - text + 1 ;
1190
+ lastPunctuationBreak = it - text + 1 ;
1191
+ break ;
1190
1192
}
1191
1193
} while (it != text);
1192
1194
1193
- it = end;
1194
- if (encodingFamily != EncodingFamily::eightBit && ccPrev == CharacterClass::word) {
1195
+ if (ccPrev >= CharacterClass::punctuation && encodingFamily != EncodingFamily::eightBit) {
1195
1196
// for UTF-8 go back two code points to detect grapheme cluster boundary.
1196
- it -= 2 *UTF8MaxBytes;
1197
- for (int tryCount = 0 ; tryCount < 2 ; tryCount++) {
1197
+ it = text + lastPunctuationBreak;
1198
+ // only find grapheme cluster boundary within last longest sequence
1199
+ const char * const end = it - std::min<size_t >(lastPunctuationBreak, longestUnicodeCharacterSequenceBytes + UTF8MaxBytes);
1200
+ const char *prev = it;
1201
+ GraphemeBreakProperty next = GraphemeBreakProperty::BackwardSentinel;
1202
+ do {
1198
1203
// go back to the start of current character.
1199
- for ( int trail = 0 ; trail < UTF8MaxBytes - 1 && UTF8IsTrailByte (*it); trail++ ) {
1204
+ while ( UTF8IsTrailByte (*it)) {
1200
1205
--it;
1201
1206
}
1202
- GraphemeBreakProperty prev = GraphemeBreakProperty::Sentinel;
1203
- do {
1204
- const int character = UnicodeFromUTF8 (reinterpret_cast <const unsigned char *>(it));
1205
- const GraphemeBreakProperty current = CharClassify::GetGraphemeBreakProperty (character);
1206
- if (IsGraphemeClusterBoundary (prev, current)) {
1207
- return it - text;
1208
- }
1209
- prev = current;
1210
- it += UTF8BytesOfLead (static_cast <unsigned char >(*it));
1211
- } while (it < end);
1212
- // no boundary between last two code points, assume text ends with the longest sequence.
1213
- it -= longestUnicodeCharacterSequenceBytes + UTF8MaxBytes;
1214
- }
1207
+ // text is valid UTF-8, invalid UTF-8 are represented with isolated bytes
1208
+ const int character = UnicodeFromUTF8 (reinterpret_cast <const unsigned char *>(it));
1209
+ const GraphemeBreakProperty current = CharClassify::GetGraphemeBreakProperty (character);
1210
+ if (IsGraphemeClusterBoundary (current, next)) {
1211
+ lastPunctuationBreak = prev - text;
1212
+ break ;
1213
+ }
1214
+ next = current;
1215
+ prev = it;
1216
+ --it;
1217
+ } while (it > end);
1215
1218
}
1216
- return it - text ;
1219
+ return lastPunctuationBreak ;
1217
1220
}
1218
1221
1219
1222
{
0 commit comments