Skip to content

Commit f1470d9

Browse files
committed
Avoid word wrap breaks inside a grapheme cluster,
see https://sourceforge.net/p/scintilla/feature-requests/1417/
1 parent de56e2f commit f1470d9

File tree

3 files changed

+59
-7
lines changed

3 files changed

+59
-7
lines changed

scintilla/src/Document.cxx

+37-1
Original file line numberDiff line numberDiff line change
@@ -1151,6 +1151,41 @@ bool Document::IsDBCSDualByteAt(Sci::Position pos) const noexcept {
11511151
&& IsDBCSTrailByteNoExcept(cb.UCharAt(pos + 1));
11521152
}
11531153

1154+
size_t Document::DiscardLastCombinedCharacter(const char *text, size_t lengthSegment, size_t lenBytes) noexcept {
1155+
const char *it = text + lengthSegment;
1156+
const char * const back = text + lenBytes;
1157+
// only find grapheme cluster boundary within last longest sequence
1158+
constexpr size_t longest = longestUnicodeCharacterSequenceBytes + UTF8MaxBytes;
1159+
const char * const end = (lengthSegment > longest) ? it - longest : text;
1160+
const char *prev = it;
1161+
GraphemeBreakProperty next = GraphemeBreakProperty::BackwardSentinel;
1162+
do {
1163+
// go back to the start of current character.
1164+
int trail = 1;
1165+
while (it != end && trail < UTF8MaxBytes && UTF8IsTrailByte(*it)) {
1166+
++trail;
1167+
--it;
1168+
}
1169+
// unlike SafeSegment(), text may contains invalid UTF-8
1170+
const int utf8status = UTF8Classify(it, back - it);
1171+
if (utf8status & UTF8MaskInvalid) {
1172+
// treat invalid UTF-8 as control character represented with isolated bytes
1173+
lengthSegment = prev - text;
1174+
break;
1175+
}
1176+
const int character = UnicodeFromUTF8(reinterpret_cast<const unsigned char *>(it));
1177+
const GraphemeBreakProperty current = CharClassify::GetGraphemeBreakProperty(character);
1178+
if (IsGraphemeClusterBoundary(current, next)) {
1179+
lengthSegment = prev - text;
1180+
break;
1181+
}
1182+
next = current;
1183+
prev = it;
1184+
--it;
1185+
} while (it > end);
1186+
return lengthSegment;
1187+
}
1188+
11541189
// Need to break text into segments near end but taking into account the
11551190
// encoding to not break inside a UTF-8 or DBCS character and also trying
11561191
// to avoid breaking inside a pair of combining characters, or inside
@@ -1196,7 +1231,8 @@ size_t Document::SafeSegment(const char *text, size_t lengthSegment, EncodingFam
11961231
// for UTF-8 go back two code points to detect grapheme cluster boundary.
11971232
it = text + lastPunctuationBreak;
11981233
// only find grapheme cluster boundary within last longest sequence
1199-
const char * const end = it - std::min<size_t>(lastPunctuationBreak, longestUnicodeCharacterSequenceBytes + UTF8MaxBytes);
1234+
constexpr size_t longest = longestUnicodeCharacterSequenceBytes + UTF8MaxBytes;
1235+
const char * const end = (lastPunctuationBreak > longest) ? it - longest : text;
12001236
const char *prev = it;
12011237
GraphemeBreakProperty next = GraphemeBreakProperty::BackwardSentinel;
12021238
do {

scintilla/src/Document.h

+1
Original file line numberDiff line numberDiff line change
@@ -396,6 +396,7 @@ class Document : PerLine, public Scintilla::IDocument, public Scintilla::ILoader
396396
}
397397
bool IsDBCSDualByteAt(Sci::Position pos) const noexcept;
398398
int DBCSDrawBytes(const char *text, size_t length) const noexcept;
399+
static size_t DiscardLastCombinedCharacter(const char *text, size_t lengthSegment, size_t lenBytes) noexcept;
399400
size_t SafeSegment(const char *text, size_t lengthSegment, EncodingFamily encodingFamily) const noexcept;
400401
EncodingFamily CodePageFamily() const noexcept;
401402

scintilla/src/PositionCache.cxx

+21-6
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,12 @@ constexpr WrapBreak GetWrapBreakEx(unsigned int ch, bool isUtf8) noexcept {
387387
}
388388

389389
void LineLayout::WrapLine(const Document *pdoc, Sci::Position posLineStart, Wrap wrapState, XYPOSITION wrapWidth, XYPOSITION wrapIndent_, bool partialLine) {
390+
// Document wants document positions but simpler to work in line positions
391+
// so take care of adding and subtracting line start in a lambda.
392+
auto CharacterBoundary = [=](Sci::Position i, int moveDir) noexcept -> Sci::Position {
393+
return pdoc->MovePositionOutsideChar(i + posLineStart, moveDir) - posLineStart;
394+
};
395+
// Calculate line start positions based upon width.
390396
Sci::Position lastLineStart = 0;
391397
XYPOSITION startOffset = wrapWidth;
392398
Sci::Position p = 0;
@@ -409,8 +415,9 @@ void LineLayout::WrapLine(const Document *pdoc, Sci::Position posLineStart, Wrap
409415
// backtrack to find lastGoodBreak
410416
Sci::Position lastGoodBreak = p;
411417
if (p > 0) {
412-
lastGoodBreak = pdoc->MovePositionOutsideChar(p + posLineStart, -1) - posLineStart;
418+
lastGoodBreak = CharacterBoundary(p, -1);
413419
}
420+
bool foundBreak = false;
414421
if (wrapState != Wrap::Char) {
415422
Sci::Position pos = lastGoodBreak;
416423
CharacterClass ccPrev = CharacterClass::space;
@@ -425,13 +432,15 @@ void LineLayout::WrapLine(const Document *pdoc, Sci::Position posLineStart, Wrap
425432
while (pos > lastLineStart) {
426433
// style boundary and space
427434
if (wrapState != Wrap::WhiteSpace && (styles[pos - 1] != styles[pos])) {
435+
foundBreak = true;
428436
break;
429437
}
430438
if (IsBreakSpace(chars[pos - 1]) && !IsBreakSpace(chars[pos])) {
439+
foundBreak = true;
431440
break;
432441
}
433442

434-
const Sci::Position posBefore = pdoc->MovePositionOutsideChar(pos + posLineStart - 1, -1) - posLineStart;
443+
const Sci::Position posBefore = CharacterBoundary(pos - 1, -1);
435444
if (wrapState == Wrap::Auto) {
436445
// word boundary
437446
// TODO: Unicode Line Breaking Algorithm https://www.unicode.org/reports/tr14/
@@ -466,14 +475,20 @@ void LineLayout::WrapLine(const Document *pdoc, Sci::Position posLineStart, Wrap
466475
lastGoodBreak = pos;
467476
}
468477
}
469-
if (lastGoodBreak == lastLineStart) {
478+
if (lastGoodBreak == lastLineStart || (isUtf8 && !foundBreak)) {
470479
// Try moving to start of last character
471-
if (p > 0) {
472-
lastGoodBreak = pdoc->MovePositionOutsideChar(p + posLineStart, -1) - posLineStart;
480+
if (lastGoodBreak == lastLineStart && p > 0) {
481+
lastGoodBreak = CharacterBoundary(p, -1);
482+
}
483+
if (isUtf8 && lastGoodBreak != lastLineStart) {
484+
const char *text = &chars[lastLineStart];
485+
size_t lengthSegment = lastGoodBreak - lastLineStart;
486+
lengthSegment = Document::DiscardLastCombinedCharacter(text, lengthSegment, maxLineLength - lastLineStart);
487+
lastGoodBreak = lastLineStart + lengthSegment;
473488
}
474489
if (lastGoodBreak == lastLineStart) {
475490
// Ensure at least one character on line.
476-
lastGoodBreak = pdoc->MovePositionOutsideChar(lastGoodBreak + posLineStart + 1, 1) - posLineStart;
491+
lastGoodBreak = CharacterBoundary(lastGoodBreak + 1, 1);
477492
}
478493
}
479494
lastLineStart = lastGoodBreak;

0 commit comments

Comments
 (0)