Avoid word wrap breaks inside a grapheme cluster,

zufuliu · zufuliu · commit f1470d9d297f · 2025-02-23T09:18:17.000+08:00
see https://sourceforge.net/p/scintilla/feature-requests/1417/
diff --git a/scintilla/src/Document.cxx b/scintilla/src/Document.cxx
@@ -1151,6 +1151,41 @@ bool Document::IsDBCSDualByteAt(Sci::Position pos) const noexcept {
 		&& IsDBCSTrailByteNoExcept(cb.UCharAt(pos + 1));
 }
 
+size_t Document::DiscardLastCombinedCharacter(const char *text, size_t lengthSegment, size_t lenBytes) noexcept { // Returns lengthSegment, moved back to the nearest grapheme cluster boundary found by scanning backward from text + lengthSegment; lenBytes limits how far UTF8Classify may read from text.
+	const char *it = text + lengthSegment; // scan cursor, starts at the proposed break position
+	const char * const back = text + lenBytes; // end of the readable bytes; (back - it) is the length given to UTF8Classify
+	// Only search for a grapheme cluster boundary within the last `longest` bytes before the proposed break.
+	constexpr size_t longest = longestUnicodeCharacterSequenceBytes + UTF8MaxBytes;
+	const char * const end = (lengthSegment > longest) ? it - longest : text; // lowest address the backward scan is allowed to reach
+	const char *prev = it; // start of the character examined in the previous iteration (initially the proposed break)
+	GraphemeBreakProperty next = GraphemeBreakProperty::BackwardSentinel; // property of the character that follows `it`; sentinel on the first pass
+	do {
+		// Go back to the start of the current character: skip at most UTF8MaxBytes - 1 trail bytes, never moving below end.
+		int trail = 1;
+		while (it != end && trail < UTF8MaxBytes && UTF8IsTrailByte(*it)) {
+			++trail;
+			--it;
+		}
+		// Unlike SafeSegment(), text may contain invalid UTF-8.
+		const int utf8status = UTF8Classify(it, back - it);
+		if (utf8status & UTF8MaskInvalid) {
+			// Treat invalid UTF-8 as a control character represented with isolated bytes: end the segment at prev.
+			lengthSegment = prev - text;
+			break;
+		}
+		const int character = UnicodeFromUTF8(reinterpret_cast<const unsigned char *>(it));
+		const GraphemeBreakProperty current = CharClassify::GetGraphemeBreakProperty(character);
+		if (IsGraphemeClusterBoundary(current, next)) { // boundary lies between `current` and `next`, i.e. at prev
+			lengthSegment = prev - text;
+			break;
+		}
+		next = current;
+		prev = it;
+		--it; // step onto the last byte of the preceding character
+	} while (it > end);
+	return lengthSegment; // unchanged when no boundary was found inside the search window
+}
+
 // Need to break text into segments near end but taking into account the
 // encoding to not break inside a UTF-8 or DBCS character and also trying
 // to avoid breaking inside a pair of combining characters, or inside
@@ -1196,7 +1231,8 @@ size_t Document::SafeSegment(const char *text, size_t lengthSegment, EncodingFam
 			// for UTF-8 go back two code points to detect grapheme cluster boundary.
 			it = text + lastPunctuationBreak;
 			// only find grapheme cluster boundary within last longest sequence
-			const char * const end = it - std::min<size_t>(lastPunctuationBreak, longestUnicodeCharacterSequenceBytes + UTF8MaxBytes);
+			constexpr size_t longest = longestUnicodeCharacterSequenceBytes + UTF8MaxBytes;
+			const char * const end = (lastPunctuationBreak > longest) ? it - longest : text;
 			const char *prev = it;
 			GraphemeBreakProperty next = GraphemeBreakProperty::BackwardSentinel;
 			do {
diff --git a/scintilla/src/Document.h b/scintilla/src/Document.h
@@ -396,6 +396,7 @@ class Document : PerLine, public Scintilla::IDocument, public Scintilla::ILoader
 	}
 	bool IsDBCSDualByteAt(Sci::Position pos) const noexcept;
 	int DBCSDrawBytes(const char *text, size_t length) const noexcept;
+	static size_t DiscardLastCombinedCharacter(const char *text, size_t lengthSegment, size_t lenBytes) noexcept;
 	size_t SafeSegment(const char *text, size_t lengthSegment, EncodingFamily encodingFamily) const noexcept;
 	EncodingFamily CodePageFamily() const noexcept;
 
diff --git a/scintilla/src/PositionCache.cxx b/scintilla/src/PositionCache.cxx
@@ -387,6 +387,12 @@ constexpr WrapBreak GetWrapBreakEx(unsigned int ch, bool isUtf8) noexcept {
 }
 
 void LineLayout::WrapLine(const Document *pdoc, Sci::Position posLineStart, Wrap wrapState, XYPOSITION wrapWidth, XYPOSITION wrapIndent_, bool partialLine) {
+	// Document wants document positions but simpler to work in line positions
+	// so take care of adding and subtracting line start in a lambda.
+	auto CharacterBoundary = [=](Sci::Position i, int moveDir) noexcept -> Sci::Position {
+		return pdoc->MovePositionOutsideChar(i + posLineStart, moveDir) - posLineStart;
+	};
+	// Calculate line start positions based upon width.
 	Sci::Position lastLineStart = 0;
 	XYPOSITION startOffset = wrapWidth;
 	Sci::Position p = 0;
@@ -409,8 +415,9 @@ void LineLayout::WrapLine(const Document *pdoc, Sci::Position posLineStart, Wrap
 			// backtrack to find lastGoodBreak
 			Sci::Position lastGoodBreak = p;
 			if (p > 0) {
-				lastGoodBreak = pdoc->MovePositionOutsideChar(p + posLineStart, -1) - posLineStart;
+				lastGoodBreak = CharacterBoundary(p, -1);
 			}
+			bool foundBreak = false;
 			if (wrapState != Wrap::Char) {
 				Sci::Position pos = lastGoodBreak;
 				CharacterClass ccPrev = CharacterClass::space;
@@ -425,13 +432,15 @@ void LineLayout::WrapLine(const Document *pdoc, Sci::Position posLineStart, Wrap
 				while (pos > lastLineStart) {
 					// style boundary and space
 					if (wrapState != Wrap::WhiteSpace && (styles[pos - 1] != styles[pos])) {
+						foundBreak = true;
 						break;
 					}
 					if (IsBreakSpace(chars[pos - 1]) && !IsBreakSpace(chars[pos])) {
+						foundBreak = true;
 						break;
 					}
 
-					const Sci::Position posBefore = pdoc->MovePositionOutsideChar(pos + posLineStart - 1, -1) - posLineStart;
+					const Sci::Position posBefore = CharacterBoundary(pos - 1, -1);
 					if (wrapState == Wrap::Auto) {
 						// word boundary
 						// TODO: Unicode Line Breaking Algorithm https://www.unicode.org/reports/tr14/
@@ -466,14 +475,20 @@ void LineLayout::WrapLine(const Document *pdoc, Sci::Position posLineStart, Wrap
 					lastGoodBreak = pos;
 				}
 			}
-			if (lastGoodBreak == lastLineStart) {
+			if (lastGoodBreak == lastLineStart || (isUtf8 && !foundBreak)) {
 				// Try moving to start of last character
-				if (p > 0) {
-					lastGoodBreak = pdoc->MovePositionOutsideChar(p + posLineStart, -1) - posLineStart;
+				if (lastGoodBreak == lastLineStart && p > 0) {
+					lastGoodBreak = CharacterBoundary(p, -1);
+				}
+				if (isUtf8 && lastGoodBreak != lastLineStart) {
+					const char *text = &chars[lastLineStart];
+					size_t lengthSegment = lastGoodBreak - lastLineStart;
+					lengthSegment = Document::DiscardLastCombinedCharacter(text, lengthSegment, maxLineLength - lastLineStart);
+					lastGoodBreak = lastLineStart + lengthSegment;
 				}
 				if (lastGoodBreak == lastLineStart) {
 					// Ensure at least one character on line.
-					lastGoodBreak = pdoc->MovePositionOutsideChar(lastGoodBreak + posLineStart + 1, 1) - posLineStart;
+					lastGoodBreak = CharacterBoundary(lastGoodBreak + 1, 1);
 				}
 			}
 			lastLineStart = lastGoodBreak;

Original file line number	Diff line number	Diff line change
`@@ -396,6 +396,7 @@ class Document : PerLine, public Scintilla::IDocument, public Scintilla::ILoader`
`396`	`396`	`}`
`397`	`397`	`bool IsDBCSDualByteAt(Sci::Position pos) const noexcept;`
`398`	`398`	`int DBCSDrawBytes(const char *text, size_t length) const noexcept;`
	`399`	`+ static size_t DiscardLastCombinedCharacter(const char *text, size_t lengthSegment, size_t lenBytes) noexcept;`
`399`	`400`	`size_t SafeSegment(const char *text, size_t lengthSegment, EncodingFamily encodingFamily) const noexcept;`
`400`	`401`	`EncodingFamily CodePageFamily() const noexcept;`
`401`	`402`