Fix GetCharacterAndWidth() and CharacterAfter() bugs for DBCS code pages.

See https://sourceforge.net/p/scintilla/feature-requests/1408/
zufuliu · Jun 23, 2021 · 3e7741f · 3e7741f
1 parent 9164ccd
commit 3e7741f
Show file tree

Hide file tree

Showing 3 changed files with 37 additions and 48 deletions.
diff --git a/scintilla/lexers/LexBatch.cxx b/scintilla/lexers/LexBatch.cxx
@@ -338,6 +338,7 @@ void ColouriseBatchDoc(Sci_PositionU startPos, Sci_Position length, int initStyl
 					varQuoteChar = '\0';
 					sc.ChangeState(outerStyle);
 					sc.Rewind();
+					sc.Forward();
 				}
 			}
 			if (varQuoteChar == '\0') {

diff --git a/scintilla/lexlib/StyleContext.h b/scintilla/lexlib/StyleContext.h
@@ -49,15 +49,14 @@ class StyleContext final {
 	int chPrev;
 	int ch;
 	int chNext;
-	Sci_Position width;
-	Sci_Position widthNext;
+	Sci_Position width = 1;
+	Sci_Position widthNext = 1;
 
 	StyleContext(Sci_PositionU startPos, Sci_PositionU length,
 		int initStyle, LexAccessor &styler_) noexcept :
 	styler(styler_),
 	endPos(startPos + length),
 	lengthDocument(styler.Length()),
-	currentPos(startPos),
 	multiByteAccess(styler.Encoding() == EncodingType::dbcs),
 	state(initStyle) {
 		styler.StartAt(startPos);
@@ -70,18 +69,7 @@ class StyleContext final {
 		}
 		lineDocEnd = styler.GetLine(lengthDocument);
 		atLineStart = static_cast<Sci_PositionU>(styler.LineStart(currentLine)) == startPos;
-
-		chPrev = 0;
-		width = 1;
-		widthNext = 1;
-		if (!multiByteAccess) {
-			ch = static_cast<unsigned char>(styler[startPos]);
-		} else {
-			ch =  styler.GetCharacterAndWidth(startPos, &widthNext);
-			width = widthNext;
-		}
-
-		GetNextChar();
+		SeekTo(startPos);
 	}
 	// Deleted so StyleContext objects can not be copied.
 	StyleContext(const StyleContext &) = delete;
@@ -234,16 +222,20 @@ class StyleContext final {
 		styler.GetRangeLowered(styler.GetStartSegment(), currentPos, s, len);
 	}
 
-	void Rewind() noexcept {
-		currentPos = styler.GetStartSegment();
+	void SeekTo(Sci_PositionU startPos) noexcept {
+		currentPos = startPos;
 		chPrev = 0;
 		if (!multiByteAccess) {
-			ch = static_cast<unsigned char>(styler[currentPos]);
+			ch = static_cast<unsigned char>(styler[startPos]);
 		} else {
-			ch =  styler.GetCharacterAndWidth(currentPos, &widthNext);
+			ch =  styler.GetCharacterAndWidth(startPos, &widthNext);
 			width = widthNext;
 		}
-		Forward();
+		GetNextChar();
+	}
+
+	void Rewind() noexcept {
+		SeekTo(styler.GetStartSegment());
 	}
 
 	bool LineEndsWith(char ch0) const noexcept {

diff --git a/scintilla/src/Document.cxx b/scintilla/src/Document.cxx
@@ -896,7 +896,7 @@ Document::CharacterExtracted Document::CharacterAfter(Sci::Position position) co
 		return CharacterExtracted(unicodeReplacementChar, 0);
 	}
 	const unsigned char leadByte = cb.UCharAt(position);
-	if (!dbcsCodePage || UTF8IsAscii(leadByte)) {
+	if (UTF8IsAscii(leadByte) || !dbcsCodePage) {
 		// Common case: ASCII character
 		return CharacterExtracted(leadByte, 1);
 	}
@@ -915,10 +915,12 @@ Document::CharacterExtracted Document::CharacterAfter(Sci::Position position) co
 		}
 	} else {
 		if (IsDBCSLeadByteNoExcept(leadByte) && ((position + 1) < Length())) {
-			return CharacterExtracted::DBCS(leadByte, cb.UCharAt(position + 1));
-		} else {
-			return CharacterExtracted(leadByte, 1);
+			const unsigned char trailByte = cb.UCharAt(position + 1);
+			if (!IsDBCSTrailByteInvalid(trailByte)) {
+				return CharacterExtracted::DBCS(leadByte, trailByte);
+			}
 		}
+		return CharacterExtracted(leadByte, 1);
 	}
 }
 
@@ -1006,39 +1008,33 @@ Sci::Position Document::GetRelativePositionUTF16(Sci::Position positionStart, Sc
 }
 
 int SCI_METHOD Document::GetCharacterAndWidth(Sci_Position position, Sci_Position *pWidth) const noexcept {
-	int character;
 	int bytesInCharacter = 1;
 	const unsigned char leadByte = cb.UCharAt(position);
-	if (dbcsCodePage) {
+	int character = leadByte;
+	if (!UTF8IsAscii(leadByte) && dbcsCodePage) {
 		if (CpUtf8 == dbcsCodePage) {
-			if (UTF8IsAscii(leadByte)) {
-				// Single byte character or invalid
-				character = leadByte;
+			const int widthCharBytes = UTF8BytesOfLead(leadByte);
+			unsigned char charBytes[UTF8MaxBytes] = { leadByte, 0, 0, 0 };
+			for (int b = 1; b < widthCharBytes; b++) {
+				charBytes[b] = cb.UCharAt(position + b);
+			}
+			const int utf8status = UTF8ClassifyMulti(charBytes, widthCharBytes);
+			if (utf8status & UTF8MaskInvalid) {
+				// Report as singleton surrogate values which are invalid Unicode
+				character = 0xDC80 + character;
 			} else {
-				const int widthCharBytes = UTF8BytesOfLead(leadByte);
-				unsigned char charBytes[UTF8MaxBytes] = { leadByte, 0, 0, 0 };
-				for (int b = 1; b < widthCharBytes; b++) {
-					charBytes[b] = cb.UCharAt(position + b);
-				}
-				const int utf8status = UTF8ClassifyMulti(charBytes, widthCharBytes);
-				if (utf8status & UTF8MaskInvalid) {
-					// Report as singleton surrogate values which are invalid Unicode
-					character = 0xDC80 + leadByte;
-				} else {
-					bytesInCharacter = utf8status & UTF8MaskWidth;
-					character = UnicodeFromUTF8(charBytes);
-				}
+				bytesInCharacter = utf8status & UTF8MaskWidth;
+				character = UnicodeFromUTF8(charBytes);
 			}
 		} else {
 			if (IsDBCSLeadByteNoExcept(leadByte)) {
-				bytesInCharacter = 2;
-				character = (leadByte << 8) | cb.UCharAt(position + 1);
-			} else {
-				character = leadByte;
+				const unsigned char trailByte = cb.UCharAt(position + 1);
+				if (!IsDBCSTrailByteInvalid(trailByte)) {
+					bytesInCharacter = 2;
+					character = (character << 8) | trailByte;
+				}
 			}
 		}
-	} else {
-		character = leadByte;
 	}
 	if (pWidth) {
 		*pWidth = bytesInCharacter;