Change GetLaTeXInputUnicodeCharacter() to return UTF-16 characters directly to avoid extra UTF-32 to UTF-16 conversion, issue #289.
zufuliu · Feb 28, 2021 · 32747ce · 32747ce
1 parent 61fa668
commit 32747ce
Show file tree

Hide file tree

Showing 6 changed files with 1,720 additions and 1,731 deletions.
diff --git a/scintilla/include/LaTeXInput.h b/scintilla/include/LaTeXInput.h
@@ -10,7 +10,7 @@ extern "C" {
 #define EnableLaTeXLikeEmojiInput	1
 
 //++Autogenerated -- start of section automatically generated
-// input sequences based on Julia version 1.7.0-DEV.623 (Saturday 27 February 2021),
+// input sequences based on Julia version 1.7.0-DEV.625 (Saturday 27 February 2021),
 // documented at https://docs.julialang.org/en/v1/manual/unicode-input/
 
 enum {
@@ -53,7 +53,7 @@ extern const char * const kAllEmojiInputSequences;
 #endif
 
 /*!
- * @brief Get Unicode characters for LaTeX or Emoji input sequence.
+ * @brief Get Unicode UTF-16 characters for LaTeX or Emoji input sequence.
  * example: \sum to U+2211 ∑, \:laughing: to U+1F606 😆 and \gvertneqq to U+2269 + U+FE00 ≩︀.
  * @param sequence The input sequence without the prefix '\', sequence[0] == ':' indicates Emoji.
  * @param length Length for the input sequence without the prefix '\'.

diff --git a/scintilla/scripts/LaTeXInput.py b/scintilla/scripts/LaTeXInput.py
@@ -42,7 +42,6 @@ def find_word_contains_punctuation(items):
 	result.sort()
 	return result
 
-
 def json_dump(obj):
 	return json.dumps(obj, ensure_ascii=False, indent='\t')
 
@@ -163,14 +162,20 @@ def update_latex_input_data(input_name, input_map, max_hash_size):
 	if input_name == 'Emoji':
 		prefix = '\\:'
 		suffix = ':'
+	# see https://www.unicode.org/faq/utf_bom.html
+	LEAD_OFFSET = 0xD800 - (0x10000 >> 10)
 	for info in input_list:
 		character = info['character']
 		if len(character) == 1:
 			ch = ord(character)
 			if ch <= 0xffff:
 				code = '0x%04X' % ch
 			else:
-				code = '0x%X' % ch
+				character = ('U+%X, ' % ch) + character
+				# convert to UTF-16
+				lead = LEAD_OFFSET + (ch >> 10)
+				trail = 0xDC00 + (ch & 0x3FF)
+				code = "0x%04X'%04X" % (trail, lead)
 		else:
 			code = "0x%04X'%04X" % (ord(character[1]), ord(character[0]))
 		magic = info['magic']

diff --git a/scintilla/src/UniConversion.h b/scintilla/src/UniConversion.h
@@ -108,7 +108,6 @@ enum {
 	SURROGATE_TRAIL_FIRST = 0xDC00,
 	SURROGATE_TRAIL_LAST = 0xDFFF,
 	SUPPLEMENTAL_PLANE_FIRST = 0x10000,
-	MAX_UNICODE = 0x10ffff,
 };
 
 constexpr unsigned int UTF16CharLength(wchar_t uch) noexcept {
@@ -129,19 +128,4 @@ inline unsigned int UTF16FromUTF32Character(unsigned int val, wchar_t *tbuf) noe
 	return 2;
 }
 
-inline unsigned int UTF16FromLaTeXInputCharacter(unsigned int val, wchar_t *tbuf) noexcept {
-	if (val < SUPPLEMENTAL_PLANE_FIRST) {
-		tbuf[0] = static_cast<wchar_t>(val);
-		return 1;
-	}
-	if (val <= MAX_UNICODE) {
-		tbuf[0] = static_cast<wchar_t>(((val - SUPPLEMENTAL_PLANE_FIRST) >> 10) + SURROGATE_LEAD_FIRST);
-		tbuf[1] = static_cast<wchar_t>((val & 0x3ff) + SURROGATE_TRAIL_FIRST);
-	} else {
-		tbuf[0] = val & 0xffff;
-		tbuf[1] = val >> 16;
-	}
-	return 2;
-}
-
 }
diff --git a/scintilla/win32/LaTeXInput.cxx b/scintilla/win32/LaTeXInput.cxx
@@ -19,7 +19,7 @@ struct InputSequence {
 
 template <typename T, uint32_t N>
 constexpr uint32_t array_size([[maybe_unused]] const T (&a)[N]) noexcept {
-	return N ;
+	return N;
 }
 
 }