From e60a65ae458d3e9b2a5c504bc08d710485e5afa0 Mon Sep 17 00:00:00 2001 From: Henri Sivonen Date: Thu, 19 Sep 2024 21:48:45 +0300 Subject: [PATCH] Update docs for GBK and gb18030 --- doc/GBK.txt | 7 ++++--- doc/gb18030.txt | 9 +++++---- src/lib.rs | 16 +++++++++------- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/doc/GBK.txt b/doc/GBK.txt index 2faefff4..931156f4 100644 --- a/doc/GBK.txt +++ b/doc/GBK.txt @@ -1,8 +1,9 @@ /// The decoder for this encoding is the same as the decoder for gb18030. /// The encoder side of this encoding is GBK with Windows code page 936 euro -/// sign behavior. GBK extends GB2312-80 to cover the CJK Unified Ideographs -/// Unicode block as well as a handful of ideographs from the CJK Unified -/// Ideographs Extension A and CJK Compatibility Ideographs blocks. +/// sign behavior and with the changes to two-byte sequences made in GB18030-2022. +/// GBK extends GB2312-80 to cover the CJK Unified Ideographs Unicode block as +/// well as a handful of ideographs from the CJK Unified Ideographs Extension A +/// and CJK Compatibility Ideographs blocks. /// /// Unlike e.g. in the case of ISO-8859-1 and windows-1252, GBK encoder wasn't /// unified with the gb18030 encoder in the Encoding Standard out of concern diff --git a/doc/gb18030.txt b/doc/gb18030.txt index 572a593d..32e97092 100644 --- a/doc/gb18030.txt +++ b/doc/gb18030.txt @@ -1,7 +1,8 @@ -/// This encoding matches GB18030-2005 except the two-byte sequence 0xA3 0xA0 -/// maps to U+3000 for compatibility with existing Web content. As a result, -/// this encoding can represent all of Unicode except for the private-use -/// character U+E5E5. +/// This encoding matches GB18030-2022 except the two-byte sequence 0xA3 0xA0 +/// maps to U+3000 for compatibility with existing Web content and the four-byte +/// sequences for the non-PUA characters that got two-byte sequences still decode +/// to the same non-PUA characters as in GB18030-2005. As a result, this encoding +/// can represent all of Unicode except for 19 private-use characters. /// /// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html), /// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html) diff --git a/src/lib.rs b/src/lib.rs index d4b92d61..5c5313af 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -946,9 +946,10 @@ pub static GBK_INIT: Encoding = Encoding { /// /// The decoder for this encoding is the same as the decoder for gb18030. /// The encoder side of this encoding is GBK with Windows code page 936 euro -/// sign behavior. GBK extends GB2312-80 to cover the CJK Unified Ideographs -/// Unicode block as well as a handful of ideographs from the CJK Unified -/// Ideographs Extension A and CJK Compatibility Ideographs blocks. +/// sign behavior and with the changes to two-byte sequences made in GB18030-2022. +/// GBK extends GB2312-80 to cover the CJK Unified Ideographs Unicode block as +/// well as a handful of ideographs from the CJK Unified Ideographs Extension A +/// and CJK Compatibility Ideographs blocks. /// /// Unlike e.g. in the case of ISO-8859-1 and windows-1252, GBK encoder wasn't /// unified with the gb18030 encoder in the Encoding Standard out of concern @@ -1690,10 +1691,11 @@ pub static GB18030_INIT: Encoding = Encoding { /// The gb18030 encoding. /// -/// This encoding matches GB18030-2005 except the two-byte sequence 0xA3 0xA0 -/// maps to U+3000 for compatibility with existing Web content. As a result, -/// this encoding can represent all of Unicode except for the private-use -/// character U+E5E5. +/// This encoding matches GB18030-2022 except the two-byte sequence 0xA3 0xA0 +/// maps to U+3000 for compatibility with existing Web content and the four-byte +/// sequences for the non-PUA characters that got two-byte sequences still decode +/// to the same non-PUA characters as in GB18030-2005. As a result, this encoding +/// can represent all of Unicode except for 19 private-use characters. /// /// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html), /// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)