From 6a489c7e89085f8c953478a813e9f8bab9f8cc1b Mon Sep 17 00:00:00 2001 From: Henri Sivonen Date: Thu, 19 Sep 2024 20:13:22 +0300 Subject: [PATCH] Make the GBK-range GB18030-2022 changes work --- src/gb18030.rs | 16 ++++++++++---- src/gb18030_2022.rs | 54 +++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 1 + 3 files changed, 67 insertions(+), 4 deletions(-) create mode 100644 src/gb18030_2022.rs diff --git a/src/gb18030.rs b/src/gb18030.rs index a0b3bd7f..515d0e80 100644 --- a/src/gb18030.rs +++ b/src/gb18030.rs @@ -9,6 +9,7 @@ use super::*; use crate::data::*; +use crate::gb18030_2022::*; use crate::handles::*; use crate::variant::*; // Rust 1.14.0 requires the following despite the asterisk above. @@ -347,8 +348,15 @@ fn gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)> { } return None; } - if bmp >= 0xE794 { - // Various brackets, all in PUA or full-width regions + + if in_inclusive_range16(bmp, 0xE78D, 0xE864) { + // The array is sorted but short, so let's do linear search. 
+ if let Some(pos) = position(&GB18030_2022_OVERRIDE_PUA[..], bmp) { + let pair = &GB18030_2022_OVERRIDE_BYTES[pos]; + return Some((pair[0].into(), pair[1].into())); + } + } else if bmp >= 0xFE17 { + // Various brackets, all in full-width regions if let Some(pos) = position(&GB2312_SYMBOLS_AFTER_GREEK[..], bmp) { return Some((0xA6, pos + (0x9F - 0x60 + 0xA1))); } @@ -380,8 +388,8 @@ fn gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)> { let offset = if other_trail < 0x3F { 0x40 } else { 0x41 }; return Some((other_lead + (0x81 + 0x20), other_trail + offset)); } - // CJK Radicals Supplement or PUA in GBK_BOTTOM - if in_inclusive_range16(bmp, 0x2E81, 0x2ECA) || in_inclusive_range16(bmp, 0xE816, 0xE864) { + // CJK Radicals Supplement and U+9FBx ideographs in GBK_BOTTOM + if in_inclusive_range16(bmp, 0x2E81, 0x2ECA) || in_inclusive_range16(bmp, 0x9FB4, 0x9FBB) { if let Some(pos) = position(&GBK_BOTTOM[21..], bmp) { let trail = pos + 16; let offset = if trail < 0x3F { 0x40 } else { 0x41 }; diff --git a/src/gb18030_2022.rs b/src/gb18030_2022.rs new file mode 100644 index 00000000..3163f56a --- /dev/null +++ b/src/gb18030_2022.rs @@ -0,0 +1,54 @@ +// Copyright © WHATWG (Apple, Google, Mozilla, Microsoft). +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +/// Sorted PUA code points that the GB18030 encoder special-cases; keys for the byte table below. +pub(crate) static GB18030_2022_OVERRIDE_PUA: [u16; 18] = [ + 0xE78D, 0xE78E, 0xE78F, 0xE790, 0xE791, 0xE792, 0xE793, 0xE794, 0xE795, 0xE796, 0xE81E, 0xE826, + 0xE82B, 0xE82C, 0xE832, 0xE843, 0xE854, 0xE864, +]; + +/// Two-byte GB18030 sequence for the code point at the same index in `GB18030_2022_OVERRIDE_PUA`. +pub(crate) static GB18030_2022_OVERRIDE_BYTES: [[u8; 2]; 18] = [ + [0xA6, 0xD9], + [0xA6, 0xDA], + [0xA6, 0xDB], + [0xA6, 0xDC], + [0xA6, 0xDD], + [0xA6, 0xDE], + [0xA6, 0xDF], + [0xA6, 0xEC], + [0xA6, 0xED], + [0xA6, 0xF3], + [0xFE, 0x59], + [0xFE, 0x61], + [0xFE, 0x66], + [0xFE, 0x67], + [0xFE, 0x6D], + [0xFE, 0x7E], + [0xFE, 0x90], + [0xFE, 0xA0], +]; diff --git a/src/lib.rs b/src/lib.rs index b708b704..d4b92d61 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -761,6 +761,7 @@ mod big5; mod euc_jp; mod euc_kr; mod gb18030; +mod gb18030_2022; mod iso_2022_jp; mod replacement; mod shift_jis;