Skip to content

Commit

Permalink
Make the GBK-range GB18030-2022 changes work
Browse files Browse the repository at this point in the history
  • Loading branch information
hsivonen committed Oct 24, 2024
1 parent 2f1b14f commit 6a489c7
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 4 deletions.
16 changes: 12 additions & 4 deletions src/gb18030.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

use super::*;
use crate::data::*;
use crate::gb18030_2022::*;
use crate::handles::*;
use crate::variant::*;
// Rust 1.14.0 requires the following despite the asterisk above.
Expand Down Expand Up @@ -347,8 +348,15 @@ fn gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)> {
}
return None;
}
if bmp >= 0xE794 {
// Various brackets, all in PUA or full-width regions

if in_inclusive_range16(bmp, 0xE78D, 0xE864) {
// The array is sorted but short, so let's do linear search.
if let Some(pos) = position(&GB18030_2022_OVERRIDE_PUA[..], bmp) {
let pair = &GB18030_2022_OVERRIDE_BYTES[pos];
return Some((pair[0].into(), pair[1].into()));
}
} else if bmp >= 0xFE17 {
// Various brackets, all in full-width regions
if let Some(pos) = position(&GB2312_SYMBOLS_AFTER_GREEK[..], bmp) {
return Some((0xA6, pos + (0x9F - 0x60 + 0xA1)));
}
Expand Down Expand Up @@ -380,8 +388,8 @@ fn gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)> {
let offset = if other_trail < 0x3F { 0x40 } else { 0x41 };
return Some((other_lead + (0x81 + 0x20), other_trail + offset));
}
// CJK Radicals Supplement or PUA in GBK_BOTTOM
if in_inclusive_range16(bmp, 0x2E81, 0x2ECA) || in_inclusive_range16(bmp, 0xE816, 0xE864) {
// CJK Radicals Supplement and U+9FBx ideographs in GBK_BOTTOM
if in_inclusive_range16(bmp, 0x2E81, 0x2ECA) || in_inclusive_range16(bmp, 0x9FB4, 0x9FBB) {
if let Some(pos) = position(&GBK_BOTTOM[21..], bmp) {
let trail = pos + 16;
let offset = if trail < 0x3F { 0x40 } else { 0x41 };
Expand Down
54 changes: 54 additions & 0 deletions src/gb18030_2022.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
// Copyright © WHATWG (Apple, Google, Mozilla, Microsoft).
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice, this
// list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

/// The PUA code points special-cased in the GB18030 encoder.
///
/// Entries are listed in ascending order and line up index-for-index with
/// `GB18030_2022_OVERRIDE_BYTES`: the encoder finds the position of a code
/// point in this array and uses that position to pick the two-byte sequence
/// from the other array. The two arrays must therefore keep the same length
/// and the same ordering.
pub(crate) static GB18030_2022_OVERRIDE_PUA: [u16; 18] = [
0xE78D, 0xE78E, 0xE78F, 0xE790, 0xE791, 0xE792, 0xE793, 0xE794, 0xE795, 0xE796, 0xE81E, 0xE826,
0xE82B, 0xE82C, 0xE832, 0xE843, 0xE854, 0xE864,
];

/// The bytes corresponding to the PUA code points special-cased in the GB18030 encoder.
///
/// Each entry is the two-byte sequence (first byte, then second byte) emitted
/// for the code point stored at the same index in `GB18030_2022_OVERRIDE_PUA`.
/// The per-row comments below name that paired code point; keep both arrays
/// in the same order when editing either one.
pub(crate) static GB18030_2022_OVERRIDE_BYTES: [[u8; 2]; 18] = [
[0xA6, 0xD9], // U+E78D
[0xA6, 0xDA], // U+E78E
[0xA6, 0xDB], // U+E78F
[0xA6, 0xDC], // U+E790
[0xA6, 0xDD], // U+E791
[0xA6, 0xDE], // U+E792
[0xA6, 0xDF], // U+E793
[0xA6, 0xEC], // U+E794
[0xA6, 0xED], // U+E795
[0xA6, 0xF3], // U+E796
[0xFE, 0x59], // U+E81E
[0xFE, 0x61], // U+E826
[0xFE, 0x66], // U+E82B
[0xFE, 0x67], // U+E82C
[0xFE, 0x6D], // U+E832
[0xFE, 0x7E], // U+E843
[0xFE, 0x90], // U+E854
[0xFE, 0xA0], // U+E864
];
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -761,6 +761,7 @@ mod big5;
mod euc_jp;
mod euc_kr;
mod gb18030;
mod gb18030_2022;
mod iso_2022_jp;
mod replacement;
mod shift_jis;
Expand Down

0 comments on commit 6a489c7

Please sign in to comment.