Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Assign width 1 to control characters #45

Merged
merged 1 commit into from
May 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ use unicode_width::UnicodeWidthStr;

fn main() {
let teststr = "Hello, world!";
let width = UnicodeWidthStr::width(teststr);
let width = teststr.width();
println!("{}", teststr);
println!("The above string is {} columns wide.", width);
let width = teststr.width_cjk();
Expand All @@ -34,9 +34,9 @@ extern crate unicode_width;
use unicode_width::UnicodeWidthStr;

fn main() {
assert_eq!(UnicodeWidthStr::width("👩"), 2); // Woman
assert_eq!(UnicodeWidthStr::width("🔬"), 2); // Microscope
assert_eq!(UnicodeWidthStr::width("👩‍🔬"), 4); // Woman scientist
assert_eq!("👩".width(), 2); // Woman
assert_eq!("🔬".width(), 2); // Microscope
assert_eq!("👩‍🔬".width(), 4); // Woman scientist
}
```

Expand Down
41 changes: 2 additions & 39 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,21 +165,14 @@ def load_zero_widths() -> "list[bool]":
"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
character. `c` is considered a zero-width character if

- it is a control character,
- or if it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
- it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
- or if it has the `Grapheme_Extend` property (determined from `DerivedCoreProperties.txt`),
- or if it is one of eight characters that should be `Grapheme_Extend` but aren't due to a Unicode spec bug,
- or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
"""

zw_map = [False] * NUM_CODEPOINTS

# Control characters have width 0
for c in range(0x00, 0x20):
zw_map[c] = True
for c in range(0x7F, 0xA0):
zw_map[c] = True

# `Default_Ignorable_Code_Point`s also have 0 width:
# https://www.unicode.org/faq/unsup_char.html#3
# https://www.unicode.org/versions/Unicode15.1.0/ch05.pdf#G40095
Expand Down Expand Up @@ -563,7 +556,7 @@ def emit_module(
/// However, if you change the *actual structure* of the lookup tables (perhaps by editing the
/// `TABLE_CFGS` global in `unicode.py`) you must ensure that this code reflects those changes.
#[inline]
fn lookup_width(c: char, is_cjk: bool) -> usize {
pub fn lookup_width(c: char, is_cjk: bool) -> usize {
let cp = c as usize;

let t1_offset = TABLES_0[cp >> 13 & 0xFF];
Expand Down Expand Up @@ -664,36 +657,6 @@ def emit_module(
"""
)

module.write(
"""
/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
/// `None` if `c` is a control character other than `'\\x00'`.
/// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
/// they're treated as single width.
#[inline]
pub fn width(c: char, is_cjk: bool) -> Option<usize> {
if c < '\\u{7F}' {
if c >= '\\u{20}' {
// U+0020 to U+007F (exclusive) are single-width ASCII codepoints
Some(1)
} else if c == '\\0' {
// U+0000 *is* a control code, but it's special-cased
Some(0)
} else {
// U+0001 to U+0020 (exclusive) are control codes
None
}
} else if c >= '\\u{A0}' {
// No characters >= U+00A0 are control codes, so we can consult the lookup tables
Some(lookup_width(c, is_cjk))
} else {
// U+007F to U+00A0 (exclusive) are control codes
None
}
}
"""
)

subtable_count = 1
for i, table in enumerate(tables):
new_subtable_count = len(table.buckets())
Expand Down
105 changes: 68 additions & 37 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,13 @@
//!
//! 1. [Emoji presentation sequences] have width 2.
//! (The width of a string may therefore differ from the sum of the widths of its characters.)
//! 2. Outside of an East Asian context, [text presentation sequences] have width 1
//! iff their base character fulfills all the following requirements:
//! 2. Outside of an East Asian context, [text presentation sequences] fulfilling all the following requirements
//! have width 1:
//! - Has the [`Emoji_Presentation`] property, and
//! - Not in the [Enclosed Ideographic Supplement] block.
//! 3. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
//! 4. The following have width 0:
//! 3. The sequence `"\r\n"` has width 1.
//! 4. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
//! 5. The following have width 0:
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
//! with the [`Default_Ignorable_Code_Point`](https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095) property.
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
Expand All @@ -55,9 +56,6 @@
//! - [`'\u{1B43}'` BALINESE VOWEL SIGN PEPET TEDUNG](https://util.unicode.org/UnicodeJsps/character.jsp?a=1B43).
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BHangul_Syllable_Type%3DV%7D%5Cp%7BHangul_Syllable_Type%3DT%7D)
//! with a [`Hangul_Syllable_Type`] of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`).
//! - [`'\0'` NUL](https://util.unicode.org/UnicodeJsps/character.jsp?a=0000).
//! 5. The [control characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BCc%7D)
//! have no defined width, and are ignored when determining the width of a string.
//! 6. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
//! 7. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DA%7D)
Expand Down Expand Up @@ -99,7 +97,7 @@ mod tables;
/// Methods for determining displayed width of Unicode characters.
pub trait UnicodeWidthChar {
/// Returns the character's displayed width in columns, or `None` if the
/// character is a control character other than `'\x00'`.
/// character is a control character.
///
/// This function treats characters in the Ambiguous category according
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
Expand All @@ -108,7 +106,7 @@ pub trait UnicodeWidthChar {
fn width(self) -> Option<usize>;

/// Returns the character's displayed width in columns, or `None` if the
/// character is a control character other than `'\x00'`.
/// character is a control character.
///
/// This function treats characters in the Ambiguous category according
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
Expand All @@ -120,23 +118,42 @@ pub trait UnicodeWidthChar {
impl UnicodeWidthChar for char {
#[inline]
fn width(self) -> Option<usize> {
cw::width(self, false)
single_char_width(self, false)
}

#[inline]
fn width_cjk(self) -> Option<usize> {
cw::width(self, true)
single_char_width(self, true)
}
}

/// Computes the [UAX #11](https://www.unicode.org/reports/tr11/) based width of a lone
/// character `c`.
///
/// Yields `None` when `c` is a control character, i.e. anything in U+0000..=U+001F or
/// U+007F..=U+009F (NUL included). Printable ASCII is always one column wide; every
/// character from U+00A0 upwards is resolved through the lookup tables.
///
/// When `is_cjk` is `true`, ambiguous width characters count as double width; otherwise
/// they count as single width.
#[inline]
fn single_char_width(c: char, is_cjk: bool) -> Option<usize> {
    match c {
        // U+0020 through U+007E: single-width ASCII codepoints.
        '\u{20}'..='\u{7E}' => Some(1),
        // U+0000 through U+001F and U+007F through U+009F are control codes: no width.
        '\0'..='\u{1F}' | '\u{7F}'..='\u{9F}' => None,
        // Nothing at or above U+00A0 is a control code, so the tables can be consulted.
        _ => Some(cw::lookup_width(c, is_cjk)),
    }
}

/// Methods for determining displayed width of Unicode strings.
pub trait UnicodeWidthStr {
/// Returns the string's displayed width in columns.
///
/// Control characters are treated as having zero width,
/// and [emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
/// are assigned width 2.
///
/// This function treats characters in the Ambiguous category according
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
/// as 1 column wide. This is consistent with the recommendations for
Expand All @@ -145,10 +162,6 @@ pub trait UnicodeWidthStr {

/// Returns the string's displayed width in columns.
///
/// Control characters are treated as having zero width,
/// and [emoji presentation sequences](https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
/// are assigned width 2.
///
/// This function treats characters in the Ambiguous category according
/// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
/// as 2 column wide. This is consistent with the recommendations for
Expand All @@ -168,30 +181,48 @@ impl UnicodeWidthStr for str {
}
}

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum VariationSelector {
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
enum NextCharInfo {
#[default]
Default,
LineFeed = 0x0A,
Vs15 = 0x0E,
Vs16 = 0x0F,
}

fn str_width(s: &str, is_cjk: bool) -> usize {
s.chars()
.rfold((0, None), |(sum, vsel), c| match c {
'\u{FE0E}' => (sum, Some(VariationSelector::Vs15)),
'\u{FE0F}' => (sum, Some(VariationSelector::Vs16)),
_ => {
let add = match vsel {
Some(VariationSelector::Vs15)
if !is_cjk && cw::starts_non_ideographic_text_presentation_seq(c) =>
{
1
}

Some(VariationSelector::Vs16) if cw::starts_emoji_presentation_seq(c) => 2,
_ => cw::width(c, is_cjk).unwrap_or(0),
};
(sum + add, None)
}
.rfold((0, NextCharInfo::Default), |(sum, next_info), c| {
let (add, info) = width_in_str(c, is_cjk, next_info);
(sum + add, info)
})
.0
}

/// Computes the [UAX #11](https://www.unicode.org/reports/tr11/) based width that `c`
/// contributes when it appears inside a string.
///
/// `next_info` is the state produced by the previous call of this function; the caller
/// folds over the string from its end, so that state describes the character that
/// *follows* `c`. The return value is the pair `(width of c, state to pass to the call
/// for the preceding character)`.
///
/// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
/// they're treated as single width.
#[inline]
fn width_in_str(c: char, is_cjk: bool, next_info: NextCharInfo) -> (usize, NextCharInfo) {
    // A variation selector right after `c` can pin its width, provided `c` is able to
    // start the corresponding presentation sequence.
    if next_info == NextCharInfo::Vs15
        && !is_cjk
        && cw::starts_non_ideographic_text_presentation_seq(c)
    {
        return (1, NextCharInfo::Default);
    }
    if next_info == NextCharInfo::Vs16 && cw::starts_emoji_presentation_seq(c) {
        return (2, NextCharInfo::Default);
    }

    match c {
        // Remember the line feed so that a directly preceding '\r' can be folded into it.
        '\n' => (1, NextCharInfo::LineFeed),
        // "\r\n" as a whole has width 1: the '\n' already counted, so '\r' adds nothing.
        '\r' if next_info == NextCharInfo::LineFeed => (0, NextCharInfo::Default),
        // The selectors add no width themselves; they only influence the preceding character.
        '\u{FE0E}' => (0, NextCharInfo::Vs15),
        '\u{FE0F}' => (0, NextCharInfo::Vs16),
        // Every other character up to and including U+00A0 counts as one column here.
        _ if c <= '\u{A0}' => (1, NextCharInfo::Default),
        // Everything else is resolved through the lookup tables.
        _ => (cw::lookup_width(c, is_cjk), NextCharInfo::Default),
    }
}
Loading