From 1988690954841da90ac4f4ad87a23bddc7d5af47 Mon Sep 17 00:00:00 2001 From: Behnam Esfahbod Date: Sat, 29 Jul 2017 12:37:21 -0400 Subject: [PATCH] [char_property] Implement *CharProperty traits Character Properties are of different kinds and shapes, and as UNIC components grow, we need a better way to be able to categorize them by their shape, and a way to make sure we have a consistent, non-colliding API for them. This is the first step toward building a CharProperty taxonomy, with as little as is needed to provide the assurances desired. We hope that the implementation can be improved over time with new features added to the language. There are already some proposals on this front. See these discussions for more details: * [Traits as contract, without changes to call-sites](https://users.rust-lang.org/t/traits-as-contract-without-changes-to-call-sites/11938/11) * [RFC: delegation of implementation](https://github.com/rust-lang/rfcs/pull/1406) --- unic/ucd/age/src/age.rs | 7 ++ unic/ucd/age/src/lib.rs | 2 +- unic/ucd/bidi/src/bidi_class.rs | 47 +++++++++++++- unic/ucd/bidi/src/lib.rs | 2 +- unic/ucd/category/Cargo.toml | 2 +- unic/ucd/category/src/category.rs | 55 ++++++++++++++-- unic/ucd/category/src/lib.rs | 3 +- unic/ucd/core/src/lib.rs | 2 +- .../normal/src/canonical_combining_class.rs | 30 ++++++++- unic/ucd/normal/src/decomposition_type.rs | 65 ++++++++++++++++++- unic/ucd/normal/src/lib.rs | 2 +- unic/ucd/src/lib.rs | 2 +- unic/utils/README.md | 6 ++ unic/utils/src/char_property.rs | 46 +++++++++++++ unic/utils/src/lib.rs | 2 + 15 files changed, 258 insertions(+), 15 deletions(-) create mode 100644 unic/utils/README.md create mode 100644 unic/utils/src/char_property.rs diff --git a/unic/ucd/age/src/age.rs b/unic/ucd/age/src/age.rs index 8845ad41..cf589c52 100644 --- a/unic/ucd/age/src/age.rs +++ b/unic/ucd/age/src/age.rs @@ -14,6 +14,7 @@ use std::fmt; use unic_utils::CharDataTable; pub use unic_ucd_core::UnicodeVersion; +use unic_utils::CharProperty; /// 
Represents values of the Unicode character property @@ -41,6 +42,12 @@ pub enum Age { Unassigned, // Unassigned is older (larger) than any age } +impl CharProperty for Age { + fn of(ch: char) -> Self { + Self::of(ch) + } +} + use Age::{Assigned, Unassigned}; diff --git a/unic/ucd/age/src/lib.rs b/unic/ucd/age/src/lib.rs index c28be658..b59b1226 100644 --- a/unic/ucd/age/src/lib.rs +++ b/unic/ucd/age/src/lib.rs @@ -10,7 +10,7 @@ // except according to those terms. -#![forbid(unsafe_code)] +#![forbid(unsafe_code, unconditional_recursion)] #![deny(missing_docs)] //! # UNIC — UCD — Character Age diff --git a/unic/ucd/bidi/src/bidi_class.rs b/unic/ucd/bidi/src/bidi_class.rs index 3c100480..ec1e4e4a 100644 --- a/unic/ucd/bidi/src/bidi_class.rs +++ b/unic/ucd/bidi/src/bidi_class.rs @@ -9,9 +9,11 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. + use std::fmt; -use unic_utils::CharDataTable; +use unic_utils::{CharDataTable, CharProperty, EnumeratedCharProperty}; + /// Represents the Unicode character /// [*Bidi_Class*](http://www.unicode.org/reports/tr44/#Bidi_Class) property, also known as the @@ -48,6 +50,18 @@ pub enum BidiClass { // [UNIC_UPDATE_ON_UNICODE_UPDATE] Source: `tables/bidi_class_type.rsv` } +impl CharProperty for BidiClass { + fn of(ch: char) -> Self { + Self::of(ch) + } +} + +impl EnumeratedCharProperty for BidiClass { + fn all_values() -> &'static [Self] { + Self::all_values() + } +} + /// Abbreviated name aliases for /// [*Bidi_Class*](http://www.unicode.org/reports/tr44/#Bidi_Class) property. @@ -111,6 +125,37 @@ impl BidiClass { *TABLE.find_or(ch, &L) } + /// Exhaustive list of all `BidiClass` property values. 
+ pub fn all_values() -> &'static [BidiClass] { + use BidiClass::*; + const ALL_VALUES: &[BidiClass] = &[ + ArabicLetter, + ArabicNumber, + ParagraphSeparator, + BoundaryNeutral, + CommonSeparator, + EuropeanNumber, + EuropeanSeparator, + EuropeanTerminator, + FirstStrongIsolate, + LeftToRight, + LeftToRightEmbedding, + LeftToRightIsolate, + LeftToRightOverride, + NonspacingMark, + OtherNeutral, + PopDirectionalFormat, + PopDirectionalIsolate, + RightToLeft, + RightToLeftEmbedding, + RightToLeftIsolate, + RightToLeftOverride, + SegmentSeparator, + WhiteSpace, + ]; + ALL_VALUES + } + /// Abbreviated name of the *Bidi_Class* property value. /// /// diff --git a/unic/ucd/bidi/src/lib.rs b/unic/ucd/bidi/src/lib.rs index f3bf46cc..18f8753b 100644 --- a/unic/ucd/bidi/src/lib.rs +++ b/unic/ucd/bidi/src/lib.rs @@ -10,7 +10,7 @@ // except according to those terms. -#![forbid(unsafe_code)] +#![forbid(unsafe_code, unconditional_recursion)] #![deny(missing_docs)] //! # UNIC — UCD — Bidi diff --git a/unic/ucd/category/Cargo.toml b/unic/ucd/category/Cargo.toml index 85943c94..df7f598a 100644 --- a/unic/ucd/category/Cargo.toml +++ b/unic/ucd/category/Cargo.toml @@ -15,6 +15,6 @@ exclude = [] travis-ci = { repository = "behnam/rust-unic", branch = "master" } [dependencies] +matches = "0.1" unic-ucd-core = { path = "../core/", version = "0.5.0" } unic-utils = { path = "../../utils/", version = "0.5.0" } -matches = "0.1" diff --git a/unic/ucd/category/src/category.rs b/unic/ucd/category/src/category.rs index 095dbc7b..6b74fd6c 100644 --- a/unic/ucd/category/src/category.rs +++ b/unic/ucd/category/src/category.rs @@ -8,7 +8,11 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -use unic_utils::CharDataTable; + +use std::fmt; + +use unic_utils::{CharDataTable, CharProperty, EnumeratedCharProperty}; + /// Represents the Unicode Character /// [*General_Category*](http://unicode.org/reports/tr44/#General_Category) property. 
@@ -16,7 +20,7 @@ use unic_utils::CharDataTable; /// This is a useful breakdown into various character types which can be used as a default /// categorization in implementations. For the property values, see /// [*General_Category Values*](http://unicode.org/reports/tr44/#General_Category_Values). -#[derive(Clone, Copy, Debug, PartialEq, Eq)] +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] pub enum GeneralCategory { /// An uppercase letter (Short form: `Lu`) UppercaseLetter, @@ -80,6 +84,21 @@ pub enum GeneralCategory { Unassigned, } + +impl CharProperty for GeneralCategory { + fn of(ch: char) -> Self { + Self::of(ch) + } +} + + +impl EnumeratedCharProperty for GeneralCategory { + fn all_values() -> &'static [Self] { + Self::all_values() + } +} + + pub mod abbr_names { pub use super::GeneralCategory::UppercaseLetter as Lu; pub use super::GeneralCategory::LowercaseLetter as Ll; @@ -125,8 +144,6 @@ impl GeneralCategory { } /// Exhaustive list of all `GeneralCategory` property values. - /// - /// Reference: pub fn all_values() -> &'static [GeneralCategory] { use GeneralCategory::*; const ALL_VALUES: &[GeneralCategory] = &[ @@ -163,8 +180,16 @@ impl GeneralCategory { ]; ALL_VALUES } + + /// Human-readable description of the property value. + // TODO: Needs to be improved by returning long-name with underscores replaced by space. 
+ #[inline] + pub fn display(&self) -> String { + format!("{:?}", self).to_owned() + } } + impl GeneralCategory { /// `Lu` | `Ll` | `Lt` (Short form: `LC`) pub fn is_cased_letter(&self) -> bool { @@ -207,6 +232,21 @@ impl GeneralCategory { } } + +impl Default for GeneralCategory { + fn default() -> Self { + GeneralCategory::Unassigned + } +} + + +impl fmt::Display for GeneralCategory { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.display()) + } +} + + #[cfg(test)] mod tests { use super::GeneralCategory as GC; @@ -305,4 +345,11 @@ mod tests { assert_eq!(GC::of(c), GC::Unassigned); } } + + #[test] + fn test_display() { + //assert_eq!(format!("{}", GC::UppercaseLetter), "Uppercase Letter"); + assert_eq!(format!("{}", GC::UppercaseLetter), "UppercaseLetter"); + assert_eq!(format!("{}", GC::Unassigned), "Unassigned"); + } } diff --git a/unic/ucd/category/src/lib.rs b/unic/ucd/category/src/lib.rs index 4e505b53..5171cd8c 100644 --- a/unic/ucd/category/src/lib.rs +++ b/unic/ucd/category/src/lib.rs @@ -8,7 +8,7 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -#![deny(unsafe_code, missing_docs)] +#![deny(unsafe_code, missing_docs, unconditional_recursion)] //! # UNIC — UCD — Category //! @@ -38,6 +38,7 @@ #[macro_use] extern crate matches; + extern crate unic_ucd_core; extern crate unic_utils; diff --git a/unic/ucd/core/src/lib.rs b/unic/ucd/core/src/lib.rs index 83f73c89..1d2ee6ad 100644 --- a/unic/ucd/core/src/lib.rs +++ b/unic/ucd/core/src/lib.rs @@ -9,7 +9,7 @@ // except according to those terms. -#![forbid(unsafe_code, missing_docs)] +#![forbid(unsafe_code, missing_docs, unconditional_recursion)] //! # UNIC — UCD — Core //! 
diff --git a/unic/ucd/normal/src/canonical_combining_class.rs b/unic/ucd/normal/src/canonical_combining_class.rs index ae2245f1..ee4b5d2d 100644 --- a/unic/ucd/normal/src/canonical_combining_class.rs +++ b/unic/ucd/normal/src/canonical_combining_class.rs @@ -15,7 +15,9 @@ //! Reference: -use unic_utils::CharDataTable; +use std::fmt; + +use unic_utils::{CharDataTable, CharProperty}; /// Represents *Canonical_Combining_Class* property of a Unicode character. @@ -82,6 +84,13 @@ pub mod values { } +impl CharProperty for CanonicalCombiningClass { + fn of(ch: char) -> Self { + Self::of(ch) + } +} + + impl CanonicalCombiningClass { /// Find the character *Canonical_Combining_Class* property value. pub fn of(ch: char) -> CanonicalCombiningClass { @@ -89,6 +98,19 @@ impl CanonicalCombiningClass { include!("tables/canonical_combining_class_values.rsv"); *TABLE.find_or(ch, &CanonicalCombiningClass(0)) } + + /// Human-readable description of the property value. + // TODO: Needs to be improved by returning long-name with underscores replaced by space. + #[inline] + pub fn display(&self) -> String { + format!("{}", self.number()) + } +} + +impl fmt::Display for CanonicalCombiningClass { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.display()) + } } @@ -224,4 +246,10 @@ mod tests { assert_eq!(CCC::of('\u{0315}').number(), 232); assert_eq!(CCC::of('\u{1e94a}').number(), 7); } + + #[test] + fn test_display() { + assert_eq!(format!("{}", CCC::of('\u{0000}')), "0"); + assert_eq!(format!("{}", CCC::of('\u{0300}')), "230"); + } } diff --git a/unic/ucd/normal/src/decomposition_type.rs b/unic/ucd/normal/src/decomposition_type.rs index aa8ecb26..285c1a4b 100644 --- a/unic/ucd/normal/src/decomposition_type.rs +++ b/unic/ucd/normal/src/decomposition_type.rs @@ -12,7 +12,9 @@ //! 
Accessor for *Decomposition_Type* (dt) property -use unic_utils::CharDataTable; +use std::fmt; + +use unic_utils::{CharDataTable, EnumeratedCharProperty, OptionCharProperty}; use composition::canonical_decomposition; use hangul; @@ -22,7 +24,7 @@ use hangul; /// [*Decomposition_Type*](http://www.unicode.org/reports/tr44/#Decomposition_Type) property. /// /// * -#[derive(Clone, Copy, Debug, PartialEq, Eq)] +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] #[allow(missing_docs)] pub enum DecompositionType { Canonical, // abbreviated: Can @@ -46,6 +48,20 @@ pub enum DecompositionType { } +impl OptionCharProperty for DecompositionType { + fn of(ch: char) -> Option { + Self::of(ch) + } +} + + +impl EnumeratedCharProperty for DecompositionType { + fn all_values() -> &'static [Self] { + Self::all_values() + } +} + + use self::DecompositionType::*; @@ -61,6 +77,46 @@ impl DecompositionType { include!("tables/compatibility_decomposition_type_values.rsv"); TABLE.find(ch).cloned() } + + /// Exhaustive list of all `DecompositionType` property values. + pub fn all_values() -> &'static [DecompositionType] { + use DecompositionType::*; + const ALL_VALUES: &[DecompositionType] = &[ + Canonical, + Compat, + Circle, + Final, + Font, + Fraction, + Initial, + Isolated, + Medial, + Narrow, + Nobreak, + None, + Small, + Square, + Sub, + Super, + Vertical, + Wide, + ]; + ALL_VALUES + } + + /// Human-readable description of the property value. + // TODO: Needs to be improved by returning long-name with underscores replaced by space. 
+ #[inline] + pub fn display(&self) -> String { + format!("{:?}", self).to_owned() + } +} + + +impl fmt::Display for DecompositionType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.display()) + } } @@ -192,4 +248,9 @@ mod tests { assert_eq!(DT::of('\u{90000}'), None); assert_eq!(DT::of('\u{a0000}'), None); } + + #[test] + fn test_display() { + assert_eq!(format!("{}", DT::of('\u{a0}').unwrap()), "Nobreak"); + } } diff --git a/unic/ucd/normal/src/lib.rs b/unic/ucd/normal/src/lib.rs index 335c66a0..d4c3cb8c 100644 --- a/unic/ucd/normal/src/lib.rs +++ b/unic/ucd/normal/src/lib.rs @@ -10,7 +10,7 @@ // except according to those terms. -#![deny(unsafe_code, missing_docs)] +#![deny(unsafe_code, missing_docs, unconditional_recursion)] //! # UNIC — UCD — Normalization //! diff --git a/unic/ucd/src/lib.rs b/unic/ucd/src/lib.rs index af4210fc..99f8e666 100644 --- a/unic/ucd/src/lib.rs +++ b/unic/ucd/src/lib.rs @@ -9,7 +9,7 @@ // except according to those terms. -#![forbid(unsafe_code, missing_docs)] +#![forbid(unsafe_code, missing_docs, unconditional_recursion)] //! # UNIC — Unicode Character Database //! diff --git a/unic/utils/README.md b/unic/utils/README.md new file mode 100644 index 00000000..7300dc76 --- /dev/null +++ b/unic/utils/README.md @@ -0,0 +1,6 @@ +# UNIC — Utilities + +[![Crates.io](https://img.shields.io/crates/v/unic-utils.svg)](https://crates.io/crates/unic-utils) +[![Documentation](https://docs.rs/unic-utils/badge.svg)](https://docs.rs/unic-utils/) + +This UNIC component provides utility libraries that do not depend on Unicode data. diff --git a/unic/utils/src/char_property.rs b/unic/utils/src/char_property.rs new file mode 100644 index 00000000..b4c97401 --- /dev/null +++ b/unic/utils/src/char_property.rs @@ -0,0 +1,46 @@ +// Copyright 2017 The UNIC Project Developers. +// +// See the COPYRIGHT file at the top-level directory of this distribution. 
+// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + + +//! TBD. + + +use std::fmt::{Debug, Display}; +use std::hash::Hash; + + +/// TBD. +pub trait CharProperty +where + Self: Copy + Debug + Display + Eq + Hash, +{ + /// TBD + fn of(ch: char) -> Self; +} + + +/// TBD. +pub trait OptionCharProperty +where + Self: Copy + Debug + Display + Eq + Hash, +{ + /// TBD + fn of(ch: char) -> Option; +} + + +/// TBD. +pub trait EnumeratedCharProperty +where + Self: Copy + Debug + Display + Eq + Hash, +{ + /// TBD + fn all_values() -> &'static [Self]; +} diff --git a/unic/utils/src/lib.rs b/unic/utils/src/lib.rs index 1975c558..4af3293c 100644 --- a/unic/utils/src/lib.rs +++ b/unic/utils/src/lib.rs @@ -29,7 +29,9 @@ pub const PKG_DESCRIPTION: &'static str = env!("CARGO_PKG_DESCRIPTION"); pub mod codepoints; pub mod tables; +pub mod char_property; +pub use char_property::{CharProperty, EnumeratedCharProperty, OptionCharProperty}; pub use codepoints::iter_all_chars; pub use tables::CharDataTable;