-
Notifications
You must be signed in to change notification settings - Fork 24
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
CharRange and CharIter #111
Changes from all commits
018fef0
76b29e7
b823615
85579ae
d24057a
4b63146
afa38a0
a8fb34d
868a38b
c3726ef
b6fb062
d1b9990
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
[package] | ||
name = "unic-char-range" | ||
version = "0.5.0" | ||
authors = ["The UNIC Project Developers"] | ||
repository = "https://github.com/behnam/rust-unic/" | ||
license = "MIT/Apache-2.0" | ||
keywords = ["text", "unicode", "iteration"] | ||
description = "UNIC - CharRange" | ||
categories = ["text-processing", "iteration"] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Haven't actually checked that this is valid yet. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's no "iteration" category. Maybe we don't need more than one. Here's the list: https://crates.io/categories/ FYI, I've got "internationalization" (and "localization") added to the list, but it hasn't made it to the live version yet. When that happens, almost all UNIC components get both |
||
readme = "README.md" | ||
|
||
# No tests/benches that depends on /data/ | ||
exclude = [] | ||
|
||
[features] | ||
default = [] | ||
|
||
# Unstable features | ||
unstable = [ "associated-consts", "fused", "trusted-len" ] | ||
associated-consts = [] | ||
fused = [] | ||
trusted-len = [] | ||
|
||
[dependencies] | ||
|
||
[badges] | ||
travis-ci = { repository = "behnam/rust-unic", branch = "master" } |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
#![feature(test)] | ||
|
||
extern crate test; | ||
extern crate unic_char_range; | ||
|
||
use unic_char_range::*; | ||
|
||
#[bench] | ||
fn count(b: &mut test::Bencher) { | ||
b.iter(|| CharRange::all().iter().count()) | ||
} | ||
|
||
#[bench] | ||
// iterate the same range without skipping surrogates | ||
fn count_baseline(b: &mut test::Bencher) { | ||
b.iter(|| (0..0x110000).take_while(|_| true).count()) | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
use std::{char, iter}; | ||
use CharRange; | ||
|
||
/// Range of Surrogate Code Points. | ||
/// | ||
/// Reference: <http://unicode.org/glossary/#surrogate_code_point> | ||
const SURROGATE_RANGE: ::std::ops::Range<u32> = 0xD800..0xE000; | ||
|
||
/// An iterator over `char`. | ||
#[derive(Copy, Clone, Debug, Eq, PartialEq)] | ||
pub struct CharIter { | ||
/// The codepoint of the smallest uniterated codepoint. | ||
/// | ||
/// If greater than or equal to `high`, iteration is finished. | ||
/// | ||
/// # Safety | ||
/// | ||
/// Must be a valid, non-surrogate codepoint while iteration is unfinished. | ||
low: u32, | ||
|
||
/// The codepoint one greater than the largest uniterated codepoint. | ||
/// | ||
/// If less than or equal to `low`, iteration is finished. | ||
/// | ||
/// # Safety | ||
/// | ||
/// Must be one greater than a valid, non surrogate codepoint while iteration is unfinished. | ||
high: u32, | ||
} | ||
|
||
impl<'a> From<&'a CharRange> for CharIter { | ||
fn from(range: &CharRange) -> CharIter { | ||
CharIter { | ||
low: range.first() as u32, | ||
high: range.last() as u32 + 1, | ||
} | ||
} | ||
} | ||
|
||
impl CharIter { | ||
#[inline] | ||
fn is_finished(&self) -> bool { | ||
self.low >= self.high | ||
} | ||
} | ||
|
||
impl Iterator for CharIter { | ||
type Item = char; | ||
|
||
#[inline] | ||
#[allow(unsafe_code)] | ||
fn next(&mut self) -> Option<char> { | ||
if self.is_finished() { | ||
return None; | ||
} | ||
|
||
let char = unsafe { char::from_u32_unchecked(self.low) }; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I didn't know it's possible to call a variable There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 it's a bad habit. It's not something you'd really expect to work, but it does (I think because Rust does type and variable name lookup differently). Surprisingly though, my IDE highlights it right, which is part of why I do it even though I probably shouldn't. |
||
self.low += 1; | ||
|
||
// ensure `low` is never a surrogate code point | ||
if self.low == SURROGATE_RANGE.start { | ||
self.low = SURROGATE_RANGE.end; | ||
} | ||
|
||
Some(char) | ||
} | ||
|
||
fn size_hint(&self) -> (usize, Option<usize>) { | ||
let len = self.len(); | ||
(len, Some(len)) | ||
} | ||
} | ||
|
||
impl iter::DoubleEndedIterator for CharIter { | ||
#[allow(unsafe_code)] | ||
fn next_back(&mut self) -> Option<char> { | ||
if self.is_finished() { | ||
return None; | ||
} | ||
|
||
self.high -= 1; | ||
let char = unsafe { char::from_u32_unchecked(self.high) }; | ||
|
||
// ensure `high` is never one greater than a surrogate code point | ||
if self.high == SURROGATE_RANGE.end { | ||
self.high = SURROGATE_RANGE.start; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This implementation with the new invariants is even more clean and slim! 👍 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You'll like #112 then 😄 |
||
} | ||
|
||
Some(char) | ||
} | ||
} | ||
|
||
impl iter::ExactSizeIterator for CharIter { | ||
fn len(&self) -> usize { | ||
if self.is_finished() { | ||
return 0; | ||
} | ||
let naive_len = self.high as usize - self.low as usize; | ||
if self.low <= SURROGATE_RANGE.start && SURROGATE_RANGE.end <= self.high { | ||
naive_len - SURROGATE_RANGE.len() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe you can optimize here by putting this number in a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unfortunately doing so and still calling There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Turns out it takes |
||
} else { | ||
naive_len | ||
} | ||
} | ||
} | ||
|
||
#[cfg(feature = "fused")] | ||
impl iter::FusedIterator for CharIter {} | ||
|
||
#[allow(unsafe_code)] | ||
#[cfg(feature = "trusted-len")] | ||
unsafe impl iter::TrustedLen for CharIter {} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
//! # Unic - Char - Range | ||
//! | ||
//! A simple way to control iteration over a range of characters. | ||
//! | ||
//! # Examples | ||
//! | ||
//! ``` | ||
//! # #[macro_use] extern crate unic_char_range; | ||
//! # use unic_char_range::*; | ||
//! | ||
//! # fn main() { | ||
//! for character in chars!('a'=..='z') { | ||
//! // character is each character in the lowercase english alphabet in order | ||
//! } | ||
//! | ||
//! for character in CharRange::all() { | ||
//! // character is every valid char from lowest codepoint to highest | ||
//! } | ||
//! # } | ||
//! ``` | ||
//! | ||
#![forbid(bad_style, missing_debug_implementations, unconditional_recursion)] | ||
#![deny(missing_docs, unsafe_code, unused, future_incompatible)] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. d24057a for Basically, I picked There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see. I guess it's fine for non-major flags, and we should probably just rely on clippy (in a future version) to hint on all |
||
#![cfg_attr(feature = "fused", feature(fused))] | ||
#![cfg_attr(feature = "trusted-len", feature(trusted_len))] | ||
|
||
mod range; | ||
mod iter; | ||
mod step; | ||
mod macros; | ||
|
||
pub use range::CharRange; | ||
pub use iter::CharIter; |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
#[macro_export] | ||
/// Convenience macro to allow simple construction of character ranges. | ||
/// | ||
/// # Syntax | ||
/// | ||
/// ``` | ||
/// # #[macro_use] extern crate unic_char_range; | ||
/// # fn main() { | ||
/// chars!('a'=..='z'); // iterates the inclusive range 'a' through 'z' | ||
/// chars!('a'=..<'z'); // iterates the inclusive range 'a' through 'y' | ||
/// chars!('a'<..='z'); // iterates the inclusive range 'b' through 'z' | ||
/// chars!('a'<..<'z'); // iterates the inclusive range 'b' through 'y' | ||
/// # } | ||
/// ``` | ||
macro_rules! chars { | ||
// $:expr can only be terminated by `=>`, `,`, `;` so use a $:tt | ||
( $start:tt =..= $end:tt ) => ( $crate::CharRange::closed_range($start, $end) ); | ||
( $start:tt =..< $end:tt ) => ( $crate::CharRange::half_open_right_range($start, $end) ); | ||
( $start:tt <..= $end:tt ) => ( $crate::CharRange::half_open_left_range($start, $end) ); | ||
( $start:tt <..< $end:tt ) => ( $crate::CharRange::open_range($start, $end) ); | ||
} | ||
|
||
#[cfg(test)] | ||
mod test { | ||
use std::char; | ||
|
||
#[test] | ||
fn char_closed_iteration_works() { | ||
let mut target = 'a' as u32 - 1; | ||
|
||
for char in chars!('a'=..='z') { | ||
target += 1; | ||
assert_eq!(Some(char), char::from_u32(target)); | ||
} | ||
|
||
assert_eq!(target, 'z' as u32, "All characters were iterated"); | ||
} | ||
|
||
#[test] | ||
fn char_half_open_right_iteration_works() { | ||
let mut target = 'a' as u32 - 1; | ||
|
||
for char in chars!('a'=..<'z') { | ||
target += 1; | ||
assert_eq!(Some(char), char::from_u32(target)); | ||
} | ||
|
||
assert_eq!(target, 'y' as u32, "All characters were iterated"); | ||
} | ||
|
||
#[test] | ||
fn char_half_open_left_iteration_works() { | ||
let mut target = 'b' as u32 - 1; | ||
|
||
for char in chars!('a'<..='z') { | ||
target += 1; | ||
assert_eq!(Some(char), char::from_u32(target)); | ||
} | ||
|
||
assert_eq!(target, 'z' as u32, "All characters were iterated"); | ||
} | ||
|
||
#[test] | ||
fn char_open_iteration_works() { | ||
let mut target = 'b' as u32 - 1; | ||
|
||
for char in chars!('a'<..<'z') { | ||
target += 1; | ||
assert_eq!(Some(char), char::from_u32(target)); | ||
} | ||
|
||
assert_eq!(target, 'y' as u32, "All characters were iterated"); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would be better to use more informative language in description, as it's the only text that shows up on crates.io lists and item page, and allows better matching in search.
To follow the pattern from other UNIC components, it can be something like this:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
👍