Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[char/range] Add CharRange and CharIter #112

Merged
merged 21 commits into from
Aug 13, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions unic/char/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ travis-ci = { repository = "behnam/rust-unic", branch = "master" }

[dependencies]
unic-char-property = { path = "property/", version = "0.5.0" }
unic-char-range = { path = "range/", version = "0.5.0" }
25 changes: 25 additions & 0 deletions unic/char/range/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
[package]
name = "unic-char-range"
version = "0.5.0"
authors = ["The UNIC Project Developers"]
repository = "https://github.com/behnam/rust-unic/"
license = "MIT/Apache-2.0"
keywords = ["text", "unicode", "iteration"]
description = "UNIC - Unicode Characters - Character Range and Iteration"
categories = ["text-processing"]

# No tests/benches that depend on /data/
exclude = []

[features]
default = []

# Unstable features
unstable = [ "exact-size-is-empty", "fused", "trusted-len" ]
exact-size-is-empty = []
fused = []
trusted-len = []


[badges]
travis-ci = { repository = "behnam/rust-unic", branch = "master" }
32 changes: 32 additions & 0 deletions unic/char/range/benches/benchmarks.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#![feature(test)]

extern crate test;
extern crate unic_char_range;

use std::char;
use unic_char_range::CharRange;

/// Measures a complete front-to-back walk over every `char` using `CharIter`.
#[bench]
fn forward_iteration(b: &mut test::Bencher) {
    b.iter(|| {
        let every_char = CharRange::all();
        every_char.iter().count()
    })
}

/// Baseline for `forward_iteration`: pushes every integer below 0x11_0000
/// through `char::from_u32` and counts the ones that are valid `char`s.
#[bench]
fn forward_iteration_baseline(b: &mut test::Bencher) {
    b.iter(|| {
        let code_points = 0..0x11_0000;
        code_points.filter_map(char::from_u32).count()
    })
}

/// Measures a complete back-to-front walk over every `char` using `CharIter`.
#[bench]
fn reverse_iteration(b: &mut test::Bencher) {
    b.iter(|| {
        let every_char = CharRange::all();
        every_char.iter().rev().count()
    })
}

/// Baseline for `reverse_iteration`: walks the integers below 0x11_0000 from
/// the top down, keeping those that `char::from_u32` accepts, and counts them.
#[bench]
fn reverse_iteration_baseline(b: &mut test::Bencher) {
    b.iter(|| {
        let descending = (0..0x11_0000).rev();
        descending.filter_map(char::from_u32).count()
    })
}

/// Measures how long it takes to compute the length of the full `char` range.
#[bench]
fn range_length(b: &mut test::Bencher) {
    b.iter(|| {
        let every_char = CharRange::all();
        every_char.len()
    })
}
151 changes: 151 additions & 0 deletions unic/char/range/src/iter.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
use std::char;
use std::ops::Range;
use {step, CharRange};

// Half-open span of the surrogate code points, 0xD800 up to but excluding 0xE000.
// `CharIter::len` subtracts the size of this span when a range covers all of it.
const SURROGATE_RANGE: Range<u32> = 0xD800..0xE000;

/// A double-ended iterator over a range of Unicode code points.
///
/// Constructed via `CharRange::iter`. See `CharRange` for more information.
///
/// The characters still to be yielded are those from `low` through `high`,
/// both inclusive; the iterator is exhausted once `low > high`.
#[derive(Clone, Debug)]
pub struct CharIter {
/// The lowest uniterated character (inclusive).
///
/// Iteration is finished if this is higher than `high`.
low: char,

/// The highest uniterated character (inclusive).
///
/// Iteration is finished if this is lower than `low`.
high: char,
}

impl From<CharRange> for CharIter {
fn from(range: CharRange) -> CharIter {
CharIter {
low: range.low,
high: range.high,
}
}
}

impl From<CharIter> for CharRange {
fn from(iter: CharIter) -> CharRange {
CharRange {
low: iter.low,
high: iter.high,
}
}
}

impl CharIter {
    /// Consumes the front element by moving `low` one character forward.
    ///
    /// `char::MAX` cannot be stepped past, so in that case `high` is reset to
    /// `'\0'` instead. The effect is the same: the last element is consumed
    /// and `low > high` marks the iterator as finished.
    #[inline]
    #[allow(unsafe_code)]
    fn step_forward(&mut self) {
        match self.low {
            char::MAX => self.high = '\0',
            // NOTE(review): presumably sound because `front` is below
            // `char::MAX` here -- confirm against `step::forward`'s contract.
            front => self.low = unsafe { step::forward(front) },
        }
    }

    /// Consumes the back element by moving `high` one character backward.
    ///
    /// `'\0'` cannot be stepped below, so in that case `low` is reset to
    /// `char::MAX` instead. The effect is the same: the last element is
    /// consumed and `low > high` marks the iterator as finished.
    #[inline]
    #[allow(unsafe_code)]
    fn step_backward(&mut self) {
        match self.high {
            '\0' => self.low = char::MAX,
            // NOTE(review): presumably sound because `back` is above `'\0'`
            // here -- confirm against `step::backward`'s contract.
            back => self.high = unsafe { step::backward(back) },
        }
    }

    /// Reports whether every character has already been yielded.
    ///
    /// Stand-in for `ExactSizeIterator::is_empty()` that works on stable.
    #[inline]
    fn is_finished(&self) -> bool {
        self.high < self.low
    }
}

impl Iterator for CharIter {
    type Item = char;

    /// Yields the lowest remaining character and advances the front bound.
    #[inline]
    fn next(&mut self) -> Option<char> {
        if self.is_finished() {
            None
        } else {
            let front = self.low;
            self.step_forward();
            Some(front)
        }
    }

    /// Both bounds of the hint are the value reported by `len()`.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.len();
        (remaining, Some(remaining))
    }

    /// Reads the final character straight from `high` without iterating.
    fn last(self) -> Option<char> {
        if self.is_finished() {
            return None;
        }
        Some(self.high)
    }

    /// The greatest remaining character is the one `last` returns.
    fn max(self) -> Option<char> {
        self.last()
    }

    /// The smallest remaining character is the one `next` yields first.
    fn min(mut self) -> Option<char> {
        self.next()
    }
}

impl DoubleEndedIterator for CharIter {
    /// Yields the highest remaining character and retreats the back bound.
    #[inline]
    fn next_back(&mut self) -> Option<Self::Item> {
        if self.is_finished() {
            return None;
        }

        let back = self.high;
        self.step_backward();
        Some(back)
    }
}

impl ExactSizeIterator for CharIter {
    /// Counts the characters that have not been yielded yet.
    ///
    /// The count is the width of the code point span from `low` through
    /// `high`, minus the surrogate span whenever that lies wholly inside it.
    fn len(&self) -> usize {
        if self.is_finished() {
            return 0;
        }

        let first = self.low as u32;
        let past_last = self.high as u32 + 1;
        let span = (first..past_last).len();

        let covers_surrogates =
            first <= SURROGATE_RANGE.start && past_last >= SURROGATE_RANGE.end;
        if covers_surrogates {
            span - SURROGATE_RANGE.len()
        } else {
            span
        }
    }

    /// Same answer as `is_finished`, exposed through the unstable trait method.
    #[cfg(feature = "exact-size-is-empty")]
    fn is_empty(&self) -> bool {
        self.is_finished()
    }
}

#[cfg(any(feature = "fused", feature = "trusted-len"))]
use std::iter;

// Once `low > high`, `next` and `next_back` return `None` without changing
// either bound, so the iterator keeps returning `None` after it is exhausted.
#[cfg(feature = "fused")]
impl iter::FusedIterator for CharIter {}

// `size_hint` reports `(len, Some(len))`, where `len` is `ExactSizeIterator::len`.
// NOTE(review): `TrustedLen` presumably requires that count to be exact --
// confirm that `len` is before relying on this impl.
#[allow(unsafe_code)]
#[cfg(feature = "trusted-len")]
unsafe impl iter::TrustedLen for CharIter {}
46 changes: 46 additions & 0 deletions unic/char/range/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
//! # Unic - Char - Range
//!
//! A simple way to control iteration over a range of characters.
//!
//! # Examples
//!
//! ```
//! #[macro_use] extern crate unic_char_range;
//!
//! # fn main() {
//! for character in chars!('a'..='z') {
//! // character is each character in the lowercase English alphabet in order
//! }
//!
//! for character in chars!(..) {
//! // character is every valid char from lowest codepoint to highest
//! }
//! # }
//! ```
//!
//! # Features
//!
//! None of these features are included by default; they rely on unstable Rust feature gates.
//!
//! - `unstable`: enables all features
//! - `exact-size-is-empty`: provide a specific impl of [`ExactSizeIterator::is_empty`][is_empty]
//! - `fused`: impl the [`FusedIterator`] contract
//! - `trusted-len`: impl the [`TrustedLen`] contract
//!
//! [is_empty]: https://doc.rust-lang.org/std/iter/trait.ExactSizeIterator.html#method.is_empty
//! [`FusedIterator`]: https://doc.rust-lang.org/std/iter/trait.FusedIterator.html
//! [`TrustedLen`]: https://doc.rust-lang.org/std/iter/trait.TrustedLen.html
//!
#![forbid(bad_style, missing_debug_implementations, unconditional_recursion)]
#![deny(missing_docs, unsafe_code, unused, future_incompatible)]
#![cfg_attr(feature = "exact-size-is-empty", feature(exact_size_is_empty))]
#![cfg_attr(feature = "fused", feature(fused))]
#![cfg_attr(feature = "trusted-len", feature(trusted_len))]

mod macros;
mod range;
mod iter;
mod step;

pub use range::CharRange;
pub use iter::CharIter;
25 changes: 25 additions & 0 deletions unic/char/range/src/macros.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/// Convenience macro for the initialization of `CharRange`s.
///
/// # Syntax
///
/// ```
/// # #[macro_use] extern crate unic_char_range;
/// # fn main() {
/// chars!('a'..'z'); // The half open range including 'a' and excluding 'z'
/// chars!('a'..='z'); // The closed range including 'a' and including 'z'
/// chars!(..); // All characters
/// # }
/// ```
///
/// `chars!('a'..='z')` and `chars!(..)` are constant expressions, and can be used
/// where such are required, such as in the initialization of constant data structures.
///
/// Note that because an `expr` capture cannot be followed by a `..`/`..=`,
/// this macro captures token trees. This means that if you want to pass more than one token,
/// you must parenthesize it (e.g. `chars!('\0' ..= (char::MAX))`).
#[macro_export]
macro_rules! chars {
    ( $low:tt .. $high:tt ) => ( $crate::CharRange::open_right($low, $high) );
    ( $low:tt ..= $high:tt ) => ( $crate::CharRange { low: $low, high: $high } );
    // Expanded directly instead of re-invoking `chars!`: a bare-name recursive
    // call only resolves where the macro is in textual scope, so it would fail
    // for callers that bring the macro in by path.
    ( .. ) => ( $crate::CharRange { low: '\0', high: ::std::char::MAX } );
}
Loading