-
Notifications
You must be signed in to change notification settings - Fork 13.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Move static code outside of unciode.py.
- Loading branch information
Clar Charr
committed
Jan 3, 2018
1 parent
b65f0be
commit b4b3ddd
Showing
6 changed files
with
143 additions
and
225 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT | ||
// file at the top-level directory of this distribution and at | ||
// http://rust-lang.org/COPYRIGHT. | ||
// | ||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | ||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | ||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | ||
// option. This file may not be copied, modified, or distributed | ||
// except according to those terms. | ||
|
||
/// BoolTrie is a trie for representing a set of Unicode codepoints. It is | ||
/// implemented with postfix compression (sharing of identical child nodes), | ||
/// which gives both compact size and fast lookup. | ||
/// | ||
/// The space of Unicode codepoints is divided into 3 subareas, each | ||
/// represented by a trie with different depth. In the first (0..0x800), there | ||
/// is no trie structure at all; each u64 entry corresponds to a bitvector | ||
/// effectively holding 64 bool values. | ||
/// | ||
/// In the second (0x800..0x10000), each child of the root node represents a | ||
/// 64-wide subrange, but instead of storing the full 64-bit value of the leaf, | ||
/// the trie stores an 8-bit index into a shared table of leaf values. This | ||
/// exploits the fact that in reasonable sets, many such leaves can be shared. | ||
/// | ||
/// In the third (0x10000..0x110000), each child of the root node represents a | ||
/// 4096-wide subrange, and the trie stores an 8-bit index into a 64-byte slice | ||
/// of a child tree. Each of these 64 bytes represents an index into the table | ||
/// of shared 64-bit leaf values. This exploits the sparse structure in the | ||
/// non-BMP range of most Unicode sets. | ||
pub struct BoolTrie { | ||
// 0..0x800 (corresponding to 1 and 2 byte utf-8 sequences) | ||
pub r1: [u64; 32], // leaves | ||
|
||
// 0x800..0x10000 (corresponding to 3 byte utf-8 sequences) | ||
pub r2: [u8; 992], // first level | ||
pub r3: &'static [u64], // leaves | ||
|
||
// 0x10000..0x110000 (corresponding to 4 byte utf-8 sequences) | ||
pub r4: [u8; 256], // first level | ||
pub r5: &'static [u8], // second level | ||
pub r6: &'static [u64], // leaves | ||
} | ||
impl BoolTrie { | ||
pub fn lookup(&self, c: char) -> bool { | ||
let c = c as usize; | ||
if c < 0x800 { | ||
trie_range_leaf(c, self.r1[c >> 6]) | ||
} else if c < 0x10000 { | ||
let child = self.r2[(c >> 6) - 0x20]; | ||
trie_range_leaf(c, self.r3[child as usize]) | ||
} else { | ||
let child = self.r4[(c >> 12) - 0x10]; | ||
let leaf = self.r5[((child as usize) << 6) + ((c >> 6) & 0x3f)]; | ||
trie_range_leaf(c, self.r6[leaf as usize]) | ||
} | ||
} | ||
} | ||
|
||
pub struct SmallBoolTrie { | ||
pub(crate) r1: &'static [u8], // first level | ||
pub(crate) r2: &'static [u64], // leaves | ||
} | ||
|
||
impl SmallBoolTrie { | ||
pub fn lookup(&self, c: char) -> bool { | ||
let c = c as usize; | ||
match self.r1.get(c >> 6) { | ||
Some(&child) => trie_range_leaf(c, self.r2[child as usize]), | ||
None => false, | ||
} | ||
} | ||
} | ||
|
||
fn trie_range_leaf(c: usize, bitmap_chunk: u64) -> bool { | ||
((bitmap_chunk >> (c & 63)) & 1) != 0 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.