diff --git a/src/libcollectionstest/str.rs b/src/libcollectionstest/str.rs index 3f32136bc2639..87a018ced195a 100644 --- a/src/libcollectionstest/str.rs +++ b/src/libcollectionstest/str.rs @@ -705,7 +705,7 @@ fn test_split_at() { #[should_panic] fn test_split_at_boundscheck() { let s = "ศไทย中华Việt Nam"; - let (a, b) = s.split_at(1); + s.split_at(1); } #[test] @@ -1820,6 +1820,14 @@ mod pattern { Match (4, 6), Reject(6, 7), ]); + make_test!(str_searcher_ascii_haystack_seq, "bb", "abbcbbbbd", [ + Reject(0, 1), + Match (1, 3), + Reject(3, 4), + Match (4, 6), + Match (6, 8), + Reject(8, 9), + ]); make_test!(str_searcher_empty_needle_ascii_haystack, "", "abbcbbd", [ Match (0, 0), Reject(0, 1), diff --git a/src/libcore/str/mod.rs b/src/libcore/str/mod.rs index 5a621176c4a81..bd6e1a4063abd 100644 --- a/src/libcore/str/mod.rs +++ b/src/libcore/str/mod.rs @@ -15,13 +15,12 @@ #![doc(primitive = "str")] #![stable(feature = "rust1", since = "1.0.0")] -use self::OldSearcher::{TwoWay, TwoWayLong}; use self::pattern::Pattern; use self::pattern::{Searcher, ReverseSearcher, DoubleEndedSearcher}; use char::CharExt; use clone::Clone; -use cmp::{self, Eq}; +use cmp::Eq; use convert::AsRef; use default::Default; use fmt; @@ -33,7 +32,6 @@ use option::Option::{self, None, Some}; use raw::{Repr, Slice}; use result::Result::{self, Ok, Err}; use slice::{self, SliceExt}; -use usize; pub mod pattern; @@ -870,301 +868,6 @@ impl<'a> DoubleEndedIterator for LinesAny<'a> { } } -/// The internal state of an iterator that searches for matches of a substring -/// within a larger string using two-way search -#[derive(Clone)] -struct TwoWaySearcher { - // constants - crit_pos: usize, - period: usize, - byteset: u64, - - // variables - position: usize, - memory: usize -} - -/* - This is the Two-Way search algorithm, which was introduced in the paper: - Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675. - - Here's some background information. - - A *word* is a string of symbols. The *length* of a word should be a familiar - notion, and here we denote it for any word x by |x|. - (We also allow for the possibility of the *empty word*, a word of length zero). - - If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be a - *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] == x[i+p]. - For example, both 1 and 2 are periods for the string "aa". As another example, - the only period of the string "abcd" is 4. - - We denote by period(x) the *smallest* period of x (provided that x is non-empty). - This is always well-defined since every non-empty word x has at least one period, - |x|. We sometimes call this *the period* of x. - - If u, v and x are words such that x = uv, where uv is the concatenation of u and - v, then we say that (u, v) is a *factorization* of x. - - Let (u, v) be a factorization for a word x. Then if w is a non-empty word such - that both of the following hold - - - either w is a suffix of u or u is a suffix of w - - either w is a prefix of v or v is a prefix of w - - then w is said to be a *repetition* for the factorization (u, v). - - Just to unpack this, there are four possibilities here. Let w = "abc". Then we - might have: - - - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde") - - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab") - - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi") - - u is a suffix of w and v is a prefix of w. ex: ("bc", "a") - - Note that the word vu is a repetition for any factorization (u,v) of x = uv, - so every factorization has at least one repetition. - - If x is a string and (u, v) is a factorization for x, then a *local period* for - (u, v) is an integer r such that there is some word w such that |w| = r and w is - a repetition for (u, v). - - We denote by local_period(u, v) the smallest local period of (u, v). We sometimes - call this *the local period* of (u, v). Provided that x = uv is non-empty, this - is well-defined (because each non-empty word has at least one factorization, as - noted above). - - It can be proven that the following is an equivalent definition of a local period - for a factorization (u, v): any positive integer r such that x[i] == x[i+r] for - all i such that |u| - r <= i <= |u| - 1 and such that both x[i] and x[i+r] are - defined. (i.e. i > 0 and i + r < |x|). - - Using the above reformulation, it is easy to prove that - - 1 <= local_period(u, v) <= period(uv) - - A factorization (u, v) of x such that local_period(u,v) = period(x) is called a - *critical factorization*. - - The algorithm hinges on the following theorem, which is stated without proof: - - **Critical Factorization Theorem** Any word x has at least one critical - factorization (u, v) such that |u| < period(x). - - The purpose of maximal_suffix is to find such a critical factorization. - -*/ -impl TwoWaySearcher { - #[allow(dead_code)] - fn new(needle: &[u8]) -> TwoWaySearcher { - let (crit_pos_false, period_false) = TwoWaySearcher::maximal_suffix(needle, false); - let (crit_pos_true, period_true) = TwoWaySearcher::maximal_suffix(needle, true); - - let (crit_pos, period) = - if crit_pos_false > crit_pos_true { - (crit_pos_false, period_false) - } else { - (crit_pos_true, period_true) - }; - - // This isn't in the original algorithm, as far as I'm aware. - let byteset = needle.iter() - .fold(0, |a, &b| (1 << ((b & 0x3f) as usize)) | a); - - // A particularly readable explanation of what's going on here can be found - // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically - // see the code for "Algorithm CP" on p. 323. - // - // What's going on is we have some critical factorization (u, v) of the - // needle, and we want to determine whether u is a suffix of - // &v[..period]. If it is, we use "Algorithm CP1". Otherwise we use - // "Algorithm CP2", which is optimized for when the period of the needle - // is large. - if &needle[..crit_pos] == &needle[period.. period + crit_pos] { - TwoWaySearcher { - crit_pos: crit_pos, - period: period, - byteset: byteset, - - position: 0, - memory: 0 - } - } else { - TwoWaySearcher { - crit_pos: crit_pos, - period: cmp::max(crit_pos, needle.len() - crit_pos) + 1, - byteset: byteset, - - position: 0, - memory: usize::MAX // Dummy value to signify that the period is long - } - } - } - - // One of the main ideas of Two-Way is that we factorize the needle into - // two halves, (u, v), and begin trying to find v in the haystack by scanning - // left to right. If v matches, we try to match u by scanning right to left. - // How far we can jump when we encounter a mismatch is all based on the fact - // that (u, v) is a critical factorization for the needle. - #[inline] - fn next(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) - -> Option<(usize, usize)> { - 'search: loop { - // Check that we have room to search in - if self.position + needle.len() > haystack.len() { - return None; - } - - // Quickly skip by large portions unrelated to our substring - if (self.byteset >> - ((haystack[self.position + needle.len() - 1] & 0x3f) - as usize)) & 1 == 0 { - self.position += needle.len(); - if !long_period { - self.memory = 0; - } - continue 'search; - } - - // See if the right part of the needle matches - let start = if long_period { self.crit_pos } - else { cmp::max(self.crit_pos, self.memory) }; - for i in start..needle.len() { - if needle[i] != haystack[self.position + i] { - self.position += i - self.crit_pos + 1; - if !long_period { - self.memory = 0; - } - continue 'search; - } - } - - // See if the left part of the needle matches - let start = if long_period { 0 } else { self.memory }; - for i in (start..self.crit_pos).rev() { - if needle[i] != haystack[self.position + i] { - self.position += self.period; - if !long_period { - self.memory = needle.len() - self.period; - } - continue 'search; - } - } - - // We have found a match! - let match_pos = self.position; - self.position += needle.len(); // add self.period for all matches - if !long_period { - self.memory = 0; // set to needle.len() - self.period for all matches - } - return Some((match_pos, match_pos + needle.len())); - } - } - - // Computes a critical factorization (u, v) of `arr`. - // Specifically, returns (i, p), where i is the starting index of v in some - // critical factorization (u, v) and p = period(v) - #[inline] - #[allow(dead_code)] - #[allow(deprecated)] - fn maximal_suffix(arr: &[u8], reversed: bool) -> (usize, usize) { - let mut left: usize = !0; // Corresponds to i in the paper - let mut right = 0; // Corresponds to j in the paper - let mut offset = 1; // Corresponds to k in the paper - let mut period = 1; // Corresponds to p in the paper - - while right + offset < arr.len() { - let a; - let b; - if reversed { - a = arr[left.wrapping_add(offset)]; - b = arr[right + offset]; - } else { - a = arr[right + offset]; - b = arr[left.wrapping_add(offset)]; - } - if a < b { - // Suffix is smaller, period is entire prefix so far. - right += offset; - offset = 1; - period = right.wrapping_sub(left); - } else if a == b { - // Advance through repetition of the current period. - if offset == period { - right += offset; - offset = 1; - } else { - offset += 1; - } - } else { - // Suffix is larger, start over from current location. - left = right; - right += 1; - offset = 1; - period = 1; - } - } - (left.wrapping_add(1), period) - } -} - -/// The internal state of an iterator that searches for matches of a substring -/// within a larger string using a dynamically chosen search algorithm -#[derive(Clone)] -// NB: This is kept around for convenience because -// it is planned to be used again in the future -enum OldSearcher { - TwoWay(TwoWaySearcher), - TwoWayLong(TwoWaySearcher), -} - -impl OldSearcher { - #[allow(dead_code)] - fn new(haystack: &[u8], needle: &[u8]) -> OldSearcher { - if needle.is_empty() { - // Handle specially - unimplemented!() - // FIXME: Tune this. - // FIXME(#16715): This unsigned integer addition will probably not - // overflow because that would mean that the memory almost solely - // consists of the needle. Needs #16715 to be formally fixed. - } else if needle.len() + 20 > haystack.len() { - // Use naive searcher - unimplemented!() - } else { - let searcher = TwoWaySearcher::new(needle); - if searcher.memory == usize::MAX { // If the period is long - TwoWayLong(searcher) - } else { - TwoWay(searcher) - } - } - } -} - -#[derive(Clone)] -// NB: This is kept around for convenience because -// it is planned to be used again in the future -struct OldMatchIndices<'a, 'b> { - // constants - haystack: &'a str, - needle: &'b str, - searcher: OldSearcher -} - -impl<'a, 'b> OldMatchIndices<'a, 'b> { - #[inline] - #[allow(dead_code)] - fn next(&mut self) -> Option<(usize, usize)> { - match self.searcher { - TwoWay(ref mut searcher) - => searcher.next(self.haystack.as_bytes(), self.needle.as_bytes(), false), - TwoWayLong(ref mut searcher) - => searcher.next(self.haystack.as_bytes(), self.needle.as_bytes(), true), - } - } -} - /* Section: Comparing strings */ diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs index 8bdbab55211d8..dccdaa9120d10 100644 --- a/src/libcore/str/pattern.rs +++ b/src/libcore/str/pattern.rs @@ -17,6 +17,8 @@ reason = "API not fully fleshed out and ready to be stabilized")] use prelude::*; +use core::cmp; +use usize; // Pattern @@ -341,148 +343,6 @@ unsafe impl<'a, C: CharEq> ReverseSearcher<'a> for CharEqSearcher<'a, C> { impl<'a, C: CharEq> DoubleEndedSearcher<'a> for CharEqSearcher<'a, C> {} -///////////////////////////////////////////////////////////////////////////// -// Impl for &str -///////////////////////////////////////////////////////////////////////////// - -// Todo: Optimize the naive implementation here - -/// Associated type for `<&str as Pattern<'a>>::Searcher`. -#[derive(Clone)] -pub struct StrSearcher<'a, 'b> { - haystack: &'a str, - needle: &'b str, - start: usize, - end: usize, - state: State, -} - -#[derive(Clone, PartialEq)] -enum State { Done, NotDone, Reject(usize, usize) } -impl State { - #[inline] fn done(&self) -> bool { *self == State::Done } - #[inline] fn take(&mut self) -> State { ::mem::replace(self, State::NotDone) } -} - -/// Non-allocating substring search. -/// -/// Will handle the pattern `""` as returning empty matches at each utf8 -/// boundary. -impl<'a, 'b> Pattern<'a> for &'b str { - type Searcher = StrSearcher<'a, 'b>; - - #[inline] - fn into_searcher(self, haystack: &'a str) -> StrSearcher<'a, 'b> { - StrSearcher { - haystack: haystack, - needle: self, - start: 0, - end: haystack.len(), - state: State::NotDone, - } - } -} - -unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> { - #[inline] - fn haystack(&self) -> &'a str { - self.haystack - } - - #[inline] - fn next(&mut self) -> SearchStep { - str_search_step(self, - |m: &mut StrSearcher| { - // Forward step for empty needle - let current_start = m.start; - if !m.state.done() { - m.start = m.haystack.char_range_at(current_start).next; - m.state = State::Reject(current_start, m.start); - } - SearchStep::Match(current_start, current_start) - }, - |m: &mut StrSearcher| { - // Forward step for nonempty needle - let current_start = m.start; - // Compare byte window because this might break utf8 boundaries - let possible_match = &m.haystack.as_bytes()[m.start .. m.start + m.needle.len()]; - if possible_match == m.needle.as_bytes() { - m.start += m.needle.len(); - SearchStep::Match(current_start, m.start) - } else { - // Skip a char - let haystack_suffix = &m.haystack[m.start..]; - m.start += haystack_suffix.chars().next().unwrap().len_utf8(); - SearchStep::Reject(current_start, m.start) - } - }) - } -} - -unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> { - #[inline] - fn next_back(&mut self) -> SearchStep { - str_search_step(self, - |m: &mut StrSearcher| { - // Backward step for empty needle - let current_end = m.end; - if !m.state.done() { - m.end = m.haystack.char_range_at_reverse(current_end).next; - m.state = State::Reject(m.end, current_end); - } - SearchStep::Match(current_end, current_end) - }, - |m: &mut StrSearcher| { - // Backward step for nonempty needle - let current_end = m.end; - // Compare byte window because this might break utf8 boundaries - let possible_match = &m.haystack.as_bytes()[m.end - m.needle.len() .. m.end]; - if possible_match == m.needle.as_bytes() { - m.end -= m.needle.len(); - SearchStep::Match(m.end, current_end) - } else { - // Skip a char - let haystack_prefix = &m.haystack[..m.end]; - m.end -= haystack_prefix.chars().rev().next().unwrap().len_utf8(); - SearchStep::Reject(m.end, current_end) - } - }) - } -} - -// Helper function for encapsulating the common control flow -// of doing a search step from the front or doing a search step from the back -fn str_search_step(mut m: &mut StrSearcher, - empty_needle_step: F, - nonempty_needle_step: G) -> SearchStep - where F: FnOnce(&mut StrSearcher) -> SearchStep, - G: FnOnce(&mut StrSearcher) -> SearchStep -{ - if m.state.done() { - SearchStep::Done - } else if m.needle.is_empty() && m.start <= m.end { - // Case for needle == "" - if let State::Reject(a, b) = m.state.take() { - SearchStep::Reject(a, b) - } else { - if m.start == m.end { - m.state = State::Done; - } - empty_needle_step(&mut m) - } - } else if m.start + m.needle.len() <= m.end { - // Case for needle != "" - nonempty_needle_step(&mut m) - } else if m.start < m.end { - // Remaining slice shorter than needle, reject it - m.state = State::Done; - SearchStep::Reject(m.start, m.end) - } else { - m.state = State::Done; - SearchStep::Done - } -} - ///////////////////////////////////////////////////////////////////////////// macro_rules! pattern_methods { @@ -633,3 +493,579 @@ impl<'a, F> Pattern<'a> for F where F: FnMut(char) -> bool { impl<'a, 'b> Pattern<'a> for &'b &'b str { pattern_methods!(StrSearcher<'a, 'b>, |&s| s, |s| s); } + +///////////////////////////////////////////////////////////////////////////// +// Impl for &str +///////////////////////////////////////////////////////////////////////////// + +/// Non-allocating substring search. +/// +/// Will handle the pattern `""` as returning empty matches at each character +/// boundary. +impl<'a, 'b> Pattern<'a> for &'b str { + type Searcher = StrSearcher<'a, 'b>; + + #[inline] + fn into_searcher(self, haystack: &'a str) -> StrSearcher<'a, 'b> { + StrSearcher::new(haystack, self) + } + + /// Checks whether the pattern matches at the front of the haystack + #[inline] + fn is_prefix_of(self, haystack: &'a str) -> bool { + // Use `as_bytes` so that we can slice through a character in the haystack. + // Since self is always valid UTF-8, this can't result in a false positive. + self.len() <= haystack.len() && + self.as_bytes() == &haystack.as_bytes()[..self.len()] + } + + /// Checks whether the pattern matches at the back of the haystack + #[inline] + fn is_suffix_of(self, haystack: &'a str) -> bool { + self.len() <= haystack.len() && + self.as_bytes() == &haystack.as_bytes()[haystack.len() - self.len()..] + } +} + + +///////////////////////////////////////////////////////////////////////////// +// Two Way substring searcher +///////////////////////////////////////////////////////////////////////////// + +#[derive(Clone, Debug)] +/// Associated type for `<&str as Pattern<'a>>::Searcher`. +pub struct StrSearcher<'a, 'b> { + haystack: &'a str, + needle: &'b str, + + searcher: StrSearcherImpl, +} + +#[derive(Clone, Debug)] +enum StrSearcherImpl { + Empty(EmptyNeedle), + TwoWay(TwoWaySearcher), +} + +#[derive(Clone, Debug)] +struct EmptyNeedle { + position: usize, + end: usize, + is_match_fw: bool, + is_match_bw: bool, +} + +impl<'a, 'b> StrSearcher<'a, 'b> { + fn new(haystack: &'a str, needle: &'b str) -> StrSearcher<'a, 'b> { + if needle.is_empty() { + StrSearcher { + haystack: haystack, + needle: needle, + searcher: StrSearcherImpl::Empty(EmptyNeedle { + position: 0, + end: haystack.len(), + is_match_fw: true, + is_match_bw: true, + }), + } + } else { + StrSearcher { + haystack: haystack, + needle: needle, + searcher: StrSearcherImpl::TwoWay( + TwoWaySearcher::new(needle.as_bytes(), haystack.len()) + ), + } + } + } +} + +unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> { + fn haystack(&self) -> &'a str { self.haystack } + + #[inline] + fn next(&mut self) -> SearchStep { + match self.searcher { + StrSearcherImpl::Empty(ref mut searcher) => { + // empty needle rejects every char and matches every empty string between them + let is_match = searcher.is_match_fw; + searcher.is_match_fw = !searcher.is_match_fw; + let pos = searcher.position; + match self.haystack[pos..].chars().next() { + _ if is_match => SearchStep::Match(pos, pos), + None => SearchStep::Done, + Some(ch) => { + searcher.position += ch.len_utf8(); + SearchStep::Reject(pos, searcher.position) + } + } + } + StrSearcherImpl::TwoWay(ref mut searcher) => { + // TwoWaySearcher produces valid *Match* indices that split at char boundaries + // as long as it does correct matching and that haystack and needle are + // valid UTF-8 + // *Rejects* from the algorithm can fall on any indices, but we will walk them + // manually to the next character boundary, so that they are utf-8 safe. + if searcher.position == self.haystack.len() { + return SearchStep::Done; + } + let is_long = searcher.memory == usize::MAX; + match searcher.next::(self.haystack.as_bytes(), + self.needle.as_bytes(), + is_long) + { + SearchStep::Reject(a, mut b) => { + // skip to next char boundary + while !self.haystack.is_char_boundary(b) { + b += 1; + } + searcher.position = cmp::max(b, searcher.position); + SearchStep::Reject(a, b) + } + otherwise => otherwise, + } + } + } + } + + #[inline(always)] + fn next_match(&mut self) -> Option<(usize, usize)> { + match self.searcher { + StrSearcherImpl::Empty(..) => { + loop { + match self.next() { + SearchStep::Match(a, b) => return Some((a, b)), + SearchStep::Done => return None, + SearchStep::Reject(..) => { } + } + } + } + StrSearcherImpl::TwoWay(ref mut searcher) => { + let is_long = searcher.memory == usize::MAX; + if is_long { + searcher.next::(self.haystack.as_bytes(), + self.needle.as_bytes(), + true) + } else { + searcher.next::(self.haystack.as_bytes(), + self.needle.as_bytes(), + false) + } + } + } + } + +} +unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> { + #[inline] + fn next_back(&mut self) -> SearchStep { + match self.searcher { + StrSearcherImpl::Empty(ref mut searcher) => { + let is_match = searcher.is_match_bw; + searcher.is_match_bw = !searcher.is_match_bw; + let end = searcher.end; + match self.haystack[..end].chars().next_back() { + _ if is_match => SearchStep::Match(end, end), + None => SearchStep::Done, + Some(ch) => { + searcher.end -= ch.len_utf8(); + SearchStep::Reject(searcher.end, end) + } + } + } + StrSearcherImpl::TwoWay(ref mut searcher) => { + if searcher.end == 0 { + return SearchStep::Done; + } + match searcher.next_back::(self.haystack.as_bytes(), + self.needle.as_bytes()) + { + SearchStep::Reject(mut a, b) => { + // skip to next char boundary + while !self.haystack.is_char_boundary(a) { + a -= 1; + } + searcher.end = cmp::min(a, searcher.end); + SearchStep::Reject(a, b) + } + otherwise => otherwise, + } + } + } + } + + #[inline] + fn next_match_back(&mut self) -> Option<(usize, usize)> { + match self.searcher { + StrSearcherImpl::Empty(..) => { + loop { + match self.next_back() { + SearchStep::Match(a, b) => return Some((a, b)), + SearchStep::Done => return None, + SearchStep::Reject(..) => { } + } + } + } + StrSearcherImpl::TwoWay(ref mut searcher) => { + searcher.next_back::(self.haystack.as_bytes(), + self.needle.as_bytes()) + } + } + } +} + +/// The internal state of an iterator that searches for matches of a substring +/// within a larger string using two-way search +#[derive(Clone, Debug)] +struct TwoWaySearcher { + // constants + crit_pos: usize, + period: usize, + byteset: u64, + + // variables + position: usize, + end: usize, + memory: usize +} + +/* + This is the Two-Way search algorithm, which was introduced in the paper: + Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675. + + Here's some background information. + + A *word* is a string of symbols. The *length* of a word should be a familiar + notion, and here we denote it for any word x by |x|. + (We also allow for the possibility of the *empty word*, a word of length zero). + + If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be a + *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] == x[i+p]. + For example, both 1 and 2 are periods for the string "aa". As another example, + the only period of the string "abcd" is 4. + + We denote by period(x) the *smallest* period of x (provided that x is non-empty). + This is always well-defined since every non-empty word x has at least one period, + |x|. We sometimes call this *the period* of x. + + If u, v and x are words such that x = uv, where uv is the concatenation of u and + v, then we say that (u, v) is a *factorization* of x. + + Let (u, v) be a factorization for a word x. Then if w is a non-empty word such + that both of the following hold + + - either w is a suffix of u or u is a suffix of w + - either w is a prefix of v or v is a prefix of w + + then w is said to be a *repetition* for the factorization (u, v). + + Just to unpack this, there are four possibilities here. Let w = "abc". Then we + might have: + + - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde") + - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab") + - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi") + - u is a suffix of w and v is a prefix of w. ex: ("bc", "a") + + Note that the word vu is a repetition for any factorization (u,v) of x = uv, + so every factorization has at least one repetition. + + If x is a string and (u, v) is a factorization for x, then a *local period* for + (u, v) is an integer r such that there is some word w such that |w| = r and w is + a repetition for (u, v). + + We denote by local_period(u, v) the smallest local period of (u, v). We sometimes + call this *the local period* of (u, v). Provided that x = uv is non-empty, this + is well-defined (because each non-empty word has at least one factorization, as + noted above). + + It can be proven that the following is an equivalent definition of a local period + for a factorization (u, v): any positive integer r such that x[i] == x[i+r] for + all i such that |u| - r <= i <= |u| - 1 and such that both x[i] and x[i+r] are + defined. (i.e. i > 0 and i + r < |x|). + + Using the above reformulation, it is easy to prove that + + 1 <= local_period(u, v) <= period(uv) + + A factorization (u, v) of x such that local_period(u,v) = period(x) is called a + *critical factorization*. + + The algorithm hinges on the following theorem, which is stated without proof: + + **Critical Factorization Theorem** Any word x has at least one critical + factorization (u, v) such that |u| < period(x). + + The purpose of maximal_suffix is to find such a critical factorization. + +*/ +impl TwoWaySearcher { + fn new(needle: &[u8], end: usize) -> TwoWaySearcher { + let (crit_pos_false, period_false) = TwoWaySearcher::maximal_suffix(needle, false); + let (crit_pos_true, period_true) = TwoWaySearcher::maximal_suffix(needle, true); + + let (crit_pos, period) = + if crit_pos_false > crit_pos_true { + (crit_pos_false, period_false) + } else { + (crit_pos_true, period_true) + }; + + // This isn't in the original algorithm, as far as I'm aware. + let byteset = needle.iter() + .fold(0, |a, &b| (1 << ((b & 0x3f) as usize)) | a); + + // A particularly readable explanation of what's going on here can be found + // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically + // see the code for "Algorithm CP" on p. 323. + // + // What's going on is we have some critical factorization (u, v) of the + // needle, and we want to determine whether u is a suffix of + // &v[..period]. If it is, we use "Algorithm CP1". Otherwise we use + // "Algorithm CP2", which is optimized for when the period of the needle + // is large. + if &needle[..crit_pos] == &needle[period.. period + crit_pos] { + // short period case + TwoWaySearcher { + crit_pos: crit_pos, + period: period, + byteset: byteset, + + position: 0, + end: end, + memory: 0 + } + } else { + // long period case + // we have an approximation to the actual period, and don't use memory. + TwoWaySearcher { + crit_pos: crit_pos, + period: cmp::max(crit_pos, needle.len() - crit_pos) + 1, + byteset: byteset, + + position: 0, + end: end, + memory: usize::MAX // Dummy value to signify that the period is long + } + } + } + + #[inline(always)] + fn byteset_contains(&self, byte: u8) -> bool { + (self.byteset >> ((byte & 0x3f) as usize)) & 1 != 0 + } + + // One of the main ideas of Two-Way is that we factorize the needle into + // two halves, (u, v), and begin trying to find v in the haystack by scanning + // left to right. If v matches, we try to match u by scanning right to left. + // How far we can jump when we encounter a mismatch is all based on the fact + // that (u, v) is a critical factorization for the needle. + #[inline(always)] + fn next(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) + -> S::Output + where S: TwoWayStrategy + { + // `next()` uses `self.position` as its cursor + let old_pos = self.position; + 'search: loop { + // Check that we have room to search in + if needle.len() > haystack.len() - self.position { + self.position = haystack.len(); + return S::rejecting(old_pos, self.position); + } + + if S::use_early_reject() && old_pos != self.position { + return S::rejecting(old_pos, self.position); + } + + // Quickly skip by large portions unrelated to our substring + if !self.byteset_contains(haystack[self.position + needle.len() - 1]) { + self.position += needle.len(); + if !long_period { + self.memory = 0; + } + continue 'search; + } + + // See if the right part of the needle matches + let start = if long_period { self.crit_pos } + else { cmp::max(self.crit_pos, self.memory) }; + for i in start..needle.len() { + if needle[i] != haystack[self.position + i] { + self.position += i - self.crit_pos + 1; + if !long_period { + self.memory = 0; + } + continue 'search; + } + } + + // See if the left part of the needle matches + let start = if long_period { 0 } else { self.memory }; + for i in (start..self.crit_pos).rev() { + if needle[i] != haystack[self.position + i] { + self.position += self.period; + if !long_period { + self.memory = needle.len() - self.period; + } + continue 'search; + } + } + + // We have found a match! + let match_pos = self.position; + + // Note: add self.period instead of needle.len() to have overlapping matches + self.position += needle.len(); + if !long_period { + self.memory = 0; // set to needle.len() - self.period for overlapping matches + } + + return S::matching(match_pos, match_pos + needle.len()); + } + } + + // Follows the ideas in `next()`. + // + // All the definitions are completely symmetrical, with period(x) = period(reverse(x)) + // and local_period(u, v) = local_period(reverse(v), reverse(u)), so if (u, v) + // is a critical factorization, so is (reverse(v), reverse(u)). Similarly, + // the "period" stored in self.period is the real period if long_period is + // false, and so is still valid for a reversed needle, and if long_period is + // true, all the algorithm requires is that self.period is less than or + // equal to the real period, which must be true for the forward case anyway. + // + // To search in reverse through the haystack, we search forward through + // a reversed haystack with a reversed needle, and the above paragraph shows + // that the precomputed parameters can be left alone. + #[inline] + fn next_back(&mut self, haystack: &[u8], needle: &[u8]) + -> S::Output + where S: TwoWayStrategy + { + // `next_back()` uses `self.end` as its cursor -- so that `next()` and `next_back()` + // are independent. + let old_end = self.end; + 'search: loop { + // Check that we have room to search in + if needle.len() > self.end { + self.end = 0; + return S::rejecting(0, old_end); + } + + if S::use_early_reject() && old_end != self.end { + return S::rejecting(self.end, old_end); + } + + // Quickly skip by large portions unrelated to our substring + if !self.byteset_contains(haystack[self.end - needle.len()]) { + self.end -= needle.len(); + continue 'search; + } + + // See if the left part of the needle matches + for i in (0..self.crit_pos).rev() { + if needle[i] != haystack[self.end - needle.len() + i] { + self.end -= self.crit_pos - i; + continue 'search; + } + } + + // See if the right part of the needle matches + for i in self.crit_pos..needle.len() { + if needle[i] != haystack[self.end - needle.len() + i] { + self.end -= self.period; + continue 'search; + } + } + + // We have found a match! + let match_pos = self.end - needle.len(); + // Note: sub self.period instead of needle.len() to have overlapping matches + self.end -= needle.len(); + + return S::matching(match_pos, match_pos + needle.len()); + } + } + + // Computes a critical factorization (u, v) of `arr`. + // Specifically, returns (i, p), where i is the starting index of v in some + // critical factorization (u, v) and p = period(v) + #[inline] + fn maximal_suffix(arr: &[u8], reversed: bool) -> (usize, usize) { + let mut left: usize = !0; // Corresponds to i in the paper + let mut right = 0; // Corresponds to j in the paper + let mut offset = 1; // Corresponds to k in the paper + let mut period = 1; // Corresponds to p in the paper + + while right + offset < arr.len() { + let a; + let b; + if reversed { + a = arr[left.wrapping_add(offset)]; + b = arr[right + offset]; + } else { + a = arr[right + offset]; + b = arr[left.wrapping_add(offset)]; + } + if a < b { + // Suffix is smaller, period is entire prefix so far. + right += offset; + offset = 1; + period = right.wrapping_sub(left); + } else if a == b { + // Advance through repetition of the current period. + if offset == period { + right += offset; + offset = 1; + } else { + offset += 1; + } + } else { + // Suffix is larger, start over from current location. + left = right; + right += 1; + offset = 1; + period = 1; + } + } + (left.wrapping_add(1), period) + } +} + +// TwoWayStrategy allows the algorithm to either skip non-matches as quickly +// as possible, or to work in a mode where it emits Rejects relatively quickly. +trait TwoWayStrategy { + type Output; + fn use_early_reject() -> bool; + fn rejecting(usize, usize) -> Self::Output; + fn matching(usize, usize) -> Self::Output; +} + +/// Skip to match intervals as quickly as possible +enum MatchOnly { } + +impl TwoWayStrategy for MatchOnly { + type Output = Option<(usize, usize)>; + + #[inline] + fn use_early_reject() -> bool { false } + #[inline] + fn rejecting(_a: usize, _b: usize) -> Self::Output { None } + #[inline] + fn matching(a: usize, b: usize) -> Self::Output { Some((a, b)) } +} + +/// Emit Rejects regularly +enum RejectAndMatch { } + +impl TwoWayStrategy for RejectAndMatch { + type Output = SearchStep; + + #[inline] + fn use_early_reject() -> bool { true } + #[inline] + fn rejecting(a: usize, b: usize) -> Self::Output { SearchStep::Reject(a, b) } + #[inline] + fn matching(a: usize, b: usize) -> Self::Output { SearchStep::Match(a, b) } +}