Skip to content

Commit

Permalink
test: add a case for multi-character breakpoint tokenizer
Browse files Browse the repository at this point in the history
  • Loading branch information
trag1c committed Dec 16, 2024
1 parent 21f24b9 commit b08f6e4
Showing 1 changed file with 26 additions and 0 deletions.
26 changes: 26 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -663,6 +663,32 @@ mod tests {
);
}

// Regression case for literals that are longer than one character and that
// overlap each other ("ab", "bc", "abcd"). The expected list pairs each
// literal with a number that lines up with the 0-based start offset of the
// match in `source` (e.g. the first "ab" begins at index 5).
// NOTE(review): the exact precedence rules between overlapping literals live
// in `Tokenizer`, outside this block — confirm there before editing offsets.
#[test]
fn multichar_breakpoint_tokenization() {
    // (name, literal text) pairs; tuples of `&str` are `Copy`, so they can be
    // reused below when building the expected output.
    let x = ("x", "ab");
    let y = ("y", "bc");
    let z = ("z", "abcd");

    let literals = FxHashMap::from_iter([x, y, z]);
    let tokenizer = Tokenizer::default().with_literals(&literals).unwrap();

    let source = "ccddbabcaabcccdcbaaabdaabcbaabbbabaaaccabcdabaabadbcacddacbddbcb";
    let expected = make_output(vec![
        (x, 5),
        (x, 9),
        (x, 19),
        (x, 23),
        (x, 28),
        (x, 32),
        (z, 39),
        (x, 43),
        (x, 46),
        (y, 50),
        (y, 61),
    ]);

    // `flatten()` keeps only the successfully produced items of the
    // tokenizer's iterator before comparison.
    let tokens: Vec<_> = tokenizer.tokenize(source).flatten().collect();
    assert_eq!(tokens, expected);
}

#[test]
fn fast_tokenization_with_ignoreset() {
let (foo, bar) = (("foo", "x"), ("bar", "y"));
Expand Down

0 comments on commit b08f6e4

Please sign in to comment.