From b08f6e4e91b45fa8e7ca8113a029afbfca49f5dd Mon Sep 17 00:00:00 2001 From: trag1c Date: Mon, 16 Dec 2024 22:11:36 +0100 Subject: [PATCH] test: add a case for multi-character breakpoint tokenizer --- src/lib.rs | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index a23e0c1..875f50e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -663,6 +663,32 @@ mod tests { ); } + /* Checks tokenization when the registered literals are multi-character and overlap one another: x = "ab", y = "bc", z = "abcd". In the expected list, the number paired with each literal coincides with the 0-based offset of that literal's occurrence in `source`. The expectations encode non-overlapping, left-to-right matches: the "abcd" at offset 39 is expected as z rather than x, and a "bc" that starts inside an already-matched "ab" (for example at offset 6) is not expected as y. NOTE(review): Tokenizer and make_output are defined outside this hunk; the offset interpretation is inferred from the fixture and should be confirmed against make_output. */ #[test] + fn multichar_breakpoint_tokenization() { + let (x, y, z) = (("x", "ab"), ("y", "bc"), ("z", "abcd")); + let tok = Tokenizer::default() + .with_literals(&FxHashMap::from_iter([x, y, z])) + .unwrap(); + let source = "ccddbabcaabcccdcbaaabdaabcbaabbbabaaaccabcdabaabadbcacddacbddbcb"; + let tokens: Vec<_> = tok.tokenize(source).flatten().collect(); + assert_eq!( + tokens, + make_output(vec![ + (x, 5), + (x, 9), + (x, 19), + (x, 23), + (x, 28), + (x, 32), + (z, 39), + (x, 43), + (x, 46), + (y, 50), + (y, 61) + ]) + ); + } + #[test] fn fast_tokenization_with_ignoreset() { let (foo, bar) = (("foo", "x"), ("bar", "y"));