Commit

that is still fairly ugly
ArthurZucker committed Jun 19, 2024
1 parent 8c36539 commit 9d389bc
Showing 2 changed files with 4 additions and 14 deletions.
16 changes: 3 additions & 13 deletions tokenizers/src/tokenizer/mod.rs
@@ -862,19 +862,9 @@ where
"Pre-tok String: {} vs token {} vs pret {:?}",
string.original,
token,
string
.get_splits(OffsetReferential::Normalized, OffsetType::Byte)
.first()
.unwrap()
string.splits.first().unwrap().normalized.normalized.clone()

[CI annotation] Check failure on line 865 in tokenizers/src/tokenizer/mod.rs, reported by all four GitHub Actions jobs (Check everything builds & tests (ubuntu-latest); Check it builds for Windows 32-bit (3.7), (3.9), (3.10)): field `splits` of struct `tokenizer::pre_tokenizer::PreTokenizedString` is private
);
-                    Some(
-                        string
-                            .get_splits(OffsetReferential::Normalized, OffsetType::Byte)
-                            .first()
-                            .unwrap()
-                            .0
-                            .to_string(),
-                    )
+                    Some(string.splits.first().unwrap().normalized.normalized.clone())

[CI annotation] Check failure on line 867 in tokenizers/src/tokenizer/mod.rs, reported by the same four GitHub Actions jobs: field `splits` of struct `tokenizer::pre_tokenizer::PreTokenizedString` is private
} else {
println!("String: {}", token);
Some(token)
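The CI failures above come from Rust's field privacy rules: `splits` is declared without `pub` inside the pre-tokenizer module, so `string.splits` only compiles for code in that module, while the removed `get_splits` accessor is the public path. A minimal standalone sketch (simplified, hypothetical types; not the real crate's definitions) of that pattern:

```rust
// Sketch of the privacy issue flagged by CI (simplified names; the real
// struct is tokenizer::pre_tokenizer::PreTokenizedString).
mod pre_tokenizer {
    pub struct PreTokenizedString {
        // Private field: `s.splits` from outside this module is
        // error[E0616]: field `splits` ... is private.
        splits: Vec<String>,
    }

    impl PreTokenizedString {
        pub fn new(splits: Vec<String>) -> Self {
            Self { splits }
        }

        // Public accessor, analogous in spirit to the `get_splits`
        // call the commit removed.
        pub fn get_splits(&self) -> &[String] {
            &self.splits
        }
    }
}

fn main() {
    let s = pre_tokenizer::PreTokenizedString::new(vec!["Hey".into(), "!".into()]);
    // let first = &s.splits[0]; // would not compile: field is private
    let first = &s.get_splits()[0]; // compiles: goes through the accessor
    println!("{first}");
}
```

This is why the direct field access only works if the field (or a chain of fields, as with `normalized.normalized`) is made `pub`, which is exactly what the second file in this commit does for `NormalizedString`.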
@@ -1334,7 +1324,7 @@ mod test {
let mut tokenizer = Tokenizer::from_pretrained("meta-llama/Meta-Llama-3-8B", None).unwrap();
tokenizer.add_tokens(&[AddedToken::from("ĠåĹİ", false)]); // this is the byte-level for 嗎
let encoded = tokenizer
.encode("Hey! how is this token: 嗎", false)
.encode("Hey! how is this token: 嗎 and ĠåĹİ", false)
.unwrap();
println!("Encoded tokens: {:?}", encoded.get_ids());
let decoded = tokenizer.decode(encoded.get_ids(), false);
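The comment in the test says `ĠåĹİ` is "the byte-level for 嗎": GPT-2-style byte-level pre-tokenization maps every byte to a printable character (printable bytes to themselves, the rest shifted past U+0100), so `" 嗎"` (0x20 0xE5 0x97 0x8E in UTF-8) renders as `ĠåĹİ`. A self-contained sketch of that mapping (not code from this commit):

```rust
// GPT-2-style byte-to-unicode table: printable bytes map to themselves,
// all other bytes are assigned characters starting at U+0100 so every
// byte has a visible, unique representation.
fn bytes_to_unicode() -> [char; 256] {
    let mut table = ['\0'; 256];
    let mut shifted: u32 = 0;
    for b in 0u32..256 {
        let printable = (0x21..=0x7E).contains(&b)   // '!'..='~'
            || (0xA1..=0xAC).contains(&b)            // '¡'..='¬'
            || (0xAE..=0xFF).contains(&b);           // '®'..='ÿ'
        table[b as usize] = if printable {
            char::from_u32(b).unwrap()
        } else {
            let c = char::from_u32(256 + shifted).unwrap();
            shifted += 1;
            c
        };
    }
    table
}

fn main() {
    let table = bytes_to_unicode();
    // " 嗎" is the bytes 0x20 0xE5 0x97 0x8E; map each one.
    let token: String = " 嗎".bytes().map(|b| table[b as usize]).collect();
    println!("{token}"); // ĠåĹİ  (Ġ is the mapped space byte 0x20)
}
```

This is why the added token `AddedToken::from("ĠåĹİ", false)` corresponds to `" 嗎"` with a leading space, and why the test appends the raw `ĠåĹİ` string to the input to probe both spellings.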
2 changes: 1 addition & 1 deletion tokenizers/src/tokenizer/normalizer.rs
@@ -100,7 +100,7 @@ pub struct NormalizedString {
/// The original version of the string, before any modification
original: String,
/// The normalized version of the string, after all modifications
-    normalized: String,
+    pub normalized: String,
/// Mapping from normalized string to original one: (start, end) for each
/// byte of the normalized string
alignments: Vec<(usize, usize)>,
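Making `normalized` pub is the one-line enabler for the field-access chain in the first file: callers can now read `...normalized.normalized` directly, while `original` and `alignments` stay private. A simplified, hypothetical sketch of the struct after this change (the `lowercased` constructor is invented for illustration and is not in the crate):

```rust
#[allow(dead_code)]
pub mod normalizer {
    pub struct NormalizedString {
        /// The original version of the string, before any modification
        original: String,
        /// The normalized version of the string, after all modifications
        pub normalized: String, // now pub, per this commit
        /// Mapping from normalized string to original one: (start, end)
        /// for each byte of the normalized string
        alignments: Vec<(usize, usize)>,
    }

    impl NormalizedString {
        // Hypothetical helper for the sketch: ASCII lowercasing is 1:1 in
        // bytes, so each normalized byte aligns to the same byte range.
        pub fn lowercased(original: &str) -> Self {
            let normalized = original.to_ascii_lowercase();
            let alignments = (0..normalized.len()).map(|i| (i, i + 1)).collect();
            Self { original: original.to_string(), normalized, alignments }
        }
    }
}

fn main() {
    let s = normalizer::NormalizedString::lowercased("Hey!");
    println!("{}", s.normalized); // direct field read, no getter needed
}
```

The trade-off (perhaps part of why the commit message calls this "still fairly ugly") is that a pub field becomes part of the crate's public API and can no longer be renamed or restructured without a breaking change, whereas an accessor like `get_splits` keeps the representation private.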

0 comments on commit 9d389bc