From f53e5147c2063e00652a870ca94f7c61ede1a49c Mon Sep 17 00:00:00 2001
From: Arthur Zucker <arthur.zucker@gmail.com>
Date: Fri, 28 Jun 2024 09:36:10 +0200
Subject: [PATCH] revert and simplify

---
 tokenizers/src/tokenizer/added_vocabulary.rs |  5 +----
 tokenizers/src/tokenizer/mod.rs              | 19 ++++---------------
 2 files changed, 5 insertions(+), 19 deletions(-)

diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs
index a0c2f4542..87158a8c2 100644
--- a/tokenizers/src/tokenizer/added_vocabulary.rs
+++ b/tokenizers/src/tokenizer/added_vocabulary.rs
@@ -216,10 +216,6 @@ impl AddedVocabulary {
     }
 
     /// Get the token matching the given id if it exists
-    #[deprecated(
-        since = "0.19.0",
-        note = "please use `added_vocabulary.simple_id_to_token(id).or_else(|| model.id_to_token(id)` instead"
-    )]
     pub fn id_to_token(&self, id: u32, model: &impl Model) -> Option<String> {
         self.added_tokens_map_r
             .get(&id)
@@ -227,6 +223,7 @@ impl AddedVocabulary {
             .map(|t| t.content.clone())
             .or_else(|| model.id_to_token(id))
     }
 
+    //
     pub fn simple_id_to_token(&self, id: u32) -> Option<String> {
         self.added_tokens_map_r.get(&id).map(|t| t.content.clone())
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index eb6b6ff3c..17d10aca9 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -851,24 +851,13 @@ where
             .iter()
             .filter_map(|id| {
                 self.added_vocabulary
-                    .simple_id_to_token(*id)
-                    .and_then(|token| {
-                        if skip_special_tokens && self.added_vocabulary.is_special_token(&token) {
-                            None
-                        } else if let Some(pre_tok) = &self.pre_tokenizer {
-                            let mut string = PreTokenizedString::from(token);
-                            pre_tok.pre_tokenize(&mut string);
-                            println!("Pre-tok String: {}", string.original);
-                            Some(string.original)
-                        } else {
-                            println!("String: {}", token);
-                            Some(token)
-                        }
+                    .id_to_token(*id, &self.model)
+                    .filter(|token| {
+                        !skip_special_tokens || !self.added_vocabulary.is_special_token(token)
                     })
-                    .or_else(|| self.model.id_to_token(*id))
             })
             .collect::<Vec<_>>();
-        println!("This should print: {:?}", tokens);
+
         if let Some(decoder) = &self.decoder {
             decoder.decode(tokens)
         } else {