From d6f7625553c2b5b7e11cec66bd537ac4b5f77aec Mon Sep 17 00:00:00 2001
From: Hamir Mahal
Date: Tue, 1 Oct 2024 15:54:47 -0700
Subject: [PATCH] style: simplify string formatting for readability

---
 tokenizers/benches/unigram_benchmark.rs        |  4 ++--
 tokenizers/examples/serialization.rs           |  2 +-
 tokenizers/src/decoders/wordpiece.rs           |  2 +-
 tokenizers/src/models/bpe/model.rs             | 10 +++++-----
 tokenizers/src/models/bpe/trainer.rs           |  6 +++---
 tokenizers/src/models/mod.rs                   |  2 +-
 tokenizers/src/models/unigram/model.rs         |  6 +++---
 tokenizers/src/models/unigram/serialization.rs |  2 +-
 tokenizers/src/models/wordlevel/mod.rs         |  2 +-
 tokenizers/src/models/wordpiece/mod.rs         |  4 ++--
 tokenizers/src/processors/template.rs          |  2 +-
 tokenizers/src/tokenizer/normalizer.rs         |  2 +-
 tokenizers/src/tokenizer/serialization.rs      |  2 +-
 tokenizers/tests/documentation.rs              |  8 ++++----
 tokenizers/tests/unigram.rs                    |  2 +-
 15 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/tokenizers/benches/unigram_benchmark.rs b/tokenizers/benches/unigram_benchmark.rs
index 0b0cf8c46..9121a1937 100644
--- a/tokenizers/benches/unigram_benchmark.rs
+++ b/tokenizers/benches/unigram_benchmark.rs
@@ -21,7 +21,7 @@ pub fn bench_train(c: &mut Criterion) {
     let mut word_counts = HashMap::new();
     content.split_whitespace().for_each(|word| {
         // This is important for the test of char vs u8
-        let word = format!("▁{}", word);
+        let word = format!("▁{word}");
         *word_counts.entry(word).or_insert(0) += 1;
     });
@@ -49,7 +49,7 @@ pub fn bench_train(c: &mut Criterion) {
     let mut word_counts = HashMap::new();
     content.split_whitespace().for_each(|word| {
         // This is important for the test of char vs u8
-        let word = format!("▁{}", word);
+        let word = format!("▁{word}");
         *word_counts.entry(word).or_insert(0) += 1;
     });
diff --git a/tokenizers/examples/serialization.rs b/tokenizers/examples/serialization.rs
index 242d339f8..b3f68ae62 100644
--- a/tokenizers/examples/serialization.rs
+++ b/tokenizers/examples/serialization.rs
@@ -8,7 +8,7 @@ fn main() {
     // Mix special and not special
     // You can make sure ids are in order, and special status is correct.
     let tokens: Vec<_> = (0..120_000)
-        .map(|i| AddedToken::from(format!("[SPECIAL_{}]", i), i % 2 == 0))
+        .map(|i| AddedToken::from(format!("[SPECIAL_{i}]"), i % 2 == 0))
         .collect();
     tokenizer.add_tokens(&tokens);
     tokenizer.save("_tok.json", true).unwrap();
diff --git a/tokenizers/src/decoders/wordpiece.rs b/tokenizers/src/decoders/wordpiece.rs
index 8ecd3987c..1a78586e2 100644
--- a/tokenizers/src/decoders/wordpiece.rs
+++ b/tokenizers/src/decoders/wordpiece.rs
@@ -53,7 +53,7 @@ impl Decoder for WordPiece {
                 if token.starts_with(&self.prefix) {
                     *token = token.replacen(&self.prefix, "", 1);
                 } else {
-                    *token = format!(" {}", token);
+                    *token = format!(" {token}");
                 }
                 if self.cleanup {
diff --git a/tokenizers/src/models/bpe/model.rs b/tokenizers/src/models/bpe/model.rs
index 9cdd6213b..86fe74d50 100644
--- a/tokenizers/src/models/bpe/model.rs
+++ b/tokenizers/src/models/bpe/model.rs
@@ -385,13 +385,13 @@ impl BPE {
             // Add the `continuing_subword_prefix` if relevant
             if !is_first {
                 if let Some(ref prefix) = self.continuing_subword_prefix {
-                    s = format!("{}{}", prefix, s).into()
+                    s = format!("{prefix}{s}").into()
                 }
             }
             // Add the `end_of_word_suffix` if relevant
             if is_last {
                 if let Some(ref suffix) = self.end_of_word_suffix {
-                    s = format!("{}{}", s, suffix).into()
+                    s = format!("{s}{suffix}").into()
                 }
             }
@@ -406,7 +406,7 @@ impl BPE {
         let tokens: Option<Vec<&u32>> = s
             .bytes()
             .map(|b| -> Option<&u32> {
-                let code = format!("<{:#04X}>", b);
+                let code = format!("<{b:#04X}>");
                 self.vocab.get(&code)
             })
@@ -515,7 +515,7 @@ impl Model for BPE {
     fn save(&self, folder: &Path, name: Option<&str>) -> Result<Vec<PathBuf>> {
         let vocab_file_name = match name {
-            Some(name) => format!("{}-vocab.json", name),
+            Some(name) => format!("{name}-vocab.json"),
             None => "vocab.json".to_string(),
         };
@@ -530,7 +530,7 @@ impl Model for BPE {
         // Write merges.txt
         let merges_file_name = match name {
-            Some(name) => format!("{}-merges.txt", name),
+            Some(name) => format!("{name}-merges.txt"),
             None => "merges.txt".to_string(),
         };
diff --git a/tokenizers/src/models/bpe/trainer.rs b/tokenizers/src/models/bpe/trainer.rs
index 3689a856a..2876f1ef5 100644
--- a/tokenizers/src/models/bpe/trainer.rs
+++ b/tokenizers/src/models/bpe/trainer.rs
@@ -342,13 +342,13 @@ impl BpeTrainer {
                 // Add the `continuing_subword_prefix` if relevant
                 if !is_first {
                     if let Some(prefix) = &self.continuing_subword_prefix {
-                        s = format!("{}{}", prefix, s);
+                        s = format!("{prefix}{s}");
                     }
                 }
                 // Add the `end_of_word_suffix` if relevant
                 if is_last {
                     if let Some(suffix) = &self.end_of_word_suffix {
-                        s = format!("{}{}", s, suffix);
+                        s = format!("{s}{suffix}");
                     }
                 }
@@ -513,7 +513,7 @@ impl BpeTrainer {
                     part_b = part_b[prefix_byte_len..].to_string();
                 }
             }
-            let new_token = format!("{}{}", part_a, part_b);
+            let new_token = format!("{part_a}{part_b}");
             // implement sentencepiece-like merge.
             // if this code were to be merged, integrate a way in the python bindings to communicate this variable
             // default should be 0/None to maintain previous behavior. 16 is the spm default.
diff --git a/tokenizers/src/models/mod.rs b/tokenizers/src/models/mod.rs
index cdfb731a8..a6021d90e 100644
--- a/tokenizers/src/models/mod.rs
+++ b/tokenizers/src/models/mod.rs
@@ -51,7 +51,7 @@ impl<'a> Serialize for OrderedVocabIter<'a> {
         if !holes.is_empty() {
             warn!("The OrderedVocab you are attempting to save contains holes for indices {:?}, your vocabulary could be corrupted !", holes);
-            println!("The OrderedVocab you are attempting to save contains holes for indices {:?}, your vocabulary could be corrupted !", holes);
+            println!("The OrderedVocab you are attempting to save contains holes for indices {holes:?}, your vocabulary could be corrupted !");
         }
         result
     }
diff --git a/tokenizers/src/models/unigram/model.rs b/tokenizers/src/models/unigram/model.rs
index defc7d93d..dba5a0400 100644
--- a/tokenizers/src/models/unigram/model.rs
+++ b/tokenizers/src/models/unigram/model.rs
@@ -425,7 +425,7 @@ impl Model for Unigram {
                 let byte_tokens: Option<Vec<Token>> = string
                     .bytes()
                     .map(|byte| -> Option<Token> {
-                        let byte_string = format!("<0x{:02X}>", byte);
+                        let byte_string = format!("<0x{byte:02X}>");
                         let id = self.token_to_ids.get(&byte_string);
                         id.map(|id| Token::new(*id, byte_string, (offset, offset + len)))
                     })
@@ -457,7 +457,7 @@ impl Model for Unigram {
     fn save(&self, folder: &Path, name: Option<&str>) -> Result<Vec<PathBuf>> {
         let name = match name {
-            Some(name) => format!("{}-unigram.json", name),
+            Some(name) => format!("{name}-unigram.json"),
             None => "unigram.json".to_string(),
         };
         let mut fullpath = PathBuf::new();
@@ -568,7 +568,7 @@ mod tests {
         for is_optimized in &[true, false] {
             model.set_optimized(*is_optimized);
-            println!("IsOptimized {:?}", is_optimized);
+            println!("IsOptimized {is_optimized:?}");
             assert_eq!(model.encode("abc").unwrap(), vec!["abc"]);
             assert_eq!(model.encode("AB").unwrap(), vec!["AB"]);
diff --git a/tokenizers/src/models/unigram/serialization.rs b/tokenizers/src/models/unigram/serialization.rs
index d7123bab6..a6e56b735 100644
--- a/tokenizers/src/models/unigram/serialization.rs
+++ b/tokenizers/src/models/unigram/serialization.rs
@@ -70,7 +70,7 @@ impl<'de> Visitor<'de> for UnigramVisitor {
         }
         match (vocab, unk_id, byte_fallback) {
             (Some(vocab), unk_id, byte_fallback) => Ok(Unigram::from(vocab, unk_id, byte_fallback)
-                .map_err(|err| Error::custom(format!("Unable to load vocab {:?}", err)))?),
+                .map_err(|err| Error::custom(format!("Unable to load vocab {err:?}")))?),
             (None, _, _) => Err(Error::custom("Missing vocab")),
         }
     }
diff --git a/tokenizers/src/models/wordlevel/mod.rs b/tokenizers/src/models/wordlevel/mod.rs
index 3482ffee0..545db13a7 100644
--- a/tokenizers/src/models/wordlevel/mod.rs
+++ b/tokenizers/src/models/wordlevel/mod.rs
@@ -194,7 +194,7 @@ impl Model for WordLevel {
     fn save(&self, folder: &Path, name: Option<&str>) -> Result<Vec<PathBuf>> {
         let vocab_file_name = match name {
-            Some(name) => format!("{}-vocab.json", name),
+            Some(name) => format!("{name}-vocab.json"),
             None => "vocab.json".to_string(),
         };
diff --git a/tokenizers/src/models/wordpiece/mod.rs b/tokenizers/src/models/wordpiece/mod.rs
index a75134d2c..0c63405c1 100644
--- a/tokenizers/src/models/wordpiece/mod.rs
+++ b/tokenizers/src/models/wordpiece/mod.rs
@@ -271,7 +271,7 @@ impl Model for WordPiece {
     fn save(&self, folder: &Path, name: Option<&str>) -> Result<Vec<PathBuf>> {
         let vocab_file_name = match name {
-            Some(name) => format!("{}-vocab.txt", name),
+            Some(name) => format!("{name}-vocab.txt"),
             None => "vocab.txt".to_string(),
         };
@@ -285,7 +285,7 @@ impl Model for WordPiece {
         vocab_file.write_all(
             &vocab
                 .into_iter()
-                .flat_map(|(token, _)| format!("{}\n", token).as_bytes().to_owned())
+                .flat_map(|(token, _)| format!("{token}\n").as_bytes().to_owned())
                 .collect::<Vec<_>>()[..],
         )?;
diff --git a/tokenizers/src/processors/template.rs b/tokenizers/src/processors/template.rs
index 9259180d0..7f1fed54d 100644
--- a/tokenizers/src/processors/template.rs
+++ b/tokenizers/src/processors/template.rs
@@ -150,7 +150,7 @@ impl TryFrom<String> for Piece {
     fn try_from(s: String) -> StdResult<Self, Self::Error> {
         let parts = s.split(':').collect::<Vec<_>>();
-        let err = || format!("Cannot build Piece from string \"{}\"", s);
+        let err = || format!("Cannot build Piece from string \"{s}\"");
         match parts.as_slice() {
             [id, type_id] => {
                 let type_id: u32 = type_id.parse().map_err(|_| err())?;
diff --git a/tokenizers/src/tokenizer/normalizer.rs b/tokenizers/src/tokenizer/normalizer.rs
index e2f501abe..a8a05c795 100644
--- a/tokenizers/src/tokenizer/normalizer.rs
+++ b/tokenizers/src/tokenizer/normalizer.rs
@@ -351,7 +351,7 @@ impl NormalizedString {
                 match changes {
                     0 => "Replacing".into(),
                     ch if ch > 0 => "Adding".into(),
-                    ch if ch < 0 => format!("Replacing + removing {} following chars", ch),
+                    ch if ch < 0 => format!("Replacing + removing {ch} following chars"),
                     _ => "Undefined".into(),
                 },
                 offset
diff --git a/tokenizers/src/tokenizer/serialization.rs b/tokenizers/src/tokenizer/serialization.rs
index db9b0a403..26d8344f4 100644
--- a/tokenizers/src/tokenizer/serialization.rs
+++ b/tokenizers/src/tokenizer/serialization.rs
@@ -116,7 +116,7 @@ where
                 "version" => {
                     let v: String = map.next_value()?;
                     if &v != "1.0" {
-                        return Err(Error::custom(format!("Unknown tokenizer version '{}'", v)));
+                        return Err(Error::custom(format!("Unknown tokenizer version '{v}'")));
                     }
                 }
                 "truncation" => {
diff --git a/tokenizers/tests/documentation.rs b/tokenizers/tests/documentation.rs
index c0c471a93..ad29590b9 100644
--- a/tokenizers/tests/documentation.rs
+++ b/tokenizers/tests/documentation.rs
@@ -199,7 +199,7 @@ fn quicktour() -> tokenizers::Result<()> {
     // START quicktour_encode_batch
     let output = tokenizer.encode_batch(vec!["Hello, y'all!", "How are you 😁 ?"], true)?;
     // END quicktour_encode_batch
-    println!("{:?}", output);
+    println!("{output:?}");
     // START quicktour_encode_batch_pair
     let output = tokenizer.encode_batch(
         vec![
@@ -209,7 +209,7 @@ fn quicktour() -> tokenizers::Result<()> {
         true,
     )?;
     // END quicktour_encode_batch_pair
-    println!("{:?}", output);
+    println!("{output:?}");
     // START quicktour_enable_padding
     use tokenizers::PaddingParams;
@@ -350,7 +350,7 @@ fn pipeline() -> tokenizers::Result<()> {
         &[1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2],
         true,
     )?;
-    println!("{}", decoded);
+    println!("{decoded}");
     // "Hello , y ' all ! How are you ?"
     // END pipeline_test_decoding
@@ -436,7 +436,7 @@ fn pipeline_bert() -> tokenizers::Result<()> {
     // ["[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library", ".", "[SEP]"]
     let decoded = bert_tokenizer.decode(output.get_ids(), true)?;
-    println!("{}", decoded);
+    println!("{decoded}");
     // "welcome to the tok ##eni ##zer ##s library ."
     // END bert_test_decoding
     assert_eq!(
diff --git a/tokenizers/tests/unigram.rs b/tokenizers/tests/unigram.rs
index bccd0bca2..dc0dfdc07 100644
--- a/tokenizers/tests/unigram.rs
+++ b/tokenizers/tests/unigram.rs
@@ -44,7 +44,7 @@ fn test_train_unigram_from_file() {
     let mut word_counts = HashMap::new();
     content.split_whitespace().for_each(|word| {
         // This is important for the test of char vs u8
-        let word = format!("▁{}", word);
+        let word = format!("▁{word}");
         *word_counts.entry(word).or_insert(0) += 1;
    });
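
Reviewer note (not part of the patch): every hunk above applies the same mechanical rewrite, replacing positional format! and println! arguments with inline captured identifiers, which have been stable since Rust 1.58. A minimal, self-contained sketch of the before/after follows; the variable names are illustrative only and not taken from the crate.

// Illustration of the inline-format-args pattern used throughout this patch.
// The identifiers `word` and `id` are made up for this example.
fn main() {
    let word = "tokenizer";
    let id = 42;

    // Before: positional arguments repeated after the format string.
    let old = format!("[SPECIAL_{}] ▁{}", id, word);

    // After: identifiers captured directly inside the braces (Rust 1.58+).
    let new = format!("[SPECIAL_{id}] ▁{word}");

    // Both forms produce identical output; only readability changes.
    assert_eq!(old, new);
    println!("{new}");
}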