style: simplify string formatting for readability #1632

Merged
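
An aside for reviewers (not part of the diff): every change below rewrites a positional format! argument as an inline captured identifier, a feature stabilized in Rust 1.58 that lets a plain variable name appear directly inside the braces of a format string. The two spellings are equivalent; a minimal sketch of the pattern, using made-up variable names:

    // Hypothetical example illustrating the pattern applied throughout this PR.
    fn main() {
        let word = "token";
        let count = 3;
        // Before: positional arguments passed after the format string.
        let old = format!("▁{} appears {} times", word, count);
        // After: the same variables captured by name inside the braces.
        let new = format!("▁{word} appears {count} times");
        assert_eq!(old, new);
    }

Only bare identifiers can be captured this way; expressions such as field accesses still need an explicit argument (see the note after the WordPiece decoder diff below).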
4 changes: 2 additions & 2 deletions tokenizers/benches/unigram_benchmark.rs
@@ -21,7 +21,7 @@ pub fn bench_train(c: &mut Criterion) {
     let mut word_counts = HashMap::new();
     content.split_whitespace().for_each(|word| {
         // This is important for the test of char vs u8
-        let word = format!("▁{}", word);
+        let word = format!("▁{word}");
         *word_counts.entry(word).or_insert(0) += 1;
     });

@@ -49,7 +49,7 @@ pub fn bench_train(c: &mut Criterion) {
     let mut word_counts = HashMap::new();
     content.split_whitespace().for_each(|word| {
         // This is important for the test of char vs u8
-        let word = format!("▁{}", word);
+        let word = format!("▁{word}");
         *word_counts.entry(word).or_insert(0) += 1;
     });

2 changes: 1 addition & 1 deletion tokenizers/examples/serialization.rs
@@ -8,7 +8,7 @@ fn main() {
     // Mix special and not special
     // You can make sure ids are in order, and special status is correct.
     let tokens: Vec<_> = (0..120_000)
-        .map(|i| AddedToken::from(format!("[SPECIAL_{}]", i), i % 2 == 0))
+        .map(|i| AddedToken::from(format!("[SPECIAL_{i}]"), i % 2 == 0))
         .collect();
     tokenizer.add_tokens(&tokens);
     tokenizer.save("_tok.json", true).unwrap();
2 changes: 1 addition & 1 deletion tokenizers/src/decoders/wordpiece.rs
@@ -53,7 +53,7 @@ impl Decoder for WordPiece {
             if token.starts_with(&self.prefix) {
                 *token = token.replacen(&self.prefix, "", 1);
             } else {
-                *token = format!(" {}", token);
+                *token = format!(" {token}");
             }
         }
         if self.cleanup {
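
A note on the change above (again an aside, not from the PR): inline capture only works for bare identifiers. Here, token is a local binding, so format!(" {token}") is fine, but an expression such as self.prefix cannot appear inside the braces and must stay as an explicit argument. A small sketch with a hypothetical type:

    // Hypothetical struct; `Decoderish` is not a real tokenizers type.
    struct Decoderish {
        prefix: String,
    }

    impl Decoderish {
        fn label(&self, token: &str) -> String {
            // A plain identifier can be captured inline...
            let inline = format!(" {token}");
            // ...but a field access must still be passed as an explicit argument.
            let explicit = format!("{}{}", self.prefix, token);
            format!("{inline}|{explicit}")
        }
    }

    fn main() {
        let d = Decoderish { prefix: "##".to_string() };
        assert_eq!(d.label("ized"), " ized|##ized");
    }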
10 changes: 5 additions & 5 deletions tokenizers/src/models/bpe/model.rs
@@ -385,13 +385,13 @@ impl BPE {
         // Add the `continuing_subword_prefix` if relevant
         if !is_first {
             if let Some(ref prefix) = self.continuing_subword_prefix {
-                s = format!("{}{}", prefix, s).into()
+                s = format!("{prefix}{s}").into()
             }
         }
         // Add the `end_of_word_suffix` if relevant
         if is_last {
             if let Some(ref suffix) = self.end_of_word_suffix {
-                s = format!("{}{}", s, suffix).into()
+                s = format!("{s}{suffix}").into()
             }
         }

@@ -406,7 +406,7 @@ impl BPE {
         let tokens: Option<Vec<_>> = s
             .bytes()
             .map(|b| -> Option<&u32> {
-                let code = format!("<{:#04X}>", b);
+                let code = format!("<{b:#04X}>");

                 self.vocab.get(&code)
             })
@@ -515,7 +515,7 @@ impl Model for BPE {

     fn save(&self, folder: &Path, name: Option<&str>) -> Result<Vec<PathBuf>> {
         let vocab_file_name = match name {
-            Some(name) => format!("{}-vocab.json", name),
+            Some(name) => format!("{name}-vocab.json"),
             None => "vocab.json".to_string(),
         };

@@ -530,7 +530,7 @@

         // Write merges.txt
         let merges_file_name = match name {
-            Some(name) => format!("{}-merges.txt", name),
+            Some(name) => format!("{name}-merges.txt"),
             None => "merges.txt".to_string(),
         };

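
One more aside (not part of the diff): format specifiers carry over unchanged when the argument moves inside the braces, as in the byte-fallback line above where format!("<{:#04X}>", b) becomes format!("<{b:#04X}>"), and in the {holes:?} debug formatting further down. A tiny sketch checking that the two spellings agree:

    fn main() {
        let b: u8 = 0x0A;
        // Positional argument with an alternate, zero-padded hex specifier...
        let old = format!("<{:#04X}>", b);
        // ...and the same specifier following a captured identifier.
        let new = format!("<{b:#04X}>");
        assert_eq!(old, new);
        assert_eq!(new, "<0x0A>");
    }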
6 changes: 3 additions & 3 deletions tokenizers/src/models/bpe/trainer.rs
@@ -342,13 +342,13 @@ impl BpeTrainer {
         // Add the `continuing_subword_prefix` if relevant
         if !is_first {
             if let Some(prefix) = &self.continuing_subword_prefix {
-                s = format!("{}{}", prefix, s);
+                s = format!("{prefix}{s}");
             }
         }
         // Add the `end_of_word_suffix` if relevant
         if is_last {
             if let Some(suffix) = &self.end_of_word_suffix {
-                s = format!("{}{}", s, suffix);
+                s = format!("{s}{suffix}");
             }
         }

@@ -513,7 +513,7 @@ impl BpeTrainer {
                     part_b = part_b[prefix_byte_len..].to_string();
                 }
             }
-            let new_token = format!("{}{}", part_a, part_b);
+            let new_token = format!("{part_a}{part_b}");
             // implement sentencepiece-like merge.
             // if this code were to be merged, integrate a way in the python bindings to communicate this variable
             // default should be 0/None to maintain previous behavior. 16 is the spm default.
2 changes: 1 addition & 1 deletion tokenizers/src/models/mod.rs
@@ -51,7 +51,7 @@ impl<'a> Serialize for OrderedVocabIter<'a> {

         if !holes.is_empty() {
             warn!("The OrderedVocab you are attempting to save contains holes for indices {:?}, your vocabulary could be corrupted !", holes);
-            println!("The OrderedVocab you are attempting to save contains holes for indices {:?}, your vocabulary could be corrupted !", holes);
+            println!("The OrderedVocab you are attempting to save contains holes for indices {holes:?}, your vocabulary could be corrupted !");
         }
         result
     }
6 changes: 3 additions & 3 deletions tokenizers/src/models/unigram/model.rs
@@ -425,7 +425,7 @@ impl Model for Unigram {
                 let byte_tokens: Option<Vec<_>> = string
                     .bytes()
                     .map(|byte| -> Option<Token> {
-                        let byte_string = format!("<0x{:02X}>", byte);
+                        let byte_string = format!("<0x{byte:02X}>");
                         let id = self.token_to_ids.get(&byte_string);
                         id.map(|id| Token::new(*id, byte_string, (offset, offset + len)))
                     })
@@ -457,7 +457,7 @@ impl Model for Unigram {

     fn save(&self, folder: &Path, name: Option<&str>) -> Result<Vec<PathBuf>> {
         let name = match name {
-            Some(name) => format!("{}-unigram.json", name),
+            Some(name) => format!("{name}-unigram.json"),
             None => "unigram.json".to_string(),
         };
         let mut fullpath = PathBuf::new();
@@ -568,7 +568,7 @@ mod tests {

         for is_optimized in &[true, false] {
             model.set_optimized(*is_optimized);
-            println!("IsOptimized {:?}", is_optimized);
+            println!("IsOptimized {is_optimized:?}");
             assert_eq!(model.encode("abc").unwrap(), vec!["abc"]);
             assert_eq!(model.encode("AB").unwrap(), vec!["AB"]);

2 changes: 1 addition & 1 deletion tokenizers/src/models/unigram/serialization.rs
@@ -70,7 +70,7 @@ impl<'de> Visitor<'de> for UnigramVisitor {
         }
         match (vocab, unk_id, byte_fallback) {
             (Some(vocab), unk_id, byte_fallback) => Ok(Unigram::from(vocab, unk_id, byte_fallback)
-                .map_err(|err| Error::custom(format!("Unable to load vocab {:?}", err)))?),
+                .map_err(|err| Error::custom(format!("Unable to load vocab {err:?}")))?),
             (None, _, _) => Err(Error::custom("Missing vocab")),
         }
     }
2 changes: 1 addition & 1 deletion tokenizers/src/models/wordlevel/mod.rs
@@ -194,7 +194,7 @@ impl Model for WordLevel {

     fn save(&self, folder: &Path, name: Option<&str>) -> Result<Vec<PathBuf>> {
         let vocab_file_name = match name {
-            Some(name) => format!("{}-vocab.json", name),
+            Some(name) => format!("{name}-vocab.json"),
             None => "vocab.json".to_string(),
         };

4 changes: 2 additions & 2 deletions tokenizers/src/models/wordpiece/mod.rs
@@ -271,7 +271,7 @@ impl Model for WordPiece {

     fn save(&self, folder: &Path, name: Option<&str>) -> Result<Vec<PathBuf>> {
         let vocab_file_name = match name {
-            Some(name) => format!("{}-vocab.txt", name),
+            Some(name) => format!("{name}-vocab.txt"),
             None => "vocab.txt".to_string(),
         };

@@ -285,7 +285,7 @@
         vocab_file.write_all(
             &vocab
                 .into_iter()
-                .flat_map(|(token, _)| format!("{}\n", token).as_bytes().to_owned())
+                .flat_map(|(token, _)| format!("{token}\n").as_bytes().to_owned())
                 .collect::<Vec<_>>()[..],
         )?;

2 changes: 1 addition & 1 deletion tokenizers/src/processors/template.rs
@@ -150,7 +150,7 @@ impl TryFrom<String> for Piece {
     fn try_from(s: String) -> StdResult<Self, Self::Error> {
         let parts = s.split(':').collect::<Vec<_>>();

-        let err = || format!("Cannot build Piece from string \"{}\"", s);
+        let err = || format!("Cannot build Piece from string \"{s}\"");
         match parts.as_slice() {
             [id, type_id] => {
                 let type_id: u32 = type_id.parse().map_err(|_| err())?;
2 changes: 1 addition & 1 deletion tokenizers/src/tokenizer/normalizer.rs
@@ -351,7 +351,7 @@ impl NormalizedString {
             match changes {
                 0 => "Replacing".into(),
                 ch if ch > 0 => "Adding".into(),
-                ch if ch < 0 => format!("Replacing + removing {} following chars", ch),
+                ch if ch < 0 => format!("Replacing + removing {ch} following chars"),
                 _ => "Undefined".into(),
             },
             offset
2 changes: 1 addition & 1 deletion tokenizers/src/tokenizer/serialization.rs
@@ -116,7 +116,7 @@ where
"version" => {
let v: String = map.next_value()?;
if &v != "1.0" {
return Err(Error::custom(format!("Unknown tokenizer version '{}'", v)));
return Err(Error::custom(format!("Unknown tokenizer version '{v}'")));
}
}
"truncation" => {
8 changes: 4 additions & 4 deletions tokenizers/tests/documentation.rs
@@ -199,7 +199,7 @@ fn quicktour() -> tokenizers::Result<()> {
     // START quicktour_encode_batch
     let output = tokenizer.encode_batch(vec!["Hello, y'all!", "How are you 😁 ?"], true)?;
     // END quicktour_encode_batch
-    println!("{:?}", output);
+    println!("{output:?}");
     // START quicktour_encode_batch_pair
     let output = tokenizer.encode_batch(
         vec![
@@ -209,7 +209,7 @@ fn quicktour() -> tokenizers::Result<()> {
         true,
     )?;
     // END quicktour_encode_batch_pair
-    println!("{:?}", output);
+    println!("{output:?}");
     // START quicktour_enable_padding
     use tokenizers::PaddingParams;

@@ -350,7 +350,7 @@ fn pipeline() -> tokenizers::Result<()> {
         &[1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2],
         true,
     )?;
-    println!("{}", decoded);
+    println!("{decoded}");
     // "Hello , y ' all ! How are you ?"
     // END pipeline_test_decoding

@@ -436,7 +436,7 @@ fn pipeline_bert() -> tokenizers::Result<()> {
// ["[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library", ".", "[SEP]"]

let decoded = bert_tokenizer.decode(output.get_ids(), true)?;
println!("{}", decoded);
println!("{decoded}");
// "welcome to the tok ##eni ##zer ##s library ."
// END bert_test_decoding
assert_eq!(
2 changes: 1 addition & 1 deletion tokenizers/tests/unigram.rs
@@ -44,7 +44,7 @@ fn test_train_unigram_from_file() {
     let mut word_counts = HashMap::new();
     content.split_whitespace().for_each(|word| {
         // This is important for the test of char vs u8
-        let word = format!("▁{}", word);
+        let word = format!("▁{word}");
         *word_counts.entry(word).or_insert(0) += 1;
     });
