From e34bc7bd9daf00380eeca3489f22ace2c7bd7bb0 Mon Sep 17 00:00:00 2001
From: cOng
Date: Sat, 3 Feb 2024 16:33:51 +0800
Subject: [PATCH 1/2] fix: inconsistent tokenization by llama tokenizer

---
 fastchat/train/train_with_template.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fastchat/train/train_with_template.py b/fastchat/train/train_with_template.py
index 4511a2108..9ca8924df 100644
--- a/fastchat/train/train_with_template.py
+++ b/fastchat/train/train_with_template.py
@@ -163,7 +163,7 @@ def mask_targets(conversations, targets, tokenizer, conv):
             if i != 0:
                 turn = user_turn_separator + turn

-            turn_len = len(tokenizer(turn).input_ids)
+            turn_len = len(tokenizer(turn, add_special_tokens=False).input_ids)

             if assistant_turn_separator in turn:
                 parts = turn.rsplit(assistant_turn_separator)

From 55c2258c76e3de5aecd64dfa721566305e0a071e Mon Sep 17 00:00:00 2001
From: cOng
Date: Sat, 3 Feb 2024 17:01:16 +0800
Subject: [PATCH 2/2] explicitly set the pad_token_id to unk_token_id

---
 fastchat/train/train_with_template.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fastchat/train/train_with_template.py b/fastchat/train/train_with_template.py
index 9ca8924df..e5c5f353d 100644
--- a/fastchat/train/train_with_template.py
+++ b/fastchat/train/train_with_template.py
@@ -373,6 +373,7 @@ def train():
     )
     # NOTE: if the token_id exceed the vocab_size will cause failing in training process! we need add special config and resize the embedding size!
     tokenizer.pad_token = tokenizer.unk_token
+    tokenizer.pad_token_id = tokenizer.unk_token_id
     print(f"tokens len: {len(tokenizer)}")
     model.resize_token_embeddings(len(tokenizer))
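
A minimal sketch (not part of the patch) of why both changes matter, assuming a Hugging Face Llama-style tokenizer; the checkpoint path and example strings below are placeholders:

    # Sketch only: illustrates the behavior the two patches address.
    from transformers import AutoTokenizer

    # Placeholder path; substitute a real Llama-style checkpoint.
    tokenizer = AutoTokenizer.from_pretrained("path/to/llama-model")

    turn = "USER: Hello"

    # With default settings the Llama tokenizer prepends BOS to every call,
    # so summing per-turn lengths overcounts relative to tokenizing the full
    # conversation once, and the loss mask drifts out of alignment.
    with_special = len(tokenizer(turn).input_ids)
    without_special = len(tokenizer(turn, add_special_tokens=False).input_ids)
    print(with_special, without_special)  # typically with_special == without_special + 1

    # The second patch pins the pad token id explicitly, since Llama has no
    # dedicated pad token and reuses unk for padding.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.unk_token
        tokenizer.pad_token_id = tokenizer.unk_token_id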