From e34bc7bd9daf00380eeca3489f22ace2c7bd7bb0 Mon Sep 17 00:00:00 2001
From: cOng
Date: Sat, 3 Feb 2024 16:33:51 +0800
Subject: [PATCH 1/2] fix: inconsistent tokenization by llama tokenizer

---
 fastchat/train/train_with_template.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fastchat/train/train_with_template.py b/fastchat/train/train_with_template.py
index 4511a2108..9ca8924df 100644
--- a/fastchat/train/train_with_template.py
+++ b/fastchat/train/train_with_template.py
@@ -163,7 +163,7 @@ def mask_targets(conversations, targets, tokenizer, conv):
             if i != 0:
                 turn = user_turn_separator + turn

-            turn_len = len(tokenizer(turn).input_ids)
+            turn_len = len(tokenizer(turn, add_special_tokens=False).input_ids)

             if assistant_turn_separator in turn:
                 parts = turn.rsplit(assistant_turn_separator)

From 55c2258c76e3de5aecd64dfa721566305e0a071e Mon Sep 17 00:00:00 2001
From: cOng
Date: Sat, 3 Feb 2024 17:01:16 +0800
Subject: [PATCH 2/2] explicitly set the pad_token_id to unk_token_id

---
 fastchat/train/train_with_template.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fastchat/train/train_with_template.py b/fastchat/train/train_with_template.py
index 9ca8924df..e5c5f353d 100644
--- a/fastchat/train/train_with_template.py
+++ b/fastchat/train/train_with_template.py
@@ -373,6 +373,7 @@ def train():
     )
     # NOTE: if the token_id exceed the vocab_size will cause failing in training process! we need add special config and resize the embedding size!
     tokenizer.pad_token = tokenizer.unk_token
+    tokenizer.pad_token_id = tokenizer.unk_token_id
     print(f"tokens len: {len(tokenizer)}")
     model.resize_token_embeddings(len(tokenizer))
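
A minimal sketch (not part of the patch) of why both changes matter, assuming a Hugging Face Llama-style tokenizer; the checkpoint path and example strings below are placeholders:

    # Sketch only: illustrates the behavior the two patches address.
    from transformers import AutoTokenizer

    # Placeholder path; substitute a real Llama-style checkpoint.
    tokenizer = AutoTokenizer.from_pretrained("path/to/llama-model")

    turn = "USER: Hello"

    # With default settings the Llama tokenizer prepends BOS to every call,
    # so summing per-turn lengths overcounts relative to tokenizing the full
    # conversation once, and the loss mask drifts out of alignment.
    with_special = len(tokenizer(turn).input_ids)
    without_special = len(tokenizer(turn, add_special_tokens=False).input_ids)
    print(with_special, without_special)  # typically with_special == without_special + 1

    # The second patch pins the pad token id explicitly, since Llama has no
    # dedicated pad token and reuses unk for padding.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.unk_token
        tokenizer.pad_token_id = tokenizer.unk_token_id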