[Bugfix] fix qwen tokenizer config when converting to nemo format (#11098)

* updated qwen tokenizer config when converting to nemo format

* Apply isort and black reformatting

Signed-off-by: chrjxj <chrjxj@users.noreply.github.com>

---------

Signed-off-by: chrjxj <chrjxj@users.noreply.github.com>
Co-authored-by: lukex <lukex@nvidia.com>
chrjxj and lukex authored Jan 9, 2025
1 parent 3146703 commit 7aac482
Showing 1 changed file with 10 additions and 2 deletions.
```diff
--- a/scripts/checkpoint_converters/convert_qwen2_hf_to_nemo.py
+++ b/scripts/checkpoint_converters/convert_qwen2_hf_to_nemo.py
@@ -81,8 +81,16 @@ def load_config(args, qwen_config):
     nemo_config.num_query_groups = qwen_config['num_key_value_heads']
     nemo_config.use_cpu_initialization = True
     nemo_config.activation = 'fast-swiglu'
-    nemo_config.tokenizer.type = str(args.input_name_or_path)
-    nemo_config.tokenizer.model = str(args.input_name_or_path) + '/vocab.json'
+
+    # use HF tokenizer
+    tokenizer_dict = {
+        'library': 'huggingface',
+        'type': args.input_name_or_path,
+        'use_fast': True,
+        'trust_remote_code': True,
+    }
+    nemo_config.tokenizer = tokenizer_dict
+
     nemo_config.override_vocab_size = qwen_config['vocab_size']
 
     base = 128
```
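For context, the change stops pointing NeMo at a bare `vocab.json` and instead hands it a config dict that delegates tokenization to the Hugging Face library. A minimal self-contained sketch of the dict the converter now builds (the helper name `build_tokenizer_config` and the `Qwen/Qwen2-7B` path are illustrative, not part of the patch):

```python
def build_tokenizer_config(input_name_or_path):
    # Mirrors the converter's new tokenizer settings: instead of setting
    # nemo_config.tokenizer.model to '<path>/vocab.json', the whole
    # tokenizer section is replaced with a Hugging Face-backed config.
    return {
        'library': 'huggingface',
        'type': input_name_or_path,
        'use_fast': True,
        'trust_remote_code': True,
    }


cfg = build_tokenizer_config('Qwen/Qwen2-7B')
print(cfg['library'])  # → huggingface
```

With `library` set to `huggingface`, NeMo resolves the tokenizer through `transformers` at load time, so merges and special tokens come from the checkpoint directory rather than from a single vocab file.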
