From 69c0c0683fdab4d756e2e989c2bfee611fccca51 Mon Sep 17 00:00:00 2001 From: Manjunath Siddaiah Date: Wed, 30 Oct 2024 10:38:00 -0500 Subject: [PATCH] Enable HuggingFaceTokenizer in preprocessing --- tools/preprocess_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py index a81fe8ca7e..a9575707b9 100644 --- a/tools/preprocess_data.py +++ b/tools/preprocess_data.py @@ -203,7 +203,7 @@ def get_args(): choices=['BertWordPieceLowerCase','BertWordPieceCase', 'GPT2BPETokenizer', 'SentencePieceTokenizer', 'GPTSentencePieceTokenizer', 'Llama2Tokenizer', - 'Llama3Tokenizer', 'MistralTokenizer', 'NullTokenizer'], + 'Llama3Tokenizer', 'MistralTokenizer', 'HuggingFaceTokenizer', 'NullTokenizer'], help='What type of tokenizer to use.') group.add_argument('--tokenizer-model', type=str, default=None, help='YTTM tokenizer model.')