Make initialization of tokenizer and detokenizer optional #3748
Changes from 28 commits
@@ -15,6 +15,7 @@ class EngineArgs:
     """Arguments for vLLM engine."""
     model: str
     tokenizer: Optional[str] = None
+    disable_tokenizer: bool = False
     tokenizer_mode: str = 'auto'
     trust_remote_code: bool = False
     download_dir: Optional[str] = None

@@ -94,6 +95,9 @@ def add_cli_args(
             type=str,
             default=EngineArgs.tokenizer,
             help='name or path of the huggingface tokenizer to use')
+        parser.add_argument('--disable_tokenizer',
+                            action='store_true',
+                            help='Disable tokenization and detokenization')
Review comment: ditto
Reply: done
         parser.add_argument(
             '--revision',
             type=str,

@@ -422,7 +426,8 @@ def create_engine_config(self, ) -> EngineConfig:
             self.dtype, self.seed, self.revision, self.code_revision,
             self.tokenizer_revision, self.max_model_len, self.quantization,
             self.quantization_param_path, self.enforce_eager,
-            self.max_context_len_to_capture, self.max_logprobs)
+            self.max_context_len_to_capture, self.max_logprobs,
+            self.disable_tokenizer)
         cache_config = CacheConfig(self.block_size,
                                    self.gpu_memory_utilization,
                                    self.swap_space, self.kv_cache_dtype,
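For context, a hedged usage sketch of the new option. The model name is purely illustrative, and the `EngineArgs`/`LLMEngine.from_engine_args` entry points are assumed from mainline vLLM rather than taken from this diff:

```python
from vllm import EngineArgs, LLMEngine

# Build engine args with the flag added in this PR; everything else is default.
engine_args = EngineArgs(
    model="facebook/opt-125m",  # illustrative model name
    disable_tokenizer=True,     # skip tokenizer/detokenizer initialization
)
engine = LLMEngine.from_engine_args(engine_args)
```

On the command line, the equivalent is the new `--disable_tokenizer` store-true flag added above.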
vllm/engine/llm_engine.py:

@@ -83,6 +83,7 @@ def __init__(
             f"model={model_config.model!r}, "
             f"speculative_config={speculative_config!r}, "
             f"tokenizer={model_config.tokenizer!r}, "
+            f"disable_tokenizer={model_config.disable_tokenizer}, "
             f"tokenizer_mode={model_config.tokenizer_mode}, "
             f"revision={model_config.revision}, "
             f"tokenizer_revision={model_config.tokenizer_revision}, "
@@ -112,8 +113,13 @@ def __init__(
         self.speculative_config = speculative_config
         self.log_stats = log_stats

-        self._init_tokenizer()
-        self.detokenizer = Detokenizer(self.tokenizer)
+        if not self.model_config.disable_tokenizer:
+            self._init_tokenizer()
+            self.detokenizer = Detokenizer(self.tokenizer)
+        else:
+            self.detokenizer = None
+            self.tokenizer = None

         self.seq_counter = Counter()

         self.model_executor = executor_class(
@@ -162,9 +168,10 @@ def __init__(
             parallel_config.disable_custom_all_reduce,
         })

-        # Ping the tokenizer to ensure liveness if it runs in a
-        # different process.
-        self.tokenizer.ping()
+        if self.tokenizer:
+            # Ping the tokenizer to ensure liveness if it runs in a
+            # different process.
+            self.tokenizer.ping()

         # Create the scheduler.
         # NOTE: the cache_config here have been updated with the numbers of
@@ -333,8 +340,10 @@ def add_request(
         # Create the sequences.
         block_size = self.cache_config.block_size
         seq_id = next(self.seq_counter)
-        eos_token_id = self.tokenizer.get_lora_tokenizer(
-            lora_request).eos_token_id
+        eos_token_id = None
+        if self.tokenizer:
+            eos_token_id = self.tokenizer.get_lora_tokenizer(
+                lora_request).eos_token_id
Review comment on lines +404 to +407: IMO we should add a warning here - WDYT? This will also affect components that use vllm/vllm/engine/llm_engine.py, lines 344 to 346 (at 0ce0539).
Follow-up: I still think it's worth having a warning here about […]
         seq = Sequence(seq_id, prompt, prompt_token_ids, block_size,
                        eos_token_id, lora_request)
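The review thread above asks for a warning when this fallback is hit. A minimal sketch of what such a warning might look like, assuming it would be emitted from `LLMEngine.__init__`; the helper name is hypothetical and nothing like it exists in this diff:

```python
import warnings


def warn_if_tokenizer_disabled(disable_tokenizer: bool) -> None:
    """Hypothetical helper sketching the warning the reviewers ask for."""
    if disable_tokenizer:
        warnings.warn(
            "Tokenizer and detokenizer are not initialized: prompts must be "
            "pre-tokenized, eos_token_id falls back to None (so EOS-based "
            "stopping is skipped), and detokenization is unavailable.")
```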
@@ -478,7 +487,7 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
             child_seqs.append((parent, parent))

         for seq, _ in child_seqs:
-            if seq_group.sampling_params.detokenize:
+            if seq_group.sampling_params.detokenize and self.detokenizer:
Review comment: Don't we need to add changes to the following, similar to what you did here? vllm/vllm/engine/llm_engine.py, lines 435 to 437 (at 0ce0539). Otherwise we will get an error when […]
                 self.detokenizer.decode_sequence_inplace(
                     seq, seq_group.sampling_params)
             self._check_stop(seq, seq_group.sampling_params)
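With the detokenizer guarded out, a request against a tokenizer-less engine has to supply token IDs and read token IDs back. A hedged end-to-end sketch under those assumptions, using the mainline `add_request`/`step`/`RequestOutput` interfaces and placeholder token IDs; since `eos_token_id` is `None` in this mode, the EOS id is passed explicitly via `stop_token_ids`:

```python
from vllm import EngineArgs, LLMEngine, SamplingParams

# Illustrative model name; disable_tokenizer is the flag added in this PR.
engine = LLMEngine.from_engine_args(
    EngineArgs(model="facebook/opt-125m", disable_tokenizer=True))

EOS_TOKEN_ID = 2  # placeholder; must be known out of band since there is no tokenizer

engine.add_request(
    request_id="0",
    prompt=None,                    # no raw text when the tokenizer is disabled
    prompt_token_ids=[1, 2, 3, 4],  # placeholder pre-tokenized prompt
    sampling_params=SamplingParams(
        max_tokens=16,
        stop_token_ids=[EOS_TOKEN_ID],  # explicit stop, since eos_token_id is None
        detokenize=False,               # no detokenizer, so skip text output
    ),
)

while engine.has_unfinished_requests():
    for request_output in engine.step():
        if request_output.finished:
            # No text is produced without a detokenizer; consume token IDs instead.
            print(request_output.outputs[0].token_ids)
```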
Review comment: nit: IMO this is slightly misleading because we're not simply disabling tokenization and detokenization but in fact the tokenizers and detokenizers are not initialized at all.
Reply: done