From 4338cc475029dcd37a291a867d52419122648e72 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Wed, 28 Jun 2023 09:46:58 -0700
Subject: [PATCH] [Tokenizer] Add an option to specify tokenizer (#284)

---
 benchmarks/benchmark_latency.py       |  2 +
 benchmarks/benchmark_serving.py       | 11 +---
 benchmarks/benchmark_throughput.py    | 35 ++++++-------
 vllm/config.py                        |  3 ++
 vllm/engine/arg_utils.py              |  7 ++-
 vllm/engine/llm_engine.py             |  6 ++-
 vllm/entrypoints/llm.py               |  3 ++
 vllm/entrypoints/openai/api_server.py |  2 +-
 vllm/transformers_utils/__init__.py   |  0
 .../tokenizer.py}                     | 52 +++++++++----------
 10 files changed, 61 insertions(+), 60 deletions(-)
 create mode 100644 vllm/transformers_utils/__init__.py
 rename vllm/{engine/tokenizer_utils.py => transformers_utils/tokenizer.py} (63%)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 1a215bdd552c..48cdffb8afcc 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -17,6 +17,7 @@ def main(args: argparse.Namespace):
     # the engine will automatically process the request in multiple batches.
     llm = LLM(
         model=args.model,
+        tokenizer=args.tokenizer,
         tensor_parallel_size=args.tensor_parallel_size,
         max_num_seqs=args.batch_size,
         max_num_batched_tokens=args.batch_size * args.input_len,
@@ -63,6 +64,7 @@ def run_to_completion(profile: bool = False):
         description='Benchmark the latency of processing a single batch of '
                     'requests till completion.')
     parser.add_argument('--model', type=str, default='facebook/opt-125m')
+    parser.add_argument('--tokenizer', type=str, default=None)
     parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
     parser.add_argument('--input-len', type=int, default=32)
     parser.add_argument('--output-len', type=int, default=128)
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 5e3af12f5b53..b0705ec0fe80 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -24,20 +24,13 @@

 import aiohttp
 import numpy as np
-from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase
+from transformers import PreTrainedTokenizerBase
+from vllm.transformers_utils.tokenizer import get_tokenizer

 # (prompt len, output len, latency)
 REQUEST_LATENCY: List[Tuple[int, int, float]] = []


-def get_tokenizer(model_name: str) -> PreTrainedTokenizerBase:
-    config = AutoConfig.from_pretrained(model_name)
-    if config.model_type == "llama":
-        # A workaround for potential protobuf errors.
-        model_name = "hf-internal-testing/llama-tokenizer"
-    return AutoTokenizer.from_pretrained(model_name)
-
-
 def sample_requests(
     dataset_path: str,
     num_requests: int,
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index c40145abcc00..9ae5aa9b42da 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -6,23 +6,11 @@
 from typing import List, Tuple

 import torch
-from transformers import (AutoConfig, AutoTokenizer, AutoModelForCausalLM,
-                          PreTrainedTokenizerBase)
+from transformers import AutoModelForCausalLM, PreTrainedTokenizerBase
 from tqdm import tqdm

 from vllm import LLM, SamplingParams
-
-
-def get_tokenizer(model_name: str) -> PreTrainedTokenizerBase:
-    config = AutoConfig.from_pretrained(model_name)
-    if config.model_type == "llama":
-        # A workaround for potential protobuf errors.
-        model_name = "hf-internal-testing/llama-tokenizer"
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        # To enable padding in the HF backend.
-        tokenizer.pad_token = tokenizer.eos_token
-        return tokenizer
-    return AutoTokenizer.from_pretrained(model_name)
+from vllm.transformers_utils.tokenizer import get_tokenizer


 def sample_requests(
@@ -74,6 +62,7 @@ def sample_requests(
 def run_vllm(
     requests: List[Tuple[str, int, int]],
     model: str,
+    tokenizer: str,
     tensor_parallel_size: int,
     seed: int,
     n: int,
@@ -81,6 +70,7 @@ def run_vllm(
 ) -> float:
     llm = LLM(
         model=model,
+        tokenizer=tokenizer,
         tensor_parallel_size=tensor_parallel_size,
         seed=seed,
     )
@@ -118,9 +108,10 @@ def run_hf(
     max_batch_size: int,
 ) -> float:
     assert not use_beam_search
-    tokenizer = get_tokenizer(model)
-    llm = AutoModelForCausalLM.from_pretrained(
-        model, torch_dtype=torch.float16)
+    llm = AutoModelForCausalLM.from_pretrained(model, torch_dtype=torch.float16)
+    if llm.config.model_type == "llama":
+        # To enable padding in the HF backend.
+        tokenizer.pad_token = tokenizer.eos_token
     llm = llm.cuda()

     pbar = tqdm(total=len(requests))
@@ -170,13 +161,13 @@ def main(args: argparse.Namespace):
     random.seed(args.seed)

     # Sample the requests.
-    tokenizer = get_tokenizer(args.model)
+    tokenizer = get_tokenizer(args.tokenizer)
     requests = sample_requests(args.dataset, args.num_prompts, tokenizer)

     if args.backend == "vllm":
         elapsed_time = run_vllm(
-            requests, args.model, args.tensor_parallel_size, args.seed, args.n,
-            args.use_beam_search)
+            requests, args.model, args.tokenizer, args.tensor_parallel_size,
+            args.seed, args.n, args.use_beam_search)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -198,6 +189,7 @@ def main(args: argparse.Namespace):
     parser.add_argument("--dataset", type=str, required=True,
                         help="Path to the dataset.")
     parser.add_argument("--model", type=str, default="facebook/opt-125m")
+    parser.add_argument("--tokenizer", type=str, default=None)
     parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
     parser.add_argument("--n", type=int, default=1,
                         help="Number of generated sequences per prompt.")
@@ -208,11 +200,14 @@ def main(args: argparse.Namespace):
     parser.add_argument("--hf-max-batch-size", type=int, default=None,
                         help="Maximum batch size for HF backend.")
     args = parser.parse_args()
+
     if args.backend == "vllm":
         if args.hf_max_batch_size is not None:
             raise ValueError("HF max batch size is only for HF backend.")
     elif args.backend == "hf":
         if args.hf_max_batch_size is None:
             raise ValueError("HF max batch size is required for HF backend.")
+    if args.tokenizer is None:
+        args.tokenizer = args.model

     main(args)
diff --git a/vllm/config.py b/vllm/config.py
index fd9550d524d7..a102df17c662 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -16,6 +16,7 @@ class ModelConfig:

     Args:
         model: Name or path of the huggingface model to use.
+        tokenizer: Name or path of the huggingface tokenizer to use.
         download_dir: Directory to download and load the weights, default to
             the default cache directory of huggingface.
         use_np_weights: Save a numpy copy of model weights for faster loading.
@@ -30,6 +31,7 @@ class ModelConfig:
     def __init__(
         self,
         model: str,
+        tokenizer: Optional[str],
         download_dir: Optional[str],
         use_np_weights: bool,
         use_dummy_weights: bool,
@@ -37,6 +39,7 @@ def __init__(
         seed: int,
     ) -> None:
         self.model = model
+        self.tokenizer = tokenizer
         self.download_dir = download_dir
         self.use_np_weights = use_np_weights
         self.use_dummy_weights = use_dummy_weights
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 10e6070b42c7..ffd39ddf0eb6 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -11,6 +11,7 @@ class EngineArgs:
     """Arguments for vLLM engine."""

     model: str
+    tokenizer: Optional[str] = None
     download_dir: Optional[str] = None
     use_np_weights: bool = False
     use_dummy_weights: bool = False
@@ -27,6 +28,8 @@ class EngineArgs:
     disable_log_stats: bool = False

     def __post_init__(self):
+        if self.tokenizer is None:
+            self.tokenizer = self.model
         self.max_num_seqs = min(self.max_num_seqs, self.max_num_batched_tokens)

     @staticmethod
@@ -37,6 +40,8 @@ def add_cli_args(
         # Model arguments
         parser.add_argument('--model', type=str, default='facebook/opt-125m',
                             help='name or path of the huggingface model to use')
+        parser.add_argument('--tokenizer', type=str, default=EngineArgs.tokenizer,
+                            help='name or path of the huggingface tokenizer to use')
         parser.add_argument('--download-dir', type=str,
                             default=EngineArgs.download_dir,
                             help='directory to download and load the weights, '
@@ -104,7 +109,7 @@ def create_engine_configs(
     ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig]:
         # Initialize the configs.
         model_config = ModelConfig(
-            self.model, self.download_dir, self.use_np_weights,
+            self.model, self.tokenizer, self.download_dir, self.use_np_weights,
             self.use_dummy_weights, self.dtype, self.seed)
         cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization,
                                    self.swap_space)
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index e9d616febc14..f7b4197387ae 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -6,11 +6,12 @@
 from vllm.core.scheduler import Scheduler
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.ray_utils import DeviceID, initialize_cluster, ray
-from vllm.engine.tokenizer_utils import detokenize_incrementally, get_tokenizer
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
+from vllm.transformers_utils.tokenizer import (detokenize_incrementally,
+                                               get_tokenizer)
 from vllm.utils import Counter
 from vllm.worker.worker import Worker

@@ -59,6 +60,7 @@ def __init__(
         logger.info(
             "Initializing an LLM engine with config: "
             f"model={model_config.model!r}, "
+            f"tokenizer={model_config.tokenizer!r}, "
             f"dtype={model_config.dtype}, "
             f"use_dummy_weights={model_config.use_dummy_weights}, "
             f"download_dir={model_config.download_dir!r}, "
@@ -75,7 +77,7 @@ def __init__(
         self.log_stats = log_stats
         self._verify_args()

-        self.tokenizer = get_tokenizer(model_config.model)
+        self.tokenizer = get_tokenizer(model_config.tokenizer)
         self.seq_counter = Counter()

         # Create the parallel GPU workers.
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 1895103b123a..6cffd26829c4 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -25,6 +25,7 @@ class LLM:

     Args:
         model: The name or path of a HuggingFace Transformers model.
+        tokenizer: The name or path of a HuggingFace Transformers tokenizer.
         tensor_parallel_size: The number of GPUs to use for distributed
             execution with tensor parallelism.
         dtype: The data type for the model weights and activations. Currently,
@@ -38,6 +39,7 @@ class LLM:
     def __init__(
         self,
         model: str,
+        tokenizer: Optional[str] = None,
         tensor_parallel_size: int = 1,
         dtype: str = "auto",
         seed: int = 0,
@@ -47,6 +49,7 @@ def __init__(
             kwargs["disable_log_stats"] = True
         engine_args = EngineArgs(
             model=model,
+            tokenizer=tokenizer,
             tensor_parallel_size=tensor_parallel_size,
             dtype=dtype,
             seed=seed,
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index bb417eaf3c2e..3b51e2d99ac4 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -15,7 +15,6 @@

 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
-from vllm.engine.tokenizer_utils import get_tokenizer
 from vllm.entrypoints.openai.protocol import (
     CompletionRequest, CompletionResponse, CompletionResponseChoice,
     CompletionResponseStreamChoice, CompletionStreamResponse, ErrorResponse,
@@ -23,6 +22,7 @@
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
+from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.utils import random_uuid

 TIMEOUT_KEEP_ALIVE = 5  # seconds
diff --git a/vllm/transformers_utils/__init__.py b/vllm/transformers_utils/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/vllm/engine/tokenizer_utils.py b/vllm/transformers_utils/tokenizer.py
similarity index 63%
rename from vllm/engine/tokenizer_utils.py
rename to vllm/transformers_utils/tokenizer.py
index 2b082c33a216..e3debc882431 100644
--- a/vllm/engine/tokenizer_utils.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -1,46 +1,44 @@
 from typing import List, Tuple, Union

-from transformers import (AutoConfig, AutoTokenizer, PreTrainedTokenizer,
+from transformers import (AutoTokenizer, PreTrainedTokenizer,
                           PreTrainedTokenizerFast)

 from vllm.logger import init_logger

 logger = init_logger(__name__)

-_MODEL_TYPES_WITH_SLOW_TOKENIZER = []
+# A fast LLaMA tokenizer with the pre-processed `tokenizer.json` file.
+_FAST_LLAMA_TOKENIZER = "hf-internal-testing/llama-tokenizer"


 def get_tokenizer(
-    model_name: str,
+    tokenizer_name: str,
     *args,
     **kwargs,
 ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
     """Gets a tokenizer for the given model name via Huggingface."""
-    config = AutoConfig.from_pretrained(model_name)
-    if "open_llama" in model_name:
-        kwargs["use_fast"] = False
+    if "llama" in tokenizer_name.lower() and kwargs.get("use_fast", True):
         logger.info(
-            "OpenLLaMA models do not support the fast tokenizer. "
-            "Using the slow tokenizer instead.")
-    elif config.model_type == "llama" and kwargs.get("use_fast", True):
-        # LLaMA fast tokenizer causes protobuf errors in some environments.
-        # However, we found that the below LLaMA fast tokenizer works well in
-        # most environments.
-        model_name = "hf-internal-testing/llama-tokenizer"
-        logger.info(
-            f"Using the LLaMA fast tokenizer in '{model_name}' to avoid "
-            "potential protobuf errors.")
-    elif config.model_type in _MODEL_TYPES_WITH_SLOW_TOKENIZER:
-        if kwargs.get("use_fast", False) == True:
-            raise ValueError(
-                f"Cannot use the fast tokenizer for {config.model_type} due to "
-                "bugs in the fast tokenizer.")
-        logger.info(
-            f"Using the slow tokenizer for {config.model_type} due to bugs in "
-            "the fast tokenizer. This could potentially lead to performance "
-            "degradation.")
-        kwargs["use_fast"] = False
-    return AutoTokenizer.from_pretrained(model_name, *args, **kwargs)
+            "For some LLaMA-based models, initializing the fast tokenizer may "
+            "take a long time. To eliminate the initialization time, consider "
+            f"using '{_FAST_LLAMA_TOKENIZER}' instead of the original "
+            "tokenizer.")
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, *args,
+                                                  **kwargs)
+    except TypeError as e:
+        # The LLaMA tokenizer causes a protobuf error in some environments.
+        err_msg = (
+            "Failed to load the tokenizer. If you are using a LLaMA-based "
+            f"model, use '{_FAST_LLAMA_TOKENIZER}' instead of the original "
+            "tokenizer.")
+        raise RuntimeError(err_msg) from e
+
+    if not isinstance(tokenizer, PreTrainedTokenizerFast):
+        logger.warning(
+            "Using a slow tokenizer. This might cause a significant "
+            "slowdown. Consider using a fast tokenizer instead.")
+    return tokenizer


 def detokenize_incrementally(
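A short usage sketch of the option this patch introduces (illustrative only, not part of the patch; the LLaMA model name below is a placeholder):

    from vllm import LLM
    from vllm.transformers_utils.tokenizer import get_tokenizer

    # By default the tokenizer name falls back to the model name.
    llm = LLM(model="facebook/opt-125m")

    # A different tokenizer can now be specified explicitly, e.g. the fast
    # LLaMA tokenizer for a LLaMA-based model.
    llm = LLM(model="<your-llama-model>",
              tokenizer="hf-internal-testing/llama-tokenizer")

    # The relocated helper accepts a tokenizer name or path directly.
    tokenizer = get_tokenizer("hf-internal-testing/llama-tokenizer")

    # On the command line, EngineArgs.add_cli_args and the benchmark scripts
    # gain a --tokenizer flag that defaults to --model when omitted:
    #   python benchmarks/benchmark_throughput.py --dataset <path> \
    #       --model <model> --tokenizer <tokenizer>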