diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 15f971b66e3b..8badc16d0cb7 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -13,7 +13,7 @@ steps: - label: Basic Correctness Test command: pytest -v -s --forked basic_correctness - + - label: Core Test command: pytest -v -s core @@ -28,7 +28,7 @@ steps: num_gpus: 2 # only support 1 or 2 for now. - label: Engine Test - command: pytest -v -s engine + command: pytest -v -s engine tokenization test_sequence.py - label: Entrypoints Test command: pytest -v -s entrypoints @@ -52,6 +52,9 @@ steps: - label: Worker Test command: pytest -v -s worker +- label: Speculative decoding tests + command: pytest -v -s spec_decode + - label: LoRA Test command: pytest -v -s lora --forked diff --git a/.github/ISSUE_TEMPLATE/100-documentation.yml b/.github/ISSUE_TEMPLATE/100-documentation.yml new file mode 100644 index 000000000000..501c0aa48b88 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/100-documentation.yml @@ -0,0 +1,22 @@ +name: 📚 Documentation +description: Report an issue related to https://docs.vllm.ai/ +title: "[Doc]: " +labels: ["documentation"] + +body: +- type: textarea + attributes: + label: 📚 The doc issue + description: > + A clear and concise description of what content in https://docs.vllm.ai/ is an issue. + validations: + required: true +- type: textarea + attributes: + label: Suggest a potential alternative/fix + description: > + Tell us how we could improve the documentation in this regard. +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! diff --git a/.github/ISSUE_TEMPLATE/200-installation.yml b/.github/ISSUE_TEMPLATE/200-installation.yml new file mode 100644 index 000000000000..4c6c96187cc6 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/200-installation.yml @@ -0,0 +1,39 @@ +name: 🛠️ Installation +description: Report an issue here when you hit errors during installation. +title: "[Installation]: " +labels: ["installation"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: Your current environment + description: | + Please run the following and paste the output below. + ```sh + wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py + # For security purposes, please feel free to check the contents of collect_env.py before running it. + python collect_env.py + ``` + value: | + ```text + The output of `python collect_env.py` + ``` + validations: + required: true +- type: textarea + attributes: + label: How you are installing vllm + description: | + Paste the full command you are trying to execute. + value: | + ```sh + pip install -vvv vllm + ``` +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! diff --git a/.github/ISSUE_TEMPLATE/300-usage.yml b/.github/ISSUE_TEMPLATE/300-usage.yml new file mode 100644 index 000000000000..88227b4b2e7b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/300-usage.yml @@ -0,0 +1,37 @@ +name: 💻 Usage +description: Raise an issue here if you don't know how to use vllm. 
+title: "[Usage]: " +labels: ["usage"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: Your current environment + description: | + Please run the following and paste the output below. + ```sh + wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py + # For security purposes, please feel free to check the contents of collect_env.py before running it. + python collect_env.py + ``` + value: | + ```text + The output of `python collect_env.py` + ``` + validations: + required: true +- type: textarea + attributes: + label: How would you like to use vllm + description: | + A detailed description of how you want to use vllm. + value: | + I want to run inference of a [specific model](put link here). I don't know how to integrate it with vllm. +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! diff --git a/.github/ISSUE_TEMPLATE/400-bug report.yml b/.github/ISSUE_TEMPLATE/400-bug report.yml new file mode 100644 index 000000000000..f1124dfa78bb --- /dev/null +++ b/.github/ISSUE_TEMPLATE/400-bug report.yml @@ -0,0 +1,81 @@ +name: 🐛 Bug report +description: Raise an issue here if you find a bug. +title: "[Bug]: " +labels: ["bug"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: Your current environment + description: | + Please run the following and paste the output below. + ```sh + wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py + # For security purposes, please feel free to check the contents of collect_env.py before running it. + python collect_env.py + ``` + value: | + ```text + The output of `python collect_env.py` + ``` + validations: + required: true +- type: textarea + attributes: + label: 🐛 Describe the bug + description: | + Please provide a clear and concise description of what the bug is. + + If relevant, add a minimal example so that we can reproduce the error by running the code. It is very important for the snippet to be as succinct (minimal) as possible, so please take time to trim down any irrelevant code to help us debug efficiently. We are going to copy-paste your code and we expect to get the same result as you did: avoid any external data, and include the relevant imports, etc. For example: + + ```python + from vllm import LLM, SamplingParams + + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + llm = LLM(model="facebook/opt-125m") + + outputs = llm.generate(prompts, sampling_params) + + # Print the outputs. + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + ``` + + If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com. + + Please also paste or describe the results you observe instead of the expected results. 
If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````. + placeholder: | + A clear and concise description of what the bug is. + + ```python + # Sample code to reproduce the problem + ``` + + ``` + The error message you got, with the full traceback. + ``` + validations: + required: true +- type: markdown + attributes: + value: > + ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output: + + - Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc). + + - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect. + + Thanks for contributing 🎉! diff --git a/.github/ISSUE_TEMPLATE/500-feature request.yml b/.github/ISSUE_TEMPLATE/500-feature request.yml new file mode 100644 index 000000000000..47a90628c76c --- /dev/null +++ b/.github/ISSUE_TEMPLATE/500-feature request.yml @@ -0,0 +1,31 @@ +name: 🚀 Feature request +description: Submit a proposal/request for a new vllm feature +title: "[Feature]: " +labels: ["feature request"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: 🚀 The feature, motivation and pitch + description: > + A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too. + validations: + required: true +- type: textarea + attributes: + label: Alternatives + description: > + A description of any alternative solutions or features you've considered, if any. +- type: textarea + attributes: + label: Additional context + description: > + Add any other context or screenshots about the feature request. +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! diff --git a/.github/ISSUE_TEMPLATE/600-new model.yml b/.github/ISSUE_TEMPLATE/600-new model.yml new file mode 100644 index 000000000000..bbddbfd67138 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/600-new model.yml @@ -0,0 +1,33 @@ +name: 🤗 Support request for a new model from huggingface +description: Submit a proposal/request for a new model from huggingface +title: "[New Model]: " +labels: ["new model"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). + + #### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model. +- type: textarea + attributes: + label: The model to consider. + description: > + A huggingface url, pointing to the model, e.g. https://huggingface.co/openai-community/gpt2 . 
+ validations: + required: true +- type: textarea + attributes: + label: The closest model vllm already supports. + description: > + Here is the list of models already supported by vllm: https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models . Which model is the most similar to the model you want to add support for? +- type: textarea + attributes: + label: What's your difficulty of supporting the model you want? + description: > + For example, any new operators or new architecture? +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! diff --git a/.github/ISSUE_TEMPLATE/700-performance discussion.yml b/.github/ISSUE_TEMPLATE/700-performance discussion.yml new file mode 100644 index 000000000000..9e8e7b4aa353 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/700-performance discussion.yml @@ -0,0 +1,51 @@ +name: ⚡ Discussion on the performance of vllm +description: Submit a proposal/discussion about the performance of vllm +title: "[Performance]: " +labels: ["performance"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: Proposal to improve performance + description: > + How do you plan to improve vllm's performance? + validations: + required: false +- type: textarea + attributes: + label: Report of performance regression + description: > + Please provide detailed description of performance comparison to confirm the regression. You may want to run the benchmark script at https://github.com/vllm-project/vllm/tree/main/benchmarks . + validations: + required: false +- type: textarea + attributes: + label: Misc discussion on performance + description: > + Anything about the performance. + validations: + required: false +- type: textarea + attributes: + label: Your current environment (if you think it is necessary) + description: | + Please run the following and paste the output below. + ```sh + wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py + # For security purposes, please feel free to check the contents of collect_env.py before running it. + python collect_env.py + ``` + value: | + ```text + The output of `python collect_env.py` + ``` + validations: + required: false +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! diff --git a/.github/ISSUE_TEMPLATE/800-misc discussion.yml b/.github/ISSUE_TEMPLATE/800-misc discussion.yml new file mode 100644 index 000000000000..ddb10f72db29 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/800-misc discussion.yml @@ -0,0 +1,21 @@ +name: 🎲 Misc/random discussions that do not fit into the above categories. +description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues. +title: "[Misc]: " +labels: ["misc"] + +body: +- type: markdown + attributes: + value: > + #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). +- type: textarea + attributes: + label: Anything you want to discuss about vllm. + description: > + Anything you want to discuss about vllm. + validations: + required: true +- type: markdown + attributes: + value: > + Thanks for contributing 🎉! 
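Note: the eight templates above all follow GitHub's issue-forms schema, and the `config.yml` added next sets `blank_issues_enabled: false`, so every new issue has to go through one of these structured forms. As a rough sketch (not part of this patch; only the field names mirror the templates above, the values are placeholders), the layout they share looks like this:

```yaml
# Illustrative issue-form skeleton; only the field names are taken from this patch.
name: Example report
description: One-line summary shown in GitHub's template chooser.
title: "[Example]: "            # pre-filled prefix for the issue title
labels: ["example"]             # labels applied automatically on submission

body:
- type: markdown                # static guidance rendered at the top of the form
  attributes:
    value: >
      #### Please search the existing issues before filing a new one.
- type: textarea                # free-form field filled in by the reporter
  attributes:
    label: What happened?
  validations:
    required: true              # the form cannot be submitted while this is empty
```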
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 000000000000..3ba13e0cec6c --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1 @@ +blank_issues_enabled: false diff --git a/.yapfignore b/.yapfignore new file mode 100644 index 000000000000..2d6dcf8380ca --- /dev/null +++ b/.yapfignore @@ -0,0 +1 @@ +collect_env.py diff --git a/Dockerfile b/Dockerfile index dd4867702d3d..8be03b3567f0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -57,6 +57,22 @@ ENV VLLM_INSTALL_PUNICA_KERNELS=1 RUN python3 setup.py build_ext --inplace #################### EXTENSION Build IMAGE #################### +#################### FLASH_ATTENTION Build IMAGE #################### +FROM dev as flash-attn-builder +# max jobs used for build +ARG max_jobs=2 +ENV MAX_JOBS=${max_jobs} +# flash attention version +ARG flash_attn_version=v2.5.6 +ENV FLASH_ATTN_VERSION=${flash_attn_version} + +WORKDIR /usr/src/flash-attention-v2 + +# Download the wheel or build it if a pre-compiled release doesn't exist +RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \ + --no-build-isolation --no-deps --no-cache-dir + +#################### FLASH_ATTENTION Build IMAGE #################### #################### TEST IMAGE #################### # image to run unit testing suite @@ -68,6 +84,9 @@ WORKDIR /vllm-workspace # ADD is used to preserve directory structure ADD . /vllm-workspace/ COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/ +# Install flash attention (from pre-built wheel) +RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \ + pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir # ignore build dependencies installation because we are using pre-complied extensions RUN rm pyproject.toml RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . 
--verbose @@ -88,6 +107,11 @@ WORKDIR /workspace COPY requirements.txt requirements.txt RUN --mount=type=cache,target=/root/.cache/pip \ pip install -r requirements.txt + +# Install flash attention (from pre-built wheel) +RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \ + pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir + #################### RUNTIME BASE IMAGE #################### @@ -96,7 +120,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ FROM vllm-base AS vllm-openai # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ - pip install accelerate + pip install accelerate hf_transfer COPY --from=build /workspace/vllm/*.so /workspace/vllm/ COPY vllm vllm diff --git a/benchmarks/kernels/benchmark_mixtral_moe.py b/benchmarks/kernels/benchmark_mixtral_moe.py index 9e08df76947f..964eca5aaf72 100644 --- a/benchmarks/kernels/benchmark_mixtral_moe.py +++ b/benchmarks/kernels/benchmark_mixtral_moe.py @@ -2,13 +2,13 @@ import os import sys -os.environ['CUDA_VISIBLE_DEVICES'] = '0' - -from vllm.model_executor.layers.fused_moe import fused_moe +from vllm.model_executor.layers.fused_moe import fused_moe, get_config_file_name import torch import torch.nn.functional as F import triton +os.environ['CUDA_VISIBLE_DEVICES'] = '0' + def main(): method = fused_moe @@ -64,7 +64,7 @@ def run_grid(bs, method): print(f'{tp_size=} {bs=}') print(f'{config}') # warmup - print(f'warming up') + print('warming up') try: for _ in range(num_warmup_trials): run_timing( @@ -82,7 +82,7 @@ def run_grid(bs, method): continue # trial - print(f'benchmarking') + print('benchmarking') for _ in range(num_trials): kernel_dur_ms = run_timing( num_calls=num_calls, @@ -103,17 +103,25 @@ def run_grid(bs, method): best_config = config best_time_us = kernel_dur_us - print( - f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f} {bs=} {tp_size=} {top_k=} {num_total_experts=} {d_model=} {model_intermediate_size=} {num_layers=}' - ) + print(f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f}' + f' {bs=} {tp_size=} {top_k=} {num_total_experts=} ' + f'{d_model=} {model_intermediate_size=} {num_layers=}') print("best_time_us", best_time_us) print("best_config", best_config) - filename = "/tmp/config.jsonl" + # holds Dict[str, Dict[str, int]] + filename = get_config_file_name(num_total_experts, + model_intermediate_size // tp_size) print(f"writing config to file {filename}") - with open(filename, "a") as f: - f.write(json.dumps({str(bs): best_config}) + "\n") + existing_content = {} + if os.path.exists(filename): + with open(filename, "r") as f: + existing_content = json.load(f) + existing_content[str(bs)] = best_config + with open(filename, "w") as f: + json.dump(existing_content, f, indent=4) + f.write("\n") def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int, diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py new file mode 100644 index 000000000000..f9564dd9588f --- /dev/null +++ b/benchmarks/kernels/benchmark_rope.py @@ -0,0 +1,120 @@ +from typing import Optional + +import argparse +import torch +import nvtx +from itertools import accumulate +from vllm.model_executor.layers.rotary_embedding import get_rope + + +def benchmark_rope_kernels_multi_lora( + is_neox_style: bool, + batch_size: int, + seq_len: int, + num_heads: int, + head_size: int, + rotary_dim: Optional[int], + dtype: torch.dtype, + seed: int, + device: str, + max_position: int = 8192, + base: int = 
10000, +) -> None: + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.set_default_device(device) + if rotary_dim is None: + rotary_dim = head_size + # silulating serving 4 LoRAs + scaling_factors = [1, 2, 4, 8] + # batched RoPE can take multiple scaling factors + batched_rope = get_rope(head_size, rotary_dim, max_position, base, + is_neox_style, { + "type": "linear", + "factor": tuple(scaling_factors) + }) + # non-batched RoPE takes only one scaling factor, we create multiple + # instances to simulate the same behavior + non_batched_ropes = [] + for scaling_factor in scaling_factors: + non_batched_ropes.append( + get_rope(head_size, rotary_dim, max_position, base, is_neox_style, + { + "type": "linear", + "factor": (scaling_factor, ) + })) + + positions = torch.randint(0, max_position, (batch_size, seq_len)) + query = torch.randn(batch_size, + seq_len, + num_heads * head_size, + dtype=dtype) + key = torch.randn_like(query) + + # create query offsets for batched RoPE, we concat multiple kv cache + # together and each query needs to find the right kv cache of its type + offset_map = torch.tensor( + list( + accumulate([0] + [ + max_position * scaling_factor * 2 + for scaling_factor in scaling_factors[:-1] + ]))) + query_types = torch.randint(0, + len(scaling_factors), (batch_size, seq_len), + device=device) + # map query types to offsets + query_offsets = offset_map[query_types] + # the kernel takes flattened offsets + flatten_offsets = query_offsets.flatten() + + # batched queries of the same type together for non-batched RoPE + queries = [query[query_types == i] for i in range(len(scaling_factors))] + keys = [key[query_types == i] for i in range(len(scaling_factors))] + packed_qkr = zip(queries, keys, non_batched_ropes) + # synchronize before start timing + torch.cuda.synchronize() + with nvtx.annotate("non-batched", color="yellow"): + for q, k, r in packed_qkr: + r.forward(positions, q, k) + torch.cuda.synchronize() + with nvtx.annotate("batched", color="green"): + batched_rope.forward(positions, query, key, flatten_offsets) + torch.cuda.synchronize() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description="Benchmark the rotary embedding kernels.") + parser.add_argument("--is-neox-style", type=bool, default=True) + parser.add_argument("--batch-size", type=int, default=16) + parser.add_argument("--seq-len", type=int, default=512) + parser.add_argument("--num-heads", type=int, default=8) + parser.add_argument("--head-size", + type=int, + choices=[64, 80, 96, 112, 128, 256], + default=128) + parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32) + parser.add_argument("--dtype", + type=str, + choices=["bfloat16", "float"], + default="float") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--device", + type=str, + choices=["cuda:0", "cuda:1"], + default="cuda:0") + args = parser.parse_args() + print(args) + + benchmark_rope_kernels_multi_lora( + is_neox_style=args.is_neox_style, + batch_size=args.batch_size, + seq_len=args.seq_len, + num_heads=args.num_heads, + head_size=args.head_size, + rotary_dim=args.rotary_dim, + dtype=getattr(torch, args.dtype), + seed=args.seed, + device=args.device, + ) diff --git a/collect_env.py b/collect_env.py new file mode 100644 index 000000000000..a886db693e2f --- /dev/null +++ b/collect_env.py @@ -0,0 +1,688 @@ +# code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py + +# Unlike the rest of the 
PyTorch this file must be python2 compliant. +# This script outputs relevant system environment info +# Run it with `python collect_env.py` or `python -m torch.utils.collect_env` +import datetime +import locale +import re +import subprocess +import sys +import os +from collections import namedtuple + + +try: + import torch + TORCH_AVAILABLE = True +except (ImportError, NameError, AttributeError, OSError): + TORCH_AVAILABLE = False + +# System Environment Information +SystemEnv = namedtuple('SystemEnv', [ + 'torch_version', + 'is_debug_build', + 'cuda_compiled_version', + 'gcc_version', + 'clang_version', + 'cmake_version', + 'os', + 'libc_version', + 'python_version', + 'python_platform', + 'is_cuda_available', + 'cuda_runtime_version', + 'cuda_module_loading', + 'nvidia_driver_version', + 'nvidia_gpu_models', + 'cudnn_version', + 'pip_version', # 'pip' or 'pip3' + 'pip_packages', + 'conda_packages', + 'hip_compiled_version', + 'hip_runtime_version', + 'miopen_runtime_version', + 'caching_allocator_config', + 'is_xnnpack_available', + 'cpu_info', + 'rocm_version', # vllm specific field + 'neuron_sdk_version', # vllm specific field + 'vllm_version', # vllm specific field + 'vllm_build_flags', # vllm specific field + 'gpu_topo', # vllm specific field +]) + +DEFAULT_CONDA_PATTERNS = { + "torch", + "numpy", + "cudatoolkit", + "soumith", + "mkl", + "magma", + "triton", + "optree", +} + +DEFAULT_PIP_PATTERNS = { + "torch", + "numpy", + "mypy", + "flake8", + "triton", + "optree", + "onnx", +} + + +def run(command): + """Return (return-code, stdout, stderr).""" + shell = True if type(command) is str else False + p = subprocess.Popen(command, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, shell=shell) + raw_output, raw_err = p.communicate() + rc = p.returncode + if get_platform() == 'win32': + enc = 'oem' + else: + enc = locale.getpreferredencoding() + output = raw_output.decode(enc) + err = raw_err.decode(enc) + return rc, output.strip(), err.strip() + + +def run_and_read_all(run_lambda, command): + """Run command using run_lambda; reads and returns entire output if rc is 0.""" + rc, out, _ = run_lambda(command) + if rc != 0: + return None + return out + + +def run_and_parse_first_match(run_lambda, command, regex): + """Run command using run_lambda, returns the first regex match if it exists.""" + rc, out, _ = run_lambda(command) + if rc != 0: + return None + match = re.search(regex, out) + if match is None: + return None + return match.group(1) + +def run_and_return_first_line(run_lambda, command): + """Run command using run_lambda and returns first line if output is not empty.""" + rc, out, _ = run_lambda(command) + if rc != 0: + return None + return out.split('\n')[0] + + +def get_conda_packages(run_lambda, patterns=None): + if patterns is None: + patterns = DEFAULT_CONDA_PATTERNS + conda = os.environ.get('CONDA_EXE', 'conda') + out = run_and_read_all(run_lambda, "{} list".format(conda)) + if out is None: + return out + + return "\n".join( + line + for line in out.splitlines() + if not line.startswith("#") + and any(name in line for name in patterns) + ) + +def get_gcc_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'gcc --version', r'gcc (.*)') + +def get_clang_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'clang --version', r'clang version (.*)') + + +def get_cmake_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'cmake --version', r'cmake (.*)') + + +def get_nvidia_driver_version(run_lambda): + if get_platform() == 'darwin': + 
cmd = 'kextstat | grep -i cuda' + return run_and_parse_first_match(run_lambda, cmd, + r'com[.]nvidia[.]CUDA [(](.*?)[)]') + smi = get_nvidia_smi() + return run_and_parse_first_match(run_lambda, smi, r'Driver Version: (.*?) ') + + +def get_gpu_info(run_lambda): + if get_platform() == 'darwin' or (TORCH_AVAILABLE and hasattr(torch.version, 'hip') and torch.version.hip is not None): + if TORCH_AVAILABLE and torch.cuda.is_available(): + if torch.version.hip is not None: + prop = torch.cuda.get_device_properties(0) + if hasattr(prop, "gcnArchName"): + gcnArch = " ({})".format(prop.gcnArchName) + else: + gcnArch = "NoGCNArchNameOnOldPyTorch" + else: + gcnArch = "" + return torch.cuda.get_device_name(None) + gcnArch + return None + smi = get_nvidia_smi() + uuid_regex = re.compile(r' \(UUID: .+?\)') + rc, out, _ = run_lambda(smi + ' -L') + if rc != 0: + return None + # Anonymize GPUs by removing their UUID + return re.sub(uuid_regex, '', out) + + +def get_running_cuda_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'nvcc --version', r'release .+ V(.*)') + + +def get_cudnn_version(run_lambda): + """Return a list of libcudnn.so; it's hard to tell which one is being used.""" + if get_platform() == 'win32': + system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows') + cuda_path = os.environ.get('CUDA_PATH', "%CUDA_PATH%") + where_cmd = os.path.join(system_root, 'System32', 'where') + cudnn_cmd = '{} /R "{}\\bin" cudnn*.dll'.format(where_cmd, cuda_path) + elif get_platform() == 'darwin': + # CUDA libraries and drivers can be found in /usr/local/cuda/. See + # https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#install + # https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installmac + # Use CUDNN_LIBRARY when cudnn library is installed elsewhere. 
+ cudnn_cmd = 'ls /usr/local/cuda/lib/libcudnn*' + else: + cudnn_cmd = 'ldconfig -p | grep libcudnn | rev | cut -d" " -f1 | rev' + rc, out, _ = run_lambda(cudnn_cmd) + # find will return 1 if there are permission errors or if not found + if len(out) == 0 or (rc != 1 and rc != 0): + l = os.environ.get('CUDNN_LIBRARY') + if l is not None and os.path.isfile(l): + return os.path.realpath(l) + return None + files_set = set() + for fn in out.split('\n'): + fn = os.path.realpath(fn) # eliminate symbolic links + if os.path.isfile(fn): + files_set.add(fn) + if not files_set: + return None + # Alphabetize the result because the order is non-deterministic otherwise + files = sorted(files_set) + if len(files) == 1: + return files[0] + result = '\n'.join(files) + return 'Probably one of the following:\n{}'.format(result) + + +def get_nvidia_smi(): + # Note: nvidia-smi is currently available only on Windows and Linux + smi = 'nvidia-smi' + if get_platform() == 'win32': + system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows') + program_files_root = os.environ.get('PROGRAMFILES', 'C:\\Program Files') + legacy_path = os.path.join(program_files_root, 'NVIDIA Corporation', 'NVSMI', smi) + new_path = os.path.join(system_root, 'System32', smi) + smis = [new_path, legacy_path] + for candidate_smi in smis: + if os.path.exists(candidate_smi): + smi = '"{}"'.format(candidate_smi) + break + return smi + + +def get_rocm_version(run_lambda): + """Returns the ROCm version if available, otherwise 'N/A'.""" + return run_and_parse_first_match(run_lambda, 'hipcc --version', r'HIP version: (\S+)') + + +def get_neuron_sdk_version(run_lambda): + # Adapted from your install script + try: + result = run_lambda(["neuron-ls"]) + return result if result[0] == 0 else 'N/A' + except Exception: + return 'N/A' + + +def get_vllm_version(): + try: + import vllm + return vllm.__version__ + except ImportError: + return 'N/A' + + +def summarize_vllm_build_flags(): + # This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc. 
+ return 'CUDA Archs: {}; ROCm: {}; Neuron: {}'.format( + os.environ.get('TORCH_CUDA_ARCH_LIST', 'Not Set'), + 'Enabled' if os.environ.get('ROCM_HOME') else 'Disabled', + 'Enabled' if os.environ.get('NEURON_CORES') else 'Disabled', + ) + + +def get_gpu_topo(run_lambda): + if get_platform() == 'linux': + return run_and_read_all(run_lambda, 'nvidia-smi topo -m') + return None + + +# example outputs of CPU infos +# * linux +# Architecture: x86_64 +# CPU op-mode(s): 32-bit, 64-bit +# Address sizes: 46 bits physical, 48 bits virtual +# Byte Order: Little Endian +# CPU(s): 128 +# On-line CPU(s) list: 0-127 +# Vendor ID: GenuineIntel +# Model name: Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz +# CPU family: 6 +# Model: 106 +# Thread(s) per core: 2 +# Core(s) per socket: 32 +# Socket(s): 2 +# Stepping: 6 +# BogoMIPS: 5799.78 +# Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr +# sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon rep_good nopl +# xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq monitor ssse3 fma cx16 +# pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand +# hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced +# fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid avx512f avx512dq rdseed adx smap +# avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 +# xsaves wbnoinvd ida arat avx512vbmi pku ospke avx512_vbmi2 gfni vaes vpclmulqdq +# avx512_vnni avx512_bitalg tme avx512_vpopcntdq rdpid md_clear flush_l1d arch_capabilities +# Virtualization features: +# Hypervisor vendor: KVM +# Virtualization type: full +# Caches (sum of all): +# L1d: 3 MiB (64 instances) +# L1i: 2 MiB (64 instances) +# L2: 80 MiB (64 instances) +# L3: 108 MiB (2 instances) +# NUMA: +# NUMA node(s): 2 +# NUMA node0 CPU(s): 0-31,64-95 +# NUMA node1 CPU(s): 32-63,96-127 +# Vulnerabilities: +# Itlb multihit: Not affected +# L1tf: Not affected +# Mds: Not affected +# Meltdown: Not affected +# Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown +# Retbleed: Not affected +# Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp +# Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization +# Spectre v2: Mitigation; Enhanced IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence +# Srbds: Not affected +# Tsx async abort: Not affected +# * win32 +# Architecture=9 +# CurrentClockSpeed=2900 +# DeviceID=CPU0 +# Family=179 +# L2CacheSize=40960 +# L2CacheSpeed= +# Manufacturer=GenuineIntel +# MaxClockSpeed=2900 +# Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz +# ProcessorType=3 +# Revision=27142 +# +# Architecture=9 +# CurrentClockSpeed=2900 +# DeviceID=CPU1 +# Family=179 +# L2CacheSize=40960 +# L2CacheSpeed= +# Manufacturer=GenuineIntel +# MaxClockSpeed=2900 +# Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz +# ProcessorType=3 +# Revision=27142 + +def get_cpu_info(run_lambda): + rc, out, err = 0, '', '' + if get_platform() == 'linux': + rc, out, err = run_lambda('lscpu') + elif get_platform() == 'win32': + rc, out, err = run_lambda('wmic cpu get Name,Manufacturer,Family,Architecture,ProcessorType,DeviceID, \ + CurrentClockSpeed,MaxClockSpeed,L2CacheSize,L2CacheSpeed,Revision /VALUE') + elif get_platform() == 'darwin': + rc, out, err = run_lambda("sysctl -n machdep.cpu.brand_string") + cpu_info = 'None' + if rc == 0: + cpu_info = out + 
else: + cpu_info = err + return cpu_info + + +def get_platform(): + if sys.platform.startswith('linux'): + return 'linux' + elif sys.platform.startswith('win32'): + return 'win32' + elif sys.platform.startswith('cygwin'): + return 'cygwin' + elif sys.platform.startswith('darwin'): + return 'darwin' + else: + return sys.platform + + +def get_mac_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'sw_vers -productVersion', r'(.*)') + + +def get_windows_version(run_lambda): + system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows') + wmic_cmd = os.path.join(system_root, 'System32', 'Wbem', 'wmic') + findstr_cmd = os.path.join(system_root, 'System32', 'findstr') + return run_and_read_all(run_lambda, '{} os get Caption | {} /v Caption'.format(wmic_cmd, findstr_cmd)) + + +def get_lsb_version(run_lambda): + return run_and_parse_first_match(run_lambda, 'lsb_release -a', r'Description:\t(.*)') + + +def check_release_file(run_lambda): + return run_and_parse_first_match(run_lambda, 'cat /etc/*-release', + r'PRETTY_NAME="(.*)"') + + +def get_os(run_lambda): + from platform import machine + platform = get_platform() + + if platform == 'win32' or platform == 'cygwin': + return get_windows_version(run_lambda) + + if platform == 'darwin': + version = get_mac_version(run_lambda) + if version is None: + return None + return 'macOS {} ({})'.format(version, machine()) + + if platform == 'linux': + # Ubuntu/Debian based + desc = get_lsb_version(run_lambda) + if desc is not None: + return '{} ({})'.format(desc, machine()) + + # Try reading /etc/*-release + desc = check_release_file(run_lambda) + if desc is not None: + return '{} ({})'.format(desc, machine()) + + return '{} ({})'.format(platform, machine()) + + # Unknown platform + return platform + + +def get_python_platform(): + import platform + return platform.platform() + + +def get_libc_version(): + import platform + if get_platform() != 'linux': + return 'N/A' + return '-'.join(platform.libc_ver()) + + +def get_pip_packages(run_lambda, patterns=None): + """Return `pip list` output. 
Note: will also find conda-installed pytorch and numpy packages.""" + if patterns is None: + patterns = DEFAULT_PIP_PATTERNS + + # People generally have `pip` as `pip` or `pip3` + # But here it is invoked as `python -mpip` + def run_with_pip(pip): + out = run_and_read_all(run_lambda, pip + ["list", "--format=freeze"]) + return "\n".join( + line + for line in out.splitlines() + if any(name in line for name in patterns) + ) + + pip_version = 'pip3' if sys.version[0] == '3' else 'pip' + out = run_with_pip([sys.executable, '-mpip']) + + return pip_version, out + + +def get_cachingallocator_config(): + ca_config = os.environ.get('PYTORCH_CUDA_ALLOC_CONF', '') + return ca_config + + +def get_cuda_module_loading_config(): + if TORCH_AVAILABLE and torch.cuda.is_available(): + torch.cuda.init() + config = os.environ.get('CUDA_MODULE_LOADING', '') + return config + else: + return "N/A" + + +def is_xnnpack_available(): + if TORCH_AVAILABLE: + import torch.backends.xnnpack + return str(torch.backends.xnnpack.enabled) # type: ignore[attr-defined] + else: + return "N/A" + +def get_env_info(): + run_lambda = run + pip_version, pip_list_output = get_pip_packages(run_lambda) + + if TORCH_AVAILABLE: + version_str = torch.__version__ + debug_mode_str = str(torch.version.debug) + cuda_available_str = str(torch.cuda.is_available()) + cuda_version_str = torch.version.cuda + if not hasattr(torch.version, 'hip') or torch.version.hip is None: # cuda version + hip_compiled_version = hip_runtime_version = miopen_runtime_version = 'N/A' + else: # HIP version + def get_version_or_na(cfg, prefix): + _lst = [s.rsplit(None, 1)[-1] for s in cfg if prefix in s] + return _lst[0] if _lst else 'N/A' + + cfg = torch._C._show_config().split('\n') + hip_runtime_version = get_version_or_na(cfg, 'HIP Runtime') + miopen_runtime_version = get_version_or_na(cfg, 'MIOpen') + cuda_version_str = 'N/A' + hip_compiled_version = torch.version.hip + else: + version_str = debug_mode_str = cuda_available_str = cuda_version_str = 'N/A' + hip_compiled_version = hip_runtime_version = miopen_runtime_version = 'N/A' + + sys_version = sys.version.replace("\n", " ") + + conda_packages = get_conda_packages(run_lambda) + + rocm_version = get_rocm_version(run_lambda) + neuron_sdk_version = get_neuron_sdk_version(run_lambda) + vllm_version = get_vllm_version() + vllm_build_flags = summarize_vllm_build_flags() + gpu_topo = get_gpu_topo(run_lambda) + + return SystemEnv( + torch_version=version_str, + is_debug_build=debug_mode_str, + python_version='{} ({}-bit runtime)'.format(sys_version, sys.maxsize.bit_length() + 1), + python_platform=get_python_platform(), + is_cuda_available=cuda_available_str, + cuda_compiled_version=cuda_version_str, + cuda_runtime_version=get_running_cuda_version(run_lambda), + cuda_module_loading=get_cuda_module_loading_config(), + nvidia_gpu_models=get_gpu_info(run_lambda), + nvidia_driver_version=get_nvidia_driver_version(run_lambda), + cudnn_version=get_cudnn_version(run_lambda), + hip_compiled_version=hip_compiled_version, + hip_runtime_version=hip_runtime_version, + miopen_runtime_version=miopen_runtime_version, + pip_version=pip_version, + pip_packages=pip_list_output, + conda_packages=conda_packages, + os=get_os(run_lambda), + libc_version=get_libc_version(), + gcc_version=get_gcc_version(run_lambda), + clang_version=get_clang_version(run_lambda), + cmake_version=get_cmake_version(run_lambda), + caching_allocator_config=get_cachingallocator_config(), + is_xnnpack_available=is_xnnpack_available(), + 
cpu_info=get_cpu_info(run_lambda), + rocm_version=rocm_version, + neuron_sdk_version=neuron_sdk_version, + vllm_version=vllm_version, + vllm_build_flags=vllm_build_flags, + gpu_topo=gpu_topo, + ) + +env_info_fmt = """ +PyTorch version: {torch_version} +Is debug build: {is_debug_build} +CUDA used to build PyTorch: {cuda_compiled_version} +ROCM used to build PyTorch: {hip_compiled_version} + +OS: {os} +GCC version: {gcc_version} +Clang version: {clang_version} +CMake version: {cmake_version} +Libc version: {libc_version} + +Python version: {python_version} +Python platform: {python_platform} +Is CUDA available: {is_cuda_available} +CUDA runtime version: {cuda_runtime_version} +CUDA_MODULE_LOADING set to: {cuda_module_loading} +GPU models and configuration: {nvidia_gpu_models} +Nvidia driver version: {nvidia_driver_version} +cuDNN version: {cudnn_version} +HIP runtime version: {hip_runtime_version} +MIOpen runtime version: {miopen_runtime_version} +Is XNNPACK available: {is_xnnpack_available} + +CPU: +{cpu_info} + +Versions of relevant libraries: +{pip_packages} +{conda_packages} +""".strip() + +env_info_fmt += """ +ROCM Version: {rocm_version} +Neuron SDK Version: {neuron_sdk_version} +vLLM Version: {vllm_version} +vLLM Build Flags: +{vllm_build_flags} +GPU Topology: +{gpu_topo} +""".strip() + + +def pretty_str(envinfo): + def replace_nones(dct, replacement='Could not collect'): + for key in dct.keys(): + if dct[key] is not None: + continue + dct[key] = replacement + return dct + + def replace_bools(dct, true='Yes', false='No'): + for key in dct.keys(): + if dct[key] is True: + dct[key] = true + elif dct[key] is False: + dct[key] = false + return dct + + def prepend(text, tag='[prepend]'): + lines = text.split('\n') + updated_lines = [tag + line for line in lines] + return '\n'.join(updated_lines) + + def replace_if_empty(text, replacement='No relevant packages'): + if text is not None and len(text) == 0: + return replacement + return text + + def maybe_start_on_next_line(string): + # If `string` is multiline, prepend a \n to it. 
+ if string is not None and len(string.split('\n')) > 1: + return '\n{}\n'.format(string) + return string + + mutable_dict = envinfo._asdict() + + # If nvidia_gpu_models is multiline, start on the next line + mutable_dict['nvidia_gpu_models'] = \ + maybe_start_on_next_line(envinfo.nvidia_gpu_models) + + # If the machine doesn't have CUDA, report some fields as 'No CUDA' + dynamic_cuda_fields = [ + 'cuda_runtime_version', + 'nvidia_gpu_models', + 'nvidia_driver_version', + ] + all_cuda_fields = dynamic_cuda_fields + ['cudnn_version'] + all_dynamic_cuda_fields_missing = all( + mutable_dict[field] is None for field in dynamic_cuda_fields) + if TORCH_AVAILABLE and not torch.cuda.is_available() and all_dynamic_cuda_fields_missing: + for field in all_cuda_fields: + mutable_dict[field] = 'No CUDA' + if envinfo.cuda_compiled_version is None: + mutable_dict['cuda_compiled_version'] = 'None' + + # Replace True with Yes, False with No + mutable_dict = replace_bools(mutable_dict) + + # Replace all None objects with 'Could not collect' + mutable_dict = replace_nones(mutable_dict) + + # If either of these are '', replace with 'No relevant packages' + mutable_dict['pip_packages'] = replace_if_empty(mutable_dict['pip_packages']) + mutable_dict['conda_packages'] = replace_if_empty(mutable_dict['conda_packages']) + + # Tag conda and pip packages with a prefix + # If they were previously None, they'll show up as ie '[conda] Could not collect' + if mutable_dict['pip_packages']: + mutable_dict['pip_packages'] = prepend(mutable_dict['pip_packages'], + '[{}] '.format(envinfo.pip_version)) + if mutable_dict['conda_packages']: + mutable_dict['conda_packages'] = prepend(mutable_dict['conda_packages'], + '[conda] ') + mutable_dict['cpu_info'] = envinfo.cpu_info + return env_info_fmt.format(**mutable_dict) + + +def get_pretty_env_info(): + return pretty_str(get_env_info()) + + +def main(): + print("Collecting environment information...") + output = get_pretty_env_info() + print(output) + + if TORCH_AVAILABLE and hasattr(torch, 'utils') and hasattr(torch.utils, '_crash_handler'): + minidump_dir = torch.utils._crash_handler.DEFAULT_MINIDUMP_DIR + if sys.platform == "linux" and os.path.exists(minidump_dir): + dumps = [os.path.join(minidump_dir, dump) for dump in os.listdir(minidump_dir)] + latest = max(dumps, key=os.path.getctime) + ctime = os.path.getctime(latest) + creation_time = datetime.datetime.fromtimestamp(ctime).strftime('%Y-%m-%d %H:%M:%S') + msg = "\n*** Detected a minidump at {} created on {}, ".format(latest, creation_time) + \ + "if this is related to your bug please include it when you file a report ***" + print(msg, file=sys.stderr) + + + +if __name__ == '__main__': + main() diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 22b10f0571d1..24d972702c85 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -33,12 +33,25 @@ template __device__ __forceinline__ T gelu_kernel(const T& x) { // Equivalent to PyTorch GELU with 'none' approximation. 
// Refer to: - // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L38 + // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38 const float f = (float) x; constexpr float ALPHA = M_SQRT1_2; return (T) (f * 0.5f * (1.0f + ::erf(f * ALPHA))); } +template +__device__ __forceinline__ T gelu_tanh_kernel(const T& x) { + // Equivalent to PyTorch GELU with 'tanh' approximation. + // Refer to: + // https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30 + const float f = (float) x; + constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f; + constexpr float KAPPA = 0.044715; + float x_cube = f * f * f; + float inner = BETA * (f + KAPPA * x_cube); + return (T) (0.5f * f * (1.0f + ::tanhf(inner))); +} + } // namespace vllm // Launch activation and gating kernel. @@ -73,6 +86,13 @@ void gelu_and_mul( LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel); } +void gelu_tanh_and_mul( + torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., 2 * d] +{ + LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel); +} + namespace vllm { // Element-wise activation kernel template. diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index b5be3befa07e..5e61668d5cc1 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -15,9 +15,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#ifdef USE_ROCM -#include -#endif #include #include @@ -31,11 +28,6 @@ #include -#ifndef USE_ROCM -#define WARP_SIZE 32 -#else -#define WARP_SIZE warpSize -#endif #define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MIN(a, b) ((a) < (b) ? 
(a) : (b)) #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) diff --git a/csrc/cuda_compat.h b/csrc/cuda_compat.h index aa58dd73c148..c711d8d1b24b 100644 --- a/csrc/cuda_compat.h +++ b/csrc/cuda_compat.h @@ -1,5 +1,15 @@ #pragma once +#ifdef USE_ROCM +#include +#endif + +#ifndef USE_ROCM + #define WARP_SIZE 32 +#else + #define WARP_SIZE warpSize +#endif + #ifndef USE_ROCM #define VLLM_LDG(arg) __ldg(arg) #else diff --git a/csrc/moe_align_block_size_kernels.cu b/csrc/moe_align_block_size_kernels.cu index de6a0ec0a972..138615a4bfba 100644 --- a/csrc/moe_align_block_size_kernels.cu +++ b/csrc/moe_align_block_size_kernels.cu @@ -7,10 +7,17 @@ #include "cuda_compat.h" #include "dispatch_utils.h" -const static size_t NUM_MAX_EXPERTS = 64; #define CEILDIV(x,y) (((x) + (y) - 1) / (y)) namespace vllm { + +namespace { +__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row, int32_t col) { + // don't worry about overflow because num_experts is relatively small + return row * total_col + col; +} +} + template __global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids, int32_t *sorted_token_ids, @@ -21,10 +28,14 @@ __global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids, size_t numel) { const size_t tokens_per_thread = CEILDIV(numel, blockDim.x); const size_t start_idx = threadIdx.x * tokens_per_thread; - __shared__ int32_t tokens_cnts[NUM_MAX_EXPERTS + 1][NUM_MAX_EXPERTS]; - __shared__ int32_t cumsum[NUM_MAX_EXPERTS + 1]; + + extern __shared__ int32_t shared_mem[]; + + int32_t* tokens_cnts = shared_mem; // 2d tensor with shape (num_experts + 1, num_experts) + int32_t* cumsum = shared_mem + (num_experts + 1) * num_experts; // 1d tensor with shape (num_experts + 1) + for (int i = 0; i < num_experts; ++i) { - tokens_cnts[threadIdx.x + 1][i] = 0; + tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0; } /** @@ -33,15 +44,15 @@ __global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids, * to expert expert_index. */ for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { - ++tokens_cnts[threadIdx.x + 1][topk_ids[i]]; + ++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])]; } __syncthreads(); // For each expert we accumulate the token counts from the different threads. - tokens_cnts[0][threadIdx.x] = 0; + tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0; for (int i = 1; i <= blockDim.x; ++i) { - tokens_cnts[i][threadIdx.x] += tokens_cnts[i-1][threadIdx.x]; + tokens_cnts[index(num_experts, i, threadIdx.x)] += tokens_cnts[index(num_experts, i-1, threadIdx.x)]; } __syncthreads(); @@ -50,7 +61,7 @@ __global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids, if (threadIdx.x == 0) { cumsum[0] = 0; for (int i = 1; i <= num_experts; ++i) { - cumsum[i] = cumsum[i-1] + CEILDIV(tokens_cnts[blockDim.x][i - 1], block_size) * block_size; + cumsum[i] = cumsum[i-1] + CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)], block_size) * block_size; } *total_tokens_post_pad = cumsum[num_experts]; } @@ -78,9 +89,9 @@ __global__ void moe_align_block_size_kernel(scalar_t *__restrict__ topk_ids, * stores the indices of the tokens processed by the expert with expert_id within * the current thread's token shard. 
*/ - int32_t rank_post_pad = tokens_cnts[threadIdx.x][expert_id] + cumsum[expert_id]; + int32_t rank_post_pad = tokens_cnts[index(num_experts, threadIdx.x, expert_id)] + cumsum[expert_id]; sorted_token_ids[rank_post_pad] = i; - ++tokens_cnts[threadIdx.x][expert_id]; + ++tokens_cnts[index(num_experts, threadIdx.x, expert_id)]; } } } @@ -93,11 +104,16 @@ void moe_align_block_size( torch::Tensor experts_ids, torch::Tensor num_tokens_post_pad) { const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - assert(num_experts <= NUM_MAX_EXPERTS); VLLM_DISPATCH_INTEGRAL_TYPES( topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { - vllm::moe_align_block_size_kernel<<<1, num_experts, 0, stream>>>( - topk_ids.data_ptr(), + // calc needed amount of shared mem for `tokens_cnts` and `cumsum` tensors + const int32_t shared_mem = ((num_experts + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t); + + // set dynamic shared mem + auto kernel = vllm::moe_align_block_size_kernel; + AT_CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem)); + kernel<<<1, num_experts, shared_mem, stream>>>( + topk_ids.data_ptr(), sorted_token_ids.data_ptr(), experts_ids.data_ptr(), num_tokens_post_pad.data_ptr(), diff --git a/csrc/ops.h b/csrc/ops.h index 249c7451bf73..d5d6e240da7c 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -53,6 +53,16 @@ void rotary_embedding( torch::Tensor& cos_sin_cache, bool is_neox); +void batched_rotary_embedding( + torch::Tensor& positions, + torch::Tensor& query, + torch::Tensor& key, + int head_size, + torch::Tensor& cos_sin_cache, + bool is_neox, + int rot_dim, + torch::Tensor& cos_sin_cache_offsets); + void silu_and_mul( torch::Tensor& out, torch::Tensor& input); @@ -61,6 +71,10 @@ void gelu_and_mul( torch::Tensor& out, torch::Tensor& input); +void gelu_tanh_and_mul( + torch::Tensor& out, + torch::Tensor& input); + void gelu_new( torch::Tensor& out, torch::Tensor& input); diff --git a/csrc/pos_encoding_kernels.cu b/csrc/pos_encoding_kernels.cu index 5f522795619e..d80cb6973fad 100644 --- a/csrc/pos_encoding_kernels.cu +++ b/csrc/pos_encoding_kernels.cu @@ -8,7 +8,7 @@ namespace vllm { template -inline __device__ void apply_rotary_embedding( +inline __device__ void apply_token_rotary_embedding( scalar_t* __restrict__ arr, const scalar_t* __restrict__ cos_ptr, const scalar_t* __restrict__ sin_ptr, @@ -38,22 +38,18 @@ inline __device__ void apply_rotary_embedding( } template -__global__ void rotary_embedding_kernel( - const int64_t* __restrict__ positions, // [batch_size, seq_len] or [num_tokens] +inline __device__ void apply_rotary_embedding( scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size] scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size] - const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] - const int rot_dim, - const int64_t query_stride, - const int64_t key_stride, + const scalar_t* cache_ptr, + const int head_size, const int num_heads, const int num_kv_heads, - const int head_size) { - // Each thread block is responsible for one token. 
- const int token_idx = blockIdx.x; - int64_t pos = positions[token_idx]; - const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; - + const int rot_dim, + const int token_idx, + const int64_t query_stride, + const int64_t key_stride) +{ const int embed_dim = rot_dim / 2; const scalar_t* cos_ptr = cache_ptr; const scalar_t* sin_ptr = cache_ptr + embed_dim; @@ -63,7 +59,7 @@ __global__ void rotary_embedding_kernel( const int head_idx = i / embed_dim; const int64_t token_head = token_idx * query_stride + head_idx * head_size; const int rot_offset = i % embed_dim; - apply_rotary_embedding(query + token_head, cos_ptr, + apply_token_rotary_embedding(query + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim); } @@ -72,11 +68,53 @@ __global__ void rotary_embedding_kernel( const int head_idx = i / embed_dim; const int64_t token_head = token_idx * key_stride + head_idx * head_size; const int rot_offset = i % embed_dim; - apply_rotary_embedding(key + token_head, cos_ptr, + apply_token_rotary_embedding(key + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim); } } +template +__global__ void rotary_embedding_kernel( + const int64_t* __restrict__ positions, // [batch_size, seq_len] or [num_tokens] + scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size] + scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size] + const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] + const int rot_dim, + const int64_t query_stride, + const int64_t key_stride, + const int num_heads, + const int num_kv_heads, + const int head_size) { + // Each thread block is responsible for one token. + const int token_idx = blockIdx.x; + int64_t pos = positions[token_idx]; + const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; + + apply_rotary_embedding(query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, token_idx, query_stride, key_stride); +} + +template +__global__ void batched_rotary_embedding_kernel( + const int64_t* __restrict__ positions, // [batch_size, seq_len] or [num_tokens] + scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size] + scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size] + const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] + const int64_t* __restrict__ cos_sin_cache_offsets, // [batch_size, seq_len] or [num_tokens] + const int rot_dim, + const int64_t query_stride, + const int64_t key_stride, + const int num_heads, + const int num_kv_heads, + const int head_size) { + // Each thread block is responsible for one token. + const int token_idx = blockIdx.x; + int64_t pos = positions[token_idx]; + int64_t cos_sin_cache_offset = cos_sin_cache_offsets[token_idx]; + const scalar_t* cache_ptr = cos_sin_cache + (cos_sin_cache_offset + pos) * rot_dim; + + apply_rotary_embedding(query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, token_idx, query_stride, key_stride); +} + } // namespace vllm void rotary_embedding( @@ -128,3 +166,61 @@ void rotary_embedding( } }); } + +/* +Batched version of rotary embedding, pack multiple LoRAs together +and process in batched manner. 
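+
+For example, the rotary-embedding benchmark added above simulates serving four
+LoRAs with linear-scaling factors (1, 2, 4, 8): their cos/sin caches are packed
+into a single cos_sin_cache and a per-token cos_sin_cache_offsets tensor is
+passed in. The kernel then resolves each token's slice of the shared cache via
+  cache_ptr = cos_sin_cache + (cos_sin_cache_offset + pos) * rot_dim
+so every token is rotated with the cache that matches its LoRA's scaling factor.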
+*/ +void batched_rotary_embedding( + torch::Tensor& positions, // [batch_size, seq_len] or [num_tokens] + torch::Tensor& query, // [batch_size, seq_len, num_heads * head_size] or [num_tokens, num_heads * head_size] + torch::Tensor& key, // [batch_size, seq_len, num_kv_heads * head_size] or [num_tokens, num_kv_heads * head_size] + int head_size, + torch::Tensor& cos_sin_cache, // [max_position, rot_dim] + bool is_neox, + int rot_dim, + torch::Tensor& cos_sin_cache_offsets // [num_tokens] +) { + int64_t num_tokens = cos_sin_cache_offsets.size(0); + int num_heads = query.size(-1) / head_size; + int num_kv_heads = key.size(-1) / head_size; + int64_t query_stride = query.stride(-2); + int64_t key_stride = key.stride(-2); + + dim3 grid(num_tokens); + dim3 block(std::min(num_heads * rot_dim / 2, 512)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_TYPES( + query.scalar_type(), + "rotary_embedding", + [&] { + if (is_neox) { + vllm::batched_rotary_embedding_kernel<<>>( + positions.data_ptr(), + query.data_ptr(), + key.data_ptr(), + cos_sin_cache.data_ptr(), + cos_sin_cache_offsets.data_ptr(), + rot_dim, + query_stride, + key_stride, + num_heads, + num_kv_heads, + head_size); + } else { + vllm::batched_rotary_embedding_kernel<<>>( + positions.data_ptr(), + query.data_ptr(), + key.data_ptr(), + cos_sin_cache.data_ptr(), + cos_sin_cache_offsets.data_ptr(), + rot_dim, + query_stride, + key_stride, + num_heads, + num_kv_heads, + head_size); + } + }); +} diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h index 3eb84ceb4d53..a7415dfc9136 100644 --- a/csrc/punica/bgmv/bgmv_config.h +++ b/csrc/punica/bgmv/bgmv_config.h @@ -14,6 +14,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 128) \ f(in_T, out_T, W_T, narrow, 256) \ f(in_T, out_T, W_T, narrow, 512) \ + f(in_T, out_T, W_T, narrow, 768) \ f(in_T, out_T, W_T, narrow, 1024) \ f(in_T, out_T, W_T, narrow, 1280) \ f(in_T, out_T, W_T, narrow, 1728) \ @@ -42,6 +43,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 14336) \ f(in_T, out_T, W_T, narrow, 16384) \ f(in_T, out_T, W_T, narrow, 20480) \ + f(in_T, out_T, W_T, narrow, 22016) \ f(in_T, out_T, W_T, narrow, 24576) \ f(in_T, out_T, W_T, narrow, 28672) \ f(in_T, out_T, W_T, narrow, 32000) \ diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp index 4b6ade756639..a5c6439fd690 100644 --- a/csrc/pybind.cpp +++ b/csrc/pybind.cpp @@ -25,7 +25,11 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { ops.def( "gelu_and_mul", &gelu_and_mul, - "Activation function used in GeGLU."); + "Activation function used in GeGLU with `none` approximation."); + ops.def( + "gelu_tanh_and_mul", + &gelu_tanh_and_mul, + "Activation function used in GeGLU with `tanh` approximation."); ops.def( "gelu_new", &gelu_new, @@ -52,6 +56,11 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { &rotary_embedding, "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); + ops.def( + "batched_rotary_embedding", + &batched_rotary_embedding, + "Apply GPT-NeoX or GPT-J style rotary embedding to query and key (supports multiple loras)"); + // Quantization ops #ifndef USE_ROCM ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ"); diff --git a/csrc/reduction_utils.cuh b/csrc/reduction_utils.cuh index b95ccef16207..c25464e866e5 100644 --- a/csrc/reduction_utils.cuh +++ b/csrc/reduction_utils.cuh @@ -24,17 +24,27 @@ namespace 
 
 template<typename T>
 __inline__ __device__ T warpReduceSum(T val) {
 #pragma unroll
-  for (int mask = 16; mask > 0; mask >>= 1)
+  for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1)
     val += VLLM_SHFL_XOR_SYNC(val, mask);
   return val;
 }
 
+__inline__ __device__ constexpr int _calculateLaneMask(int warp_size) {
+  return warp_size - 1;
+}
+
+__inline__ __device__ constexpr int _calculateWidShift(int warp_size) {
+  return 5 + (warp_size >> 6);
+}
+
 /* Calculate the sum of all elements in a block */
 template<typename T>
 __inline__ __device__ T blockReduceSum(T val) {
-  static __shared__ T shared[32];
-  int lane = threadIdx.x & 0x1f;
-  int wid = threadIdx.x >> 5;
+  static __shared__ T shared[WARP_SIZE];
+  constexpr auto LANE_MASK = _calculateLaneMask(WARP_SIZE);
+  constexpr auto WID_SHIFT = _calculateWidShift(WARP_SIZE);
+  int lane = threadIdx.x & LANE_MASK;
+  int wid = threadIdx.x >> WID_SHIFT;
 
   val = warpReduceSum(val);
 
@@ -45,7 +55,7 @@ __inline__ __device__ T blockReduceSum(T val) {
 
   // Modify from blockDim.x << 5 to blockDim.x / 32. to prevent
   // blockDim.x is not divided by 32
-  val = (threadIdx.x < (blockDim.x / 32.f)) ? shared[lane] : (T)(0.0f);
+  val = (threadIdx.x < (blockDim.x / (WARP_SIZE * 1.0f))) ? shared[lane] : (T)(0.0f);
   val = warpReduceSum(val);
   return val;
 }
diff --git a/docs/source/assets/kernel/v_vec.png b/docs/source/assets/kernel/v_vec.png
index bac3c10949f6..75d344ab933f 100644
Binary files a/docs/source/assets/kernel/v_vec.png and b/docs/source/assets/kernel/v_vec.png differ
diff --git a/docs/source/assets/kernel/value.png b/docs/source/assets/kernel/value.png
index f585c77b2e14..56b0b9e0f56d 100644
Binary files a/docs/source/assets/kernel/value.png and b/docs/source/assets/kernel/value.png differ
diff --git a/docs/source/dev/engine/llm_engine.rst b/docs/source/dev/engine/llm_engine.rst
index b550a9b5faa6..1de6d7adc87c 100644
--- a/docs/source/dev/engine/llm_engine.rst
+++ b/docs/source/dev/engine/llm_engine.rst
@@ -2,5 +2,5 @@ LLMEngine
 =================================
 
 .. autoclass:: vllm.engine.llm_engine.LLMEngine
-    :members: add_request, abort_request, step, _init_cache
+    :members: add_request, abort_request, step
     :show-inheritance:
\ No newline at end of file
diff --git a/docs/source/dev/kernel/paged_attention.rst b/docs/source/dev/kernel/paged_attention.rst
index 6fcadeeec27b..ba4f7a271815 100644
--- a/docs/source/dev/kernel/paged_attention.rst
+++ b/docs/source/dev/kernel/paged_attention.rst
@@ -447,7 +447,7 @@ Value
   a whole block of value tokens. And each ``accs`` in each thread
   contains 8 elements that accumulated at 8 different head positions.
   For the thread 0, the ``accs`` variable will have 8 elements, which
-  are 0th, 16th … 112th elements of a value head that are accumulated
+  are 0th, 32nd … 224th elements of a value head that are accumulated
   from all assigned 8 tokens.
 
 LV
diff --git a/docs/source/index.rst b/docs/source/index.rst
index c0250bf99f7a..65bfbbabf8be 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -73,6 +73,7 @@ Documentation
    serving/run_on_sky
    serving/deploying_with_kserve
    serving/deploying_with_triton
+   serving/deploying_with_bentoml
    serving/deploying_with_docker
    serving/serving_with_langchain
    serving/metrics
diff --git a/docs/source/models/lora.rst b/docs/source/models/lora.rst
index 21b18c75fc55..f05fafe9f827 100644
--- a/docs/source/models/lora.rst
+++ b/docs/source/models/lora.rst
@@ -92,7 +92,8 @@ LoRA adapter requests if they were provided and ``max_loras`` is set high enough
 
 The following is an example request
 
-.. code-block::bash
+..
code-block:: bash + curl http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ -d '{ diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 9d4ec663a16e..4019e0bbd90f 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -8,84 +8,109 @@ The following is the list of model architectures that are currently supported by Alongside each architecture, we include some popular models that use it. .. list-table:: - :widths: 25 25 50 + :widths: 25 25 50 5 :header-rows: 1 * - Architecture - Models - Example HuggingFace Models + - :ref:`LoRA ` * - :code:`AquilaForCausalLM` - Aquila - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc. + - ✅︎ * - :code:`BaiChuanForCausalLM` - Baichuan - :code:`baichuan-inc/Baichuan2-13B-Chat`, :code:`baichuan-inc/Baichuan-7B`, etc. + - * - :code:`ChatGLMModel` - ChatGLM - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc. + - * - :code:`DeciLMForCausalLM` - DeciLM - :code:`Deci/DeciLM-7B`, :code:`Deci/DeciLM-7B-instruct`, etc. + - * - :code:`BloomForCausalLM` - BLOOM, BLOOMZ, BLOOMChat - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc. + - * - :code:`FalconForCausalLM` - Falcon - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc. + - * - :code:`GemmaForCausalLM` - Gemma - :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc. + - ✅︎ * - :code:`GPT2LMHeadModel` - GPT-2 - :code:`gpt2`, :code:`gpt2-xl`, etc. + - * - :code:`GPTBigCodeForCausalLM` - StarCoder, SantaCoder, WizardCoder - :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc. + - * - :code:`GPTJForCausalLM` - GPT-J - :code:`EleutherAI/gpt-j-6b`, :code:`nomic-ai/gpt4all-j`, etc. + - * - :code:`GPTNeoXForCausalLM` - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM - :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc. + - * - :code:`InternLMForCausalLM` - InternLM - :code:`internlm/internlm-7b`, :code:`internlm/internlm-chat-7b`, etc. + - ✅︎ * - :code:`InternLM2ForCausalLM` - InternLM2 - :code:`internlm/internlm2-7b`, :code:`internlm/internlm2-chat-7b`, etc. + - * - :code:`LlamaForCausalLM` - LLaMA, LLaMA-2, Vicuna, Alpaca, Yi - :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc. + - ✅︎ * - :code:`MistralForCausalLM` - Mistral, Mistral-Instruct - :code:`mistralai/Mistral-7B-v0.1`, :code:`mistralai/Mistral-7B-Instruct-v0.1`, etc. + - ✅︎ * - :code:`MixtralForCausalLM` - Mixtral-8x7B, Mixtral-8x7B-Instruct - :code:`mistralai/Mixtral-8x7B-v0.1`, :code:`mistralai/Mixtral-8x7B-Instruct-v0.1`, etc. + - ✅︎ * - :code:`MPTForCausalLM` - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc. + - * - :code:`OLMoForCausalLM` - OLMo - :code:`allenai/OLMo-1B`, :code:`allenai/OLMo-7B`, etc. + - * - :code:`OPTForCausalLM` - OPT, OPT-IML - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc. + - * - :code:`OrionForCausalLM` - Orion - :code:`OrionStarAI/Orion-14B-Base`, :code:`OrionStarAI/Orion-14B-Chat`, etc. + - * - :code:`PhiForCausalLM` - Phi - :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc. 
+    -
   * - :code:`QWenLMHeadModel`
     - Qwen
     - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc.
+    -
   * - :code:`Qwen2ForCausalLM`
     - Qwen2
     - :code:`Qwen/Qwen2-beta-7B`, :code:`Qwen/Qwen2-beta-7B-Chat`, etc.
+    - ✅︎
   * - :code:`StableLmForCausalLM`
     - StableLM
     - :code:`stabilityai/stablelm-3b-4e1t/` , :code:`stabilityai/stablelm-base-alpha-7b-v2`, etc.
+    -
 
 If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
 Otherwise, please refer to :ref:`Adding a New Model ` for instructions on how to implement support for your model.
diff --git a/docs/source/serving/deploying_with_bentoml.rst b/docs/source/serving/deploying_with_bentoml.rst
new file mode 100644
index 000000000000..4b9d19f5bdb7
--- /dev/null
+++ b/docs/source/serving/deploying_with_bentoml.rst
@@ -0,0 +1,8 @@
+.. _deploying_with_bentoml:
+
+Deploying with BentoML
+======================
+
+`BentoML `_ allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes.
+
+For details, see the tutorial `vLLM inference in the BentoML documentation `_.
\ No newline at end of file
diff --git a/examples/offline_inference_neuron.py b/examples/offline_inference_neuron.py
old mode 100644
new mode 100755
index 9b9dc4d94892..da8874abd92a
--- a/examples/offline_inference_neuron.py
+++ b/examples/offline_inference_neuron.py
@@ -14,14 +14,16 @@
 llm = LLM(
     model="openlm-research/open_llama_3b",
     max_num_seqs=8,
-    # The max_model_len and block_size arguments are required to be same as max sequence length,
-    # when targeting neuron device. Currently, this is a known limitation in continuous batching
-    # support in transformers-neuronx.
+    # The max_model_len and block_size arguments are required to be same as
+    # max sequence length when targeting neuron device.
+    # Currently, this is a known limitation in continuous batching support
+    # in transformers-neuronx.
     # TODO(liangfu): Support paged-attention in transformers-neuronx.
     max_model_len=128,
     block_size=128,
     # The device can be automatically detected when AWS Neuron SDK is installed.
-    # The device argument can be either unspecified for automated detection, or explicitly assigned.
+    # The device argument can be either unspecified for automated detection,
+    # or explicitly assigned.
     device="neuron")
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
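Taken together, the LoRA column added to the supported-models table above and the LoRA tests added later in this diff (tests/lora/test_llama.py, tests/lora/test_layer_variation.py) suggest the following minimal offline-serving sketch. It is illustrative only, not part of the patch; the base model name and adapter path are placeholders.

```python
# Illustrative sketch (not part of this patch) of serving a LoRA adapter with
# the offline API; mirrors the calls made in tests/lora/test_llama.py and
# tests/lora/test_layer_variation.py. Model name and adapter path are
# placeholders.
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# Any architecture with a check mark in the new LoRA column works here.
llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True)

sampling_params = SamplingParams(temperature=0, max_tokens=256)
outputs = llm.generate(
    ["Write a SQL query to answer the question based on the table schema."],
    sampling_params,
    # LoRARequest(lora_name, lora_int_id, lora_local_path)
    lora_request=LoRARequest("sql_adapter", 1, "/path/to/sql_lora_adapter"),
)
print(outputs[0].outputs[0].text)
```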
diff --git a/examples/production_monitoring/grafana.json b/examples/production_monitoring/grafana.json index f48b6314eb05..071f134c6e5e 100644 --- a/examples/production_monitoring/grafana.json +++ b/examples/production_monitoring/grafana.json @@ -1,35 +1,4 @@ { - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__elements": {}, - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "10.2.3" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "timeseries", - "name": "Time series", - "version": "" - } - ], "annotations": { "list": [ { @@ -42,6 +11,12 @@ "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, "type": "dashboard" } ] @@ -50,14 +25,14 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": null, + "id": 29, "links": [], "liveNow": false, "panels": [ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "description": "End to end request latency measured in seconds.", "fieldConfig": { @@ -66,7 +41,6 @@ "mode": "palette-classic" }, "custom": { - "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -80,7 +54,6 @@ "tooltip": false, "viz": false }, - "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -138,11 +111,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "includeNullMetadata": false, "instant": false, @@ -154,11 +127,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -171,11 +144,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -188,11 +161,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", 
"fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -205,10 +178,10 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(vllm:e2e_request_latency_seconds_sum[$__rate_interval])\n/\nrate(vllm:e2e_request_latency_seconds_count[$__rate_interval])", + "expr": "rate(vllm:e2e_request_latency_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:e2e_request_latency_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", "hide": false, "instant": false, "legendFormat": "Average", @@ -222,7 +195,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "description": "Number of tokens processed per second", "fieldConfig": { @@ -231,7 +204,6 @@ "mode": "palette-classic" }, "custom": { - "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -245,7 +217,6 @@ "tooltip": false, "viz": false }, - "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -302,11 +273,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "rate(vllm:prompt_tokens_total[$__rate_interval])", + "expr": "rate(vllm:prompt_tokens_total{model_name=\"$model_name\"}[$__rate_interval])", "fullMetaSearch": false, "includeNullMetadata": false, "instant": false, @@ -318,11 +289,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "rate(vllm:generation_tokens_total[$__rate_interval])", + "expr": "rate(vllm:generation_tokens_total{model_name=\"$model_name\"}[$__rate_interval])", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -339,7 +310,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "description": "Inter token latency in seconds.", "fieldConfig": { @@ -348,7 +319,6 @@ "mode": "palette-classic" }, "custom": { - "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -362,7 +332,6 @@ "tooltip": false, "viz": false }, - "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -420,11 +389,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "includeNullMetadata": false, "instant": false, @@ -436,11 +405,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -453,11 +422,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": 
"histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -470,11 +439,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -487,10 +456,10 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(vllm:time_per_output_token_seconds_sum[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count[$__rate_interval])", + "expr": "rate(vllm:time_per_output_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", "hide": false, "instant": false, "legendFormat": "Mean", @@ -504,7 +473,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "description": "Number of requests in RUNNING, WAITING, and SWAPPED state", "fieldConfig": { @@ -513,7 +482,6 @@ "mode": "palette-classic" }, "custom": { - "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -527,7 +495,6 @@ "tooltip": false, "viz": false }, - "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -585,11 +552,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "vllm:num_requests_running", + "expr": "vllm:num_requests_running{model_name=\"$model_name\"}", "fullMetaSearch": false, "includeNullMetadata": true, "instant": false, @@ -601,11 +568,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "vllm:num_requests_swapped", + "expr": "vllm:num_requests_swapped{model_name=\"$model_name\"}", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, @@ -618,11 +585,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "vllm:num_requests_waiting", + "expr": "vllm:num_requests_waiting{model_name=\"$model_name\"}", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, @@ -639,7 +606,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "description": "P50, P90, P95, and P99 TTFT latency in seconds.", "fieldConfig": { @@ -648,7 +615,6 @@ "mode": "palette-classic" }, "custom": { - "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -662,7 +628,6 @@ "tooltip": false, "viz": false }, - "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -720,11 +685,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - 
"expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -737,11 +702,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "includeNullMetadata": false, "instant": false, @@ -753,11 +718,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -770,11 +735,11 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "disableTextWrap": false, "editorMode": "builder", - "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))", + "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))", "fullMetaSearch": false, "hide": false, "includeNullMetadata": false, @@ -787,10 +752,10 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "editorMode": "code", - "expr": "rate(vllm:time_to_first_token_seconds_sum[$__rate_interval])\n/\nrate(vllm:time_to_first_token_seconds_count[$__rate_interval])", + "expr": "rate(vllm:time_to_first_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_to_first_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])", "hide": false, "instant": false, "legendFormat": "Average", @@ -804,7 +769,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "description": "Percentage of used cache blocks by vLLM.", "fieldConfig": { @@ -813,7 +778,6 @@ "mode": "palette-classic" }, "custom": { - "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", @@ -827,7 +791,6 @@ "tooltip": false, "viz": false }, - "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, @@ -885,10 +848,10 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "editorMode": "code", - "expr": "vllm:gpu_cache_usage_perc", + "expr": "vllm:gpu_cache_usage_perc{model_name=\"$model_name\"}", "instant": false, "legendFormat": "GPU Cache Usage", "range": true, @@ -897,10 +860,10 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + "uid": "prometheus" }, "editorMode": "code", - "expr": "vllm:cpu_cache_usage_perc", + "expr": "vllm:cpu_cache_usage_perc{model_name=\"$model_name\"}", "hide": false, "instant": false, "legendFormat": "CPU Cache Usage", @@ -913,10 +876,39 @@ } ], "refresh": "", - "schemaVersion": 39, + "schemaVersion": 37, + 
"style": "dark", "tags": [], "templating": { - "list": [] + "list": [ + { + "current": { + "selected": false, + "text": "vllm", + "value": "vllm" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(model_name)", + "hide": 0, + "includeAll": false, + "label": "model_name", + "multi": false, + "name": "model_name", + "options": [], + "query": { + "query": "label_values(model_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] }, "time": { "from": "now-5m", diff --git a/examples/template_chatglm.jinja b/examples/template_chatglm.jinja new file mode 100644 index 000000000000..bf26f27274ef --- /dev/null +++ b/examples/template_chatglm.jinja @@ -0,0 +1,18 @@ +{%- set counter = namespace(index=0) -%} +{%- for message in messages -%} + {%- if message['role'] == 'user' -%} + {{- '[Round ' + counter.index|string + ']\n问:' + message['content'] -}} + {%- set counter.index = counter.index + 1 -%} + {%- endif -%} + {%- if message['role'] == 'assistant' -%} + {{- '\n答:' + message['content'] -}} + {%- if (loop.last and add_generation_prompt) or not loop.last -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + + +{%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} + {{- '\n答:' -}} +{%- endif -%} \ No newline at end of file diff --git a/examples/template_chatglm2.jinja b/examples/template_chatglm2.jinja new file mode 100644 index 000000000000..c155b7c23f64 --- /dev/null +++ b/examples/template_chatglm2.jinja @@ -0,0 +1,18 @@ +{%- set counter = namespace(index=1) -%} +{%- for message in messages -%} + {%- if message['role'] == 'user' -%} + {{- '[Round ' + counter.index|string + ']\n\n问:' + message['content'] -}} + {%- set counter.index = counter.index + 1 -%} + {%- endif -%} + {%- if message['role'] == 'assistant' -%} + {{- '\n\n答:' + message['content'] -}} + {%- if (loop.last and add_generation_prompt) or not loop.last -%} + {{- '\n\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + + +{%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} + {{- '\n\n答:' -}} +{%- endif -%} \ No newline at end of file diff --git a/examples/template_falcon.jinja b/examples/template_falcon.jinja new file mode 100644 index 000000000000..01cf0e2670d0 --- /dev/null +++ b/examples/template_falcon.jinja @@ -0,0 +1,15 @@ +{%- for message in messages -%} + {%- if message['role'] == 'user' -%} + {{- 'User: ' + message['content'] -}} + {%- elif message['role'] == 'assistant' -%} + {{- 'Assistant: ' + message['content'] -}} + {%- endif -%} + {%- if (loop.last and add_generation_prompt) or not loop.last -%} + {{- '\n' -}} + {%- endif -%} +{%- endfor -%} + + +{%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} + {{- 'Assistant:' -}} +{% endif %} \ No newline at end of file diff --git a/examples/template_falcon_180b.jinja b/examples/template_falcon_180b.jinja new file mode 100644 index 000000000000..f08f7395b7fd --- /dev/null +++ b/examples/template_falcon_180b.jinja @@ -0,0 +1,17 @@ +{%- for message in messages -%} + {%- if message['role'] == 'system' -%} + {{- 'System: ' + message['content'] -}} + {%- elif message['role'] == 'user' -%} + {{- 'User: ' + message['content'] -}} + {%- elif message['role'] == 'assistant' -%} + {{- 'Falcon: ' + message['content'] -}} + {%- endif -%} + {%- if (loop.last and add_generation_prompt) or not loop.last -%} + {{- '\n' -}} + {%- endif -%} +{%- endfor -%} + + +{%- if 
add_generation_prompt and messages[-1]['role'] != 'assistant' -%} + {{- 'Falcon:' -}} +{% endif %} \ No newline at end of file diff --git a/format.sh b/format.sh index eb2c5ab03162..ff30111123be 100755 --- a/format.sh +++ b/format.sh @@ -95,13 +95,17 @@ echo 'vLLM yapf: Done' # echo 'vLLM mypy:' # mypy +CODESPELL_EXCLUDES=( + '--skip' '*docs/source/_build/**' +) + # check spelling of specified files spell_check() { codespell "$@" } spell_check_all(){ - codespell --toml pyproject.toml + codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" } # Spelling check of files that differ from main branch. @@ -116,7 +120,7 @@ spell_check_changed() { if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ - codespell + codespell "${CODESPELL_EXCLUDES[@]}" fi } diff --git a/pyproject.toml b/pyproject.toml index c5db016cebdb..d6fa5d7a035f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,10 @@ requires = [ ] build-backend = "setuptools.build_meta" +[tool.ruff] +# Allow lines to be as long as 80. +line-length = 80 + [tool.ruff.lint] select = [ # pycodestyle @@ -29,8 +33,6 @@ ignore = [ "F405", "F403", # lambda expression assignment "E731", - # line too long, handled by black formatting - "E501", # .strip() with multi-character strings "B005", # Loop control variable not used within loop body diff --git a/requirements-dev.txt b/requirements-dev.txt index dfcbfa4253f1..5502c97d014a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -21,6 +21,7 @@ einops # required for MPT openai requests ray +peft # Benchmarking aiohttp diff --git a/requirements.txt b/requirements.txt index 05ec2e804e13..d6c33ad85da5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,5 +12,5 @@ pydantic >= 2.0 # Required for OpenAI server. prometheus_client >= 0.18.0 pynvml == 11.5.0 triton >= 2.1.0 -outlines >= 0.0.27 +outlines == 0.0.34 cupy-cuda12x == 12.1.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead. diff --git a/setup.py b/setup.py index 745b5a9b2d02..4e2bb2ce851f 100644 --- a/setup.py +++ b/setup.py @@ -9,12 +9,17 @@ from packaging.version import parse, Version import setuptools +import sys import torch import torch.utils.cpp_extension as torch_cpp_ext from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME ROOT_DIR = os.path.dirname(__file__) +# vLLM only supports Linux platform +assert sys.platform.startswith( + "linux"), "vLLM only supports Linux platform (including WSL)." + # If you are developing the C++ backend of vLLM, consider building vLLM with # `python setup.py develop` since it will give you incremental builds. 
# The downside is that this method is deprecated, see @@ -142,8 +147,8 @@ def get_pytorch_rocm_arch() -> Set[str]: # If we don't have PYTORCH_ROCM_ARCH specified pull the list from rocm_agent_enumerator if env_arch_list is None: command = "rocm_agent_enumerator" - env_arch_list = subprocess.check_output([command]).decode('utf-8')\ - .strip().replace("\n", ";") + env_arch_list = (subprocess.check_output( + [command]).decode('utf-8').strip().replace("\n", ";")) arch_source_str = "rocm_agent_enumerator" else: arch_source_str = "PYTORCH_ROCM_ARCH env variable" @@ -402,11 +407,13 @@ def get_vllm_version() -> str: if neuron_version != MAIN_CUDA_VERSION: neuron_version_str = neuron_version.replace(".", "")[:3] version += f"+neuron{neuron_version_str}" - else: + elif _is_cuda(): cuda_version = str(nvcc_cuda_version) if cuda_version != MAIN_CUDA_VERSION: cuda_version_str = cuda_version.replace(".", "")[:3] version += f"+cu{cuda_version_str}" + else: + raise RuntimeError("Unknown runtime environment.") return version @@ -431,6 +438,12 @@ def get_requirements() -> List[str]: else: with open(get_path("requirements.txt")) as f: requirements = f.read().strip().split("\n") + if nvcc_cuda_version <= Version("11.8"): + # replace cupy-cuda12x with cupy-cuda11x for cuda 11.x + for i in range(len(requirements)): + if requirements[i].startswith("cupy-cuda12x"): + requirements[i] = "cupy-cuda11x" + break return requirements diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index ed9017c1e3e9..248bfbc8ab5c 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -25,23 +25,21 @@ def _query_server_long(prompt: str) -> dict: @pytest.fixture -def api_server(): +def api_server(tokenizer_pool_size: int): script_path = Path(__file__).parent.joinpath( "api_server_async_engine.py").absolute() uvicorn_process = subprocess.Popen([ - sys.executable, - "-u", - str(script_path), - "--model", - "facebook/opt-125m", - "--host", - "127.0.0.1", + sys.executable, "-u", + str(script_path), "--model", "facebook/opt-125m", "--host", + "127.0.0.1", "--tokenizer-pool-size", + str(tokenizer_pool_size) ]) yield uvicorn_process.terminate() -def test_api_server(api_server): +@pytest.mark.parametrize("tokenizer_pool_size", [0, 2]) +def test_api_server(api_server, tokenizer_pool_size: int): """ Run the API server and test it. 
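Looking back at the csrc/reduction_utils.cuh hunk earlier in this diff: the block reduction now derives its lane mask and warp-id shift from WARP_SIZE instead of hard-coding 32, so the same code covers 32-lane CUDA warps and 64-lane ROCm wavefronts. The plain-Python check below is illustrative only (not part of the patch) and verifies that arithmetic.

```python
# Illustrative check (not part of the patch) of the helpers added in
# csrc/reduction_utils.cuh: masking a thread id with (warp_size - 1) gives its
# lane, and shifting by 5 + (warp_size >> 6) gives its warp id, since the
# shift evaluates to 5 (divide by 32) when warp_size == 32 and to 6
# (divide by 64) when warp_size == 64.
def lane_and_wid(tid: int, warp_size: int):
    lane_mask = warp_size - 1          # mirrors _calculateLaneMask
    wid_shift = 5 + (warp_size >> 6)   # mirrors _calculateWidShift
    return tid & lane_mask, tid >> wid_shift


for warp_size in (32, 64):
    for tid in range(4 * warp_size):
        assert lane_and_wid(tid, warp_size) == (tid % warp_size, tid // warp_size)
```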
diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index 1e31ff737303..cb125a7bfec3 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -89,3 +89,6 @@ async def test_new_requests_event(): await asyncio.sleep(0.01) assert engine.engine.add_request_calls == 3 assert engine.engine.step_calls == old_step_calls + 1 + + engine = MockAsyncLLMEngine(worker_use_ray=True, engine_use_ray=True) + assert engine.get_tokenizer() is not None diff --git a/tests/async_engine/test_chat_template.py b/tests/async_engine/test_chat_template.py index 32d110e0f0b4..e98bba8d43b4 100644 --- a/tests/async_engine/test_chat_template.py +++ b/tests/async_engine/test_chat_template.py @@ -73,7 +73,7 @@ def test_load_chat_template(): assert template_content is not None # Hard coded value for template_chatml.jinja assert template_content == """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %} -{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}""" +{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}""" # noqa: E501 def test_no_load_chat_template(): @@ -117,4 +117,6 @@ async def test_get_gen_prompt(model, template, add_generation_prompt, add_generation_prompt=mock_request.add_generation_prompt) # Test assertion - assert result == expected_output, f"The generated prompt does not match the expected output for model {model} and template {template}" + assert result == expected_output, ( + f"The generated prompt does not match the expected output for " + f"model {model} and template {template}") diff --git a/tests/conftest.py b/tests/conftest.py index 6eb8159837d5..c06b271e6c7f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,6 +7,7 @@ from vllm import LLM, SamplingParams from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.config import TokenizerPoolConfig _TEST_DIR = os.path.dirname(__file__) _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")] @@ -258,3 +259,13 @@ def generate_beam_search( @pytest.fixture def vllm_runner(): return VllmRunner + + +def get_tokenizer_pool_config(tokenizer_group_type): + if tokenizer_group_type is None: + return None + if tokenizer_group_type == "ray": + return TokenizerPoolConfig(pool_size=1, + pool_type="ray", + extra_config={}) + raise ValueError(f"Unknown tokenizer_group_type: {tokenizer_group_type}") diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py index 04d01f7724e4..44ac05a1430b 100644 --- a/tests/core/test_block_manager.py +++ b/tests/core/test_block_manager.py @@ -4,7 +4,8 @@ from vllm import SamplingParams from vllm.block import PhysicalTokenBlock -from vllm.core.block_manager import BlockAllocator, BlockSpaceManager, AllocStatus +from vllm.core.block_manager import (BlockAllocator, BlockSpaceManager, + AllocStatus) from vllm.utils import Device from vllm.sequence import Sequence, SequenceGroup, SequenceStatus, Logprob @@ -273,3 +274,90 @@ def test_reset(): # Resetting block manager frees all allocated blocks. 
block_manager.reset() assert block_manager.get_num_free_gpu_blocks() == original_blocks + + +def test_sliding_window_multi_seq(): + """ + Tests that memory allocation and deallocation is handled + correctly with multiple sequences that exceed the sliding + window's capacity. + """ + block_size = 1 + num_cpu_blocks = 8 + num_gpu_blocks = 8 + sliding_window = 2 + block_manager = BlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + sliding_window=sliding_window, + watermark=0) + + assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks + + parent = Sequence(1, "one two three", [0, 1, 2], block_size) + seq_group = SequenceGroup("1", [parent], SamplingParams(), time.time(), + None) + block_manager.allocate(seq_group) + + # assert the number of blocks allocated is correct + # the parent seq has len 3, but since sliding_window is 2, + # we will use at most 2 blocks + assert block_manager.get_num_free_gpu_blocks( + ) == num_gpu_blocks - sliding_window + + # Fork prompt and copy block tables. + child = parent.fork(2) + block_manager.fork(parent, child) + + # assert the number of blocks allocated is correct + # forking does not increase memory consumption + assert block_manager.get_num_free_gpu_blocks( + ) == num_gpu_blocks - sliding_window + + # assert both parent and child share all blocks + assert block_manager.get_block_table( + parent) == block_manager.get_block_table(child) + + token_id = 4 + # Append token to child. Block is shared so copy on write occurs. + child.append_token_id(token_id, {token_id: Logprob(0.0)}) + block_manager.append_slot(child) + + # assert the number of blocks allocated is correct + # we will use now one block more. Each seq will use 2 blocks, + # but only one can be shared + assert block_manager.get_num_free_gpu_blocks( + ) == num_gpu_blocks - sliding_window - 1 + + token_id = 5 + parent.append_token_id(token_id, {token_id: Logprob(0.0)}) + block_manager.append_slot(parent) + + # assert the number of blocks allocated is correct + # no change, because both sequences are still just sharing one block + assert block_manager.get_num_free_gpu_blocks( + ) == num_gpu_blocks - sliding_window - 1 + + block_table_parent = block_manager.get_block_table(parent) + block_table_child = block_manager.get_block_table(child) + + assert block_table_parent != block_table_child + + # assert both blocks are sharing the second-last block + assert block_table_parent[-2] == block_table_child[-2] + + # now let's clean up... + block_manager.free(parent) + + # assert the number of blocks allocated is correct + # We have freed one seq, reducing the ref count of two blocks by one. + # One of the two was only used by the parent seq, so this is now free. 
+ # The child seq still consumes sliding_window blocks + assert block_manager.get_num_free_gpu_blocks( + ) == num_gpu_blocks - sliding_window + + # free all blocks + block_manager.free(child) + + # assert all blocks are free now + assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks diff --git a/tests/entrypoints/test_guided_processors.py b/tests/entrypoints/test_guided_processors.py index 5b39269916f8..4a0e3e759e25 100644 --- a/tests/entrypoints/test_guided_processors.py +++ b/tests/entrypoints/test_guided_processors.py @@ -46,8 +46,8 @@ "required": ["name", "age", "skills", "work history"] } -TEST_REGEX = r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + \ - r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)" +TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") def test_guided_logits_processors(): diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index f4a6e44d88a8..a5b2bf4c0f0c 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -5,9 +5,12 @@ import sys import pytest import requests -import ray # using Ray for overall ease of process management, parallel requests, and debugging. +# using Ray for overall ease of process management, parallel requests, +# and debugging. +import ray import openai # use the official client for correctness check -from huggingface_hub import snapshot_download # downloading lora to test lora requests +# downloading lora to test lora requests +from huggingface_hub import snapshot_download # imports for guided decoding tests import json @@ -17,8 +20,11 @@ from vllm.transformers_utils.tokenizer import get_tokenizer MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" # any model with a chat template should work here -LORA_NAME = "typeof/zephyr-7b-beta-lora" # technically this needs Mistral-7B-v0.1 as base, but we're not testing generation quality here +# any model with a chat template should work here +MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +# technically this needs Mistral-7B-v0.1 as base, but we're not testing +# generation quality here +LORA_NAME = "typeof/zephyr-7b-beta-lora" TEST_SCHEMA = { "type": "object", @@ -59,8 +65,8 @@ "required": ["name", "age", "skills", "work history"] } -TEST_REGEX = r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + \ - r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)" +TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)") TEST_CHOICE = [ "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby", @@ -120,8 +126,9 @@ def server(zephyr_lora_files): server_runner = ServerRunner.remote([ "--model", MODEL_NAME, + # use half precision for speed and memory savings in CI environment "--dtype", - "bfloat16", # use half precision for speed and memory savings in CI environment + "bfloat16", "--max-model-len", "8192", "--enforce-eager", @@ -392,7 +399,8 @@ async def test_batch_completions(server, client: openai.AsyncOpenAI, max_tokens=5, temperature=0.0, extra_body=dict( - # NOTE: this has to be true for n > 1 in vLLM, but not necessary for official client. + # NOTE: this has to be true for n > 1 in vLLM, but not necessary + # for official client. 
use_beam_search=True), ) assert len(batch.choices) == 4 @@ -469,8 +477,8 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI): async def test_guided_json_completion(server, client: openai.AsyncOpenAI): completion = await client.completions.create( model=MODEL_NAME, - prompt= - f"Give an example JSON for an employee profile that fits this schema: {TEST_SCHEMA}", + prompt=f"Give an example JSON for an employee profile " + f"that fits this schema: {TEST_SCHEMA}", n=3, temperature=1.0, max_tokens=500, @@ -489,9 +497,11 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI): "role": "system", "content": "you are a helpful assistant" }, { - "role": "user", - "content": "Give an example JSON for an employee profile that " + \ - f"fits this schema: {TEST_SCHEMA}" + "role": + "user", + "content": + f"Give an example JSON for an employee profile that " + f"fits this schema: {TEST_SCHEMA}" }] chat_completion = await client.chat.completions.create( model=MODEL_NAME, diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index e0dec144eba1..f78913f120aa 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -16,7 +16,7 @@ ] -@pytest.mark.parametrize("activation", [SiluAndMul, GeluAndMul]) +@pytest.mark.parametrize("activation", ["silu", "gelu", "gelu_tanh"]) @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @@ -24,7 +24,7 @@ @pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_act_and_mul( - activation: Type[torch.nn.Module], + activation: str, num_tokens: int, d: int, dtype: torch.dtype, @@ -36,7 +36,12 @@ def test_act_and_mul( torch.cuda.manual_seed(seed) torch.set_default_device(device) x = torch.randn(num_tokens, 2 * d, dtype=dtype) - layer = activation() + if activation == "silu": + layer = SiluAndMul() + elif activation == "gelu": + layer = GeluAndMul(approximate="none") + elif activation == "gelu_tanh": + layer = GeluAndMul(approximate="tanh") out = layer(x) ref_out = layer._forward(x) # The SiLU and GELU implementations are equivalent to the native PyTorch diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index c402fe3e98c7..6165225d2d81 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -57,7 +57,8 @@ def test_fused_moe( [torch.float32, torch.float16, torch.bfloat16]) @torch.inference_mode() def test_mixtral_moe(dtype: torch.dtype): - "Make sure our Mixtral MoE implementation agrees with the one from huggingface." 
+ """Make sure our Mixtral MoE implementation agrees with the one from + huggingface.""" # Instantiate our and huggingface's MoE blocks config = MixtralConfig() diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index 0d27bbaff9fc..ffdcc1e8c80f 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -1,8 +1,9 @@ -from typing import Optional +from typing import List, Optional import pytest import torch from allclose_default import get_default_atol, get_default_rtol +from itertools import accumulate from vllm.model_executor.layers.rotary_embedding import get_rope IS_NEOX_STYLE = [True, False] @@ -72,3 +73,135 @@ def test_rotary_embedding( ref_key, atol=get_default_atol(out_key), rtol=get_default_rtol(out_key)) + + +@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) +@pytest.mark.parametrize("batch_size", BATCH_SIZES) +@pytest.mark.parametrize("seq_len", SEQ_LENS) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_batched_rotary_embedding( + is_neox_style: bool, + batch_size: int, + seq_len: int, + num_heads: int, + head_size: int, + rotary_dim: Optional[int], + dtype: torch.dtype, + seed: int, + device: str, + max_position: int = 8192, + base: int = 10000, +) -> None: + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.set_default_device(device) + if rotary_dim is None: + rotary_dim = head_size + rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, { + "type": "linear", + "factor": (1, ) + }) + rope = rope.to(dtype=dtype) + + positions = torch.randint(0, max_position, (batch_size, seq_len)) + query = torch.randn(batch_size, + seq_len, + num_heads * head_size, + dtype=dtype) + key = torch.randn_like(query) + + # NOTE(woosuk): The reference implementation should be executed first + # because the custom kernel is in-place. + ref_query, ref_key = rope._forward(positions, query, key) + out_query, out_key = rope.forward(positions, + query, + key, + offsets=torch.zeros(batch_size * seq_len, + dtype=int, + device=device)) + # Compare the results. 
+ assert torch.allclose(out_query, + ref_query, + atol=get_default_atol(out_query), + rtol=get_default_rtol(out_query)) + assert torch.allclose(out_key, + ref_key, + atol=get_default_atol(out_key), + rtol=get_default_rtol(out_key)) + + +@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) +@pytest.mark.parametrize("batch_size", BATCH_SIZES) +@pytest.mark.parametrize("seq_len", SEQ_LENS) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_batched_rotary_embedding_multi_lora( + is_neox_style: bool, + batch_size: int, + seq_len: int, + num_heads: int, + head_size: int, + rotary_dim: Optional[int], + dtype: torch.dtype, + seed: int, + device: str, + max_position: int = 8192, + base: int = 10000, +) -> None: + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + torch.set_default_device(device) + if rotary_dim is None: + rotary_dim = head_size + scaling_factors: List[int] = [1, 2, 4] + rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, { + "type": "linear", + "factor": tuple(scaling_factors) + }) + rope = rope.to(dtype=dtype) + + positions = torch.randint(0, max_position, (batch_size, seq_len)) + query = torch.randn(batch_size, + seq_len, + num_heads * head_size, + dtype=dtype) + key = torch.randn_like(query) + + offset_map = torch.tensor( + list( + accumulate([0] + [ + max_position * scaling_factor * 2 + for scaling_factor in scaling_factors[:-1] + ]))) + query_types = torch.randint(0, + len(scaling_factors), (batch_size, seq_len), + device=device) + query_offsets = offset_map[query_types] + + # NOTE(woosuk): The reference implementation should be executed first + # because the custom kernel is in-place. + ref_query, ref_key = rope._forward(positions, query, key, query_offsets) + out_query, out_key = rope.forward(positions, query, key, + query_offsets.flatten()) + # Compare the results. 
+ assert torch.allclose(out_query, + ref_query, + atol=get_default_atol(out_query), + rtol=get_default_rtol(out_query)) + assert torch.allclose(out_key, + ref_key, + atol=get_default_atol(out_key), + rtol=get_default_rtol(out_key)) diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index e881cd1ec375..a0be658acac7 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -114,7 +114,8 @@ def test_contexted_kv_attention( v_cache = v_cache.view(-1, block_size, num_kv_heads, head_size).permute(0, 2, 3, 1).contiguous() - # Warm up the Triton kernel by calling it once before actually measuring generation time + # Warm up the Triton kernel by calling it once before actually measuring + # generation time context_attention_fwd(query, k, v, output, k_cache, v_cache, block_table, b_start_loc, b_seq_len, b_ctx_len, max_input_len) torch.cuda.synchronize() diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 67273144ecd0..30a8ad03c8ad 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -152,4 +152,5 @@ def get_model_patched(model_config, device_config, **kwargs): @pytest.fixture def llama_2_7b_model_extra_embeddings( llama_2_7b_engine_extra_embeddings) -> nn.Module: - yield llama_2_7b_engine_extra_embeddings.driver_worker.model_runner.model + yield (llama_2_7b_engine_extra_embeddings.model_executor.driver_worker. + model_runner.model) diff --git a/tests/lora/test_layer_variation.py b/tests/lora/test_layer_variation.py new file mode 100644 index 000000000000..95cf0cede872 --- /dev/null +++ b/tests/lora/test_layer_variation.py @@ -0,0 +1,104 @@ +from typing import List, Optional +import peft +import pytest +from random import sample +import tempfile +from transformers import AutoModelForCausalLM + +import vllm +from vllm.lora.request import LoRARequest +from .conftest import cleanup + +MODEL_PATH = "Felladrin/Llama-68M-Chat-v1" +PROMPTS = [ + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", # noqa: E501 + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. 
This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", # noqa: E501 + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]", # noqa: E501 +] + + +def get_lora_model(model_id: str, target_modules: List[str], rank: int): + model = AutoModelForCausalLM.from_pretrained(model_id) + lora_config = peft.tuners.lora.LoraConfig(target_modules, rank) + lora_model = peft.PeftModel(model, lora_config) + return lora_model + + +def do_sample(llm, + lora_path: Optional[str] = None, + lora_id: Optional[int] = None, + logprobs: int = 0, + n_tokens: int = 256): + prompts = PROMPTS + sampling_params = vllm.SamplingParams(temperature=0, + max_tokens=n_tokens, + logprobs=logprobs, + stop=["[/assistant]"]) + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id else None) + # Print the outputs. 
+ generated_texts = [] + generated_logprobs = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + generated_logprobs.append([ + list(logprob.keys()) for out in output.outputs + for logprob in out.logprobs + ]) + return generated_logprobs if logprobs else generated_texts + + +SUPPORTED_MODULES = [ + "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens", + "lm_head" +] +TARGET_MODULES_LIST = [] +for length in range(2, 6): + TARGET_MODULES_LIST.extend( + [sample(SUPPORTED_MODULES, length) for _ in range(3)]) + + +# Test the correctness when layer and rank are varied +# step 1: init a base model and serve with LoRA to get the reference results +# step 2: merge the same LoRA to the base model, serve the merged model +# step 3: compare the results from step 1 and step 2 +@pytest.mark.parametrize("tp_size", [1]) +@pytest.mark.parametrize("target_modules", TARGET_MODULES_LIST) +@pytest.mark.parametrize("rank", [8, 16, 32, 64]) +def test_layer_variation_correctness(tp_size, target_modules, rank): + llm = vllm.LLM(MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + tensor_parallel_size=tp_size, + worker_use_ray=True) + model = get_lora_model(MODEL_PATH, target_modules, rank) + with tempfile.TemporaryDirectory() as tmpdir: + model.save_pretrained(tmpdir) + merged_probs = do_sample(llm, tmpdir, 1, logprobs=5, n_tokens=32) + del llm + cleanup() + reference_id_sets = [set(prob[0]) for prob in merged_probs] + + model = get_lora_model(MODEL_PATH, target_modules, rank) + with tempfile.TemporaryDirectory() as tmpdir: + merged_model = model.merge_and_unload() + merged_model.save_pretrained(tmpdir) + llm = vllm.LLM(tmpdir, + tokenizer=MODEL_PATH, + enable_lora=False, + max_num_seqs=16, + tensor_parallel_size=tp_size, + worker_use_ray=True) + probs = do_sample(llm, logprobs=5, n_tokens=32) + del llm + cleanup() + # verify the top-5 tokens are identical for each token + id_sets = [set(prob[0]) for prob in probs] + assert id_sets == reference_id_sets diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 18ce300449db..46f054c5b84e 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -17,14 +17,16 @@ LoRAMapping, BaseLayerWithLoRA, ) -from vllm.lora.models import LoRALayerWeights, convert_mapping, PackedLoRALayerWeights +from vllm.lora.models import (LoRALayerWeights, convert_mapping, + PackedLoRALayerWeights) from vllm.config import LoRAConfig from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, RowParallelLinear, QKVParallelLinear) -from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) from vllm.model_executor.utils import set_random_seed from .utils import DummyLoRAManager @@ -258,7 +260,8 @@ def create_random_embedding_layer(): @torch.inference_mode() -# @pytest.mark.skip(reason="Fails when loras are in any slot other than the first.") +# @pytest.mark.skip( +# reason="Fails when loras are in any slot other than the first.") @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("device", CUDA_DEVICES) def test_embeddings_with_new_embeddings(dist_init, num_loras, device) -> None: @@ -674,9 +677,9 @@ class FakeConfig: 
result = linear(input_)[0] subloras = sublora_dict[lora_id] for i, sublora in enumerate(subloras): - result[:, sublora.lora_b.shape[1] * i:sublora.lora_b.shape[1] * ( - i + 1 - )] += input_ @ sublora.lora_a @ sublora.lora_b * sublora.scaling + result[:, sublora.lora_b.shape[1] * i:sublora.lora_b.shape[1] * + (i + 1)] += (input_ @ sublora.lora_a @ sublora.lora_b * + sublora.scaling) expected_results.append(result) expected_result = torch.cat(expected_results) diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py index dfaf8c700695..130906c3d584 100644 --- a/tests/lora/test_llama.py +++ b/tests/lora/test_llama.py @@ -10,12 +10,12 @@ def do_sample(llm, lora_path: str, lora_id: int): prompts = [ - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. 
[/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 ] sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256, @@ -48,20 +48,20 @@ def test_llama_lora(sql_lora_files, tp_size): tensor_parallel_size=tp_size) expected_no_lora_output = [ - "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]", - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ", - "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m", - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ", - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? 
", - "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", + "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ", # noqa: E501 + "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? 
", # noqa: E501 + "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", # noqa: E501 ] expected_lora_output = [ - " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", - " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", - " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", - " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", - " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", - " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " + " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 + " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501 + " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501 + " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501 + " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501 + " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " # noqa: E501 ] print("lora adapter created") @@ -121,7 +121,8 @@ def test_llama_tensor_parallel_equality(sql_lora_files): def test_llama_lora_warmup(sql_lora_files): - """Test that the LLM initialization works with a warmup LORA path and is more conservative""" + """Test that the LLM initialization works with a warmup LORA path and + is more conservative""" @ray.remote(num_gpus=1) def get_num_gpu_blocks_lora(): @@ -132,13 +133,15 @@ def get_num_gpu_blocks_lora(): @ray.remote(num_gpus=1) def get_num_gpu_blocks_no_lora(): llm = vllm.LLM(MODEL_PATH, max_num_seqs=16) - num_gpu_blocks_no_lora_warmup = llm.llm_engine.cache_config.num_gpu_blocks + num_gpu_blocks_no_lora_warmup = ( + llm.llm_engine.cache_config.num_gpu_blocks) return num_gpu_blocks_no_lora_warmup num_gpu_blocks_lora_warmup = ray.get(get_num_gpu_blocks_lora.remote()) num_gpu_blocks_no_lora_warmup = ray.get( get_num_gpu_blocks_no_lora.remote()) assert num_gpu_blocks_lora_warmup < num_gpu_blocks_no_lora_warmup, ( - "The warmup with lora should be more" - " conservative than without lora, therefore the number of memory blocks for the KV cache should be " + "The warmup with lora should be more " + "conservative than without lora, therefore the number of " + "memory blocks for the KV cache should be " "less 
when using lora than when not using lora") diff --git a/tests/lora/test_mixtral.py b/tests/lora/test_mixtral.py index e45fb92ab7ed..4d74722aaa92 100644 --- a/tests/lora/test_mixtral.py +++ b/tests/lora/test_mixtral.py @@ -9,9 +9,9 @@ def do_sample(llm, lora_path: str, lora_id: int): prompts = [ - "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", - "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", - "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]", + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. 
This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]", # noqa: E501 + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]", # noqa: E501 + "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. 
[/user] [assistant]", # noqa: E501 ] sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256) outputs = llm.generate( @@ -42,9 +42,9 @@ def test_mixtral_lora(mixtral_lora_files, tp_size): worker_use_ray=True) expected_lora_output = [ - "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", - "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", - "inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])", + "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", # noqa: E501 + "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])", # noqa: E501 + "inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])", # noqa: E501 ] assert do_sample(llm, mixtral_lora_files, diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py index cbe0f6fa2e85..fd707766c6a3 100644 --- a/tests/lora/test_punica.py +++ b/tests/lora/test_punica.py @@ -45,7 +45,7 @@ def _lora_ref_impl( H1 = H2 = [ 128, 256, 512, 1024, 1280, 2048, 2560, 2752, 3072, 3456, 3584, 4096, 5120, 5504, 5632, 6144, 6912, 7168, 8192, 9216, 10240, 11008, 13824, 14336, - 24576, 32000, 32256, 32512, 32768, 33024 + 22016, 24576, 32000, 32256, 32512, 32768, 33024 ] SEED = [0xabcdabcd987] diff --git a/tests/lora/test_tokenizer.py b/tests/lora/test_tokenizer.py deleted file mode 100644 index 6c4c91fce812..000000000000 --- a/tests/lora/test_tokenizer.py +++ /dev/null @@ -1,69 +0,0 @@ -import pytest -from transformers import AutoTokenizer, PreTrainedTokenizerBase - -from vllm.lora.request import LoRARequest -from vllm.transformers_utils.tokenizer import TokenizerGroup, get_lora_tokenizer - - -@pytest.mark.asyncio -async def test_transformers_tokenizer(): - reference_tokenizer = AutoTokenizer.from_pretrained("gpt2") - tokenizer = TokenizerGroup( - tokenizer_id="gpt2", - enable_lora=False, - max_num_seqs=1, - max_input_length=None, - ) - assert reference_tokenizer.encode("prompt") == tokenizer.encode( - request_id="request_id", prompt="prompt", lora_request=None) - assert reference_tokenizer.encode( - "prompt") == await tokenizer.encode_async(request_id="request_id", - prompt="prompt", - lora_request=None) - assert isinstance(tokenizer.get_lora_tokenizer(None), - PreTrainedTokenizerBase) - assert tokenizer.get_lora_tokenizer( - None) == await tokenizer.get_lora_tokenizer_async(None) - - -@pytest.mark.asyncio -async def test_transformers_tokenizer_lora(sql_lora_files): - reference_tokenizer = AutoTokenizer.from_pretrained(sql_lora_files) - tokenizer = TokenizerGroup( - tokenizer_id="gpt2", - enable_lora=True, - max_num_seqs=1, - max_input_length=None, - ) - lora_request = LoRARequest("1", 1, sql_lora_files) - assert reference_tokenizer.encode("prompt") == tokenizer.encode( - request_id="request_id", prompt="prompt", lora_request=lora_request) - assert reference_tokenizer.encode( - "prompt") == await tokenizer.encode_async(request_id="request_id", - prompt="prompt", - lora_request=lora_request) - assert isinstance(tokenizer.get_lora_tokenizer(None), - PreTrainedTokenizerBase) - assert tokenizer.get_lora_tokenizer( - None) == await tokenizer.get_lora_tokenizer_async(None) - 
- assert isinstance(tokenizer.get_lora_tokenizer(lora_request), - PreTrainedTokenizerBase) - assert tokenizer.get_lora_tokenizer( - lora_request) != tokenizer.get_lora_tokenizer(None) - assert tokenizer.get_lora_tokenizer( - lora_request) == await tokenizer.get_lora_tokenizer_async(lora_request) - - -def test_get_lora_tokenizer(sql_lora_files, tmpdir): - lora_request = None - tokenizer = get_lora_tokenizer(lora_request) - assert not tokenizer - - lora_request = LoRARequest("1", 1, sql_lora_files) - tokenizer = get_lora_tokenizer(lora_request) - assert tokenizer.get_added_vocab() - - lora_request = LoRARequest("1", 1, str(tmpdir)) - tokenizer = get_lora_tokenizer(lora_request) - assert not tokenizer diff --git a/tests/lora/test_tokenizer_group.py b/tests/lora/test_tokenizer_group.py new file mode 100644 index 000000000000..5fec3f179925 --- /dev/null +++ b/tests/lora/test_tokenizer_group.py @@ -0,0 +1,53 @@ +import pytest +from transformers import AutoTokenizer, PreTrainedTokenizerBase +from vllm.lora.request import LoRARequest +from vllm.transformers_utils.tokenizer_group import get_tokenizer_group +from vllm.transformers_utils.tokenizer import get_lora_tokenizer +from ..conftest import get_tokenizer_pool_config + + +@pytest.mark.asyncio +@pytest.mark.parametrize("tokenizer_group_type", [None, "ray"]) +async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type): + reference_tokenizer = AutoTokenizer.from_pretrained(sql_lora_files) + tokenizer_group = get_tokenizer_group( + get_tokenizer_pool_config(tokenizer_group_type), + tokenizer_id="gpt2", + enable_lora=True, + max_num_seqs=1, + max_input_length=None, + ) + lora_request = LoRARequest("1", 1, sql_lora_files) + assert reference_tokenizer.encode("prompt") == tokenizer_group.encode( + request_id="request_id", prompt="prompt", lora_request=lora_request) + assert reference_tokenizer.encode( + "prompt") == await tokenizer_group.encode_async( + request_id="request_id", + prompt="prompt", + lora_request=lora_request) + assert isinstance(tokenizer_group.get_lora_tokenizer(None), + PreTrainedTokenizerBase) + assert tokenizer_group.get_lora_tokenizer( + None) == await tokenizer_group.get_lora_tokenizer_async(None) + + assert isinstance(tokenizer_group.get_lora_tokenizer(lora_request), + PreTrainedTokenizerBase) + assert tokenizer_group.get_lora_tokenizer( + lora_request) != tokenizer_group.get_lora_tokenizer(None) + assert tokenizer_group.get_lora_tokenizer( + lora_request) == await tokenizer_group.get_lora_tokenizer_async( + lora_request) + + +def test_get_lora_tokenizer(sql_lora_files, tmpdir): + lora_request = None + tokenizer = get_lora_tokenizer(lora_request) + assert not tokenizer + + lora_request = LoRARequest("1", 1, sql_lora_files) + tokenizer = get_lora_tokenizer(lora_request) + assert tokenizer.get_added_vocab() + + lora_request = LoRARequest("1", 1, str(tmpdir)) + tokenizer = get_lora_tokenizer(lora_request) + assert not tokenizer diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 410bdfa5c69e..0ab9c63ce437 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -21,7 +21,8 @@ def test_metric_counter_prompt_tokens( gpu_memory_utilization=0.4) tokenizer = vllm_model.model.get_tokenizer() prompt_token_counts = [len(tokenizer.encode(p)) for p in example_prompts] - # This test needs at least 2 prompts in a batch of different lengths to verify their token count is correct despite padding. 
+ # This test needs at least 2 prompts in a batch of different lengths to + # verify their token count is correct despite padding. assert len(example_prompts) > 1, "at least 2 prompts are required" assert prompt_token_counts[0] != prompt_token_counts[1], ( "prompts of different lengths are required") @@ -33,8 +34,8 @@ def test_metric_counter_prompt_tokens( **stat_logger.labels)._value.get() assert vllm_prompt_token_count == metric_count, ( - f"prompt token count: {vllm_prompt_token_count!r}\nmetric: {metric_count!r}" - ) + f"prompt token count: {vllm_prompt_token_count!r}\n" + f"metric: {metric_count!r}") @pytest.mark.parametrize("model", MODELS) @@ -60,9 +61,10 @@ def test_metric_counter_generation_tokens( for i in range(len(example_prompts)): vllm_output_ids, vllm_output_str = vllm_outputs[i] prompt_ids = tokenizer.encode(example_prompts[i]) - # vllm_output_ids contains both prompt tokens and generation tokens. We're interested only in the count of the generation tokens. + # vllm_output_ids contains both prompt tokens and generation tokens. + # We're interested only in the count of the generation tokens. vllm_generation_count += len(vllm_output_ids) - len(prompt_ids) assert vllm_generation_count == metric_count, ( - f"generation token count: {vllm_generation_count!r}\nmetric: {metric_count!r}" - ) + f"generation token count: {vllm_generation_count!r}\n" + f"metric: {metric_count!r}") diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index f3cc517364f0..a3a1487e62e0 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -1,7 +1,7 @@ """Compare the outputs of a GPTQ model to a Marlin model. -Note: GPTQ and Marlin do not have bitwise correctness. -As a result, in this test, we just confirm that the top selected tokens of the +Note: GPTQ and Marlin do not have bitwise correctness. +As a result, in this test, we just confirm that the top selected tokens of the Marlin/GPTQ models are in the top 3 selections of each other. Note: Marlin internally uses locks to synchronize the threads. This can @@ -14,7 +14,8 @@ import pytest import torch from dataclasses import dataclass -from vllm.model_executor.layers.quantization import _QUANTIZATION_CONFIG_REGISTRY +from vllm.model_executor.layers.quantization import ( + _QUANTIZATION_CONFIG_REGISTRY) capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] @@ -87,11 +88,11 @@ def test_models( if marlin_output_id != gptq_output_id: # Each predicted token must be in top 5 of the other's assert gptq_output_id in marlin_logprobs[idx], ( - f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\nMarlin:\t{marlin_output_str!r}" - ) + f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\n" + f"Marlin:\t{marlin_output_str!r}") assert marlin_output_id in gptq_logprobs[idx], ( - f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\nMarlin:\t{marlin_output_str!r}" - ) + f"Test{prompt_idx}:\nGPTQ:\t{gptq_output_str!r}\n" + f"Marlin:\t{marlin_output_str!r}") # Break out since sequences will now diverge. 
break diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 7ef8dde7bb8f..c83551c36ef1 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -20,20 +20,23 @@ def test_block_allocator( num_blocks, enable_caching=True) - # Allocate two PysicalTokenBlocks with the same hash and check that they are the same PhysicalTokenBlock + # Allocate two PhysicalTokenBlocks with the same hash and check + # that they are the same PhysicalTokenBlock first_block = block_allocator.allocate(block_hash, 0) second_block = block_allocator.allocate(block_hash, 0) assert (first_block == second_block) assert (second_block.ref_count == 2) - # Free the first_block and confirm that the ref_count is correctly decremented on the second block + # Free the first_block and confirm that the ref_count is correctly + # decremented on the second block block_allocator.free(first_block) assert (second_block.ref_count == 1) # Free the second block block_allocator.free(second_block) - # Reallocate the first block and confirm that, even after the block had its ref_count go to 0, we still get the same block back + # Reallocate the first block and confirm that, even after the block + # had its ref_count go to 0, we still get the same block back first_block = block_allocator.allocate(block_hash, 0) assert (first_block == second_block) assert (first_block.block_hash == block_hash) @@ -56,7 +59,8 @@ def test_eviction(num_blocks: int, ): for block in blocks: block_allocator.free(block) - # Allocate a new block and confirm that it's the first block freed. I.E The Least Recently Used block + # Allocate a new block and confirm that it's the first block freed. + # i.e. the Least Recently Used block new_block_hash = block_size new_block = block_allocator.allocate(new_block_hash, 0) assert (new_block == blocks[0]) @@ -68,7 +72,8 @@ def test_eviction(num_blocks: int, ): assert (realloc_block == blocks[realloc_block_hash]) assert (realloc_block.block_hash == realloc_block_hash) - # Allocate a new block and confirm that it's not the realloc_block, since the realloc_block shouldn't be in the free list + # Allocate a new block and confirm that it's not the realloc_block, + # since the realloc_block shouldn't be in the free list new_block_hash = block_size + 1 new_block = block_allocator.allocate(new_block_hash, 0) assert (realloc_block != new_block) diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 1abb55f02121..14f1872c4525 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -70,8 +70,8 @@ def test_get_prompt_logprobs( hf_logprob[i][-1][token_id].item(), atol=1e-2, rtol=1e-2) - assert isinstance(sample_logprob.decoded_token, str), \ - ("The token should be decoded by the time it is returned " + assert isinstance(sample_logprob.decoded_token, str), ( + "The token should be decoded by the time it is returned " " to the user.") diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 31e865f42ff3..1bc8703d1a8e 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -255,9 +255,10 @@ def test_sampling(model_runner: ModelRunner): if metadata.sampling_params.use_beam_search: continue - if metadata.sampling_params.seed is not None \ - and expected_tokens[i] is None: - # Record seeded random result to compare with results of second invocation + if (metadata.sampling_params.seed is not None + and expected_tokens[i] is None): + # 
Record seeded random result to compare with results of + # second invocation expected_tokens[i] = [ nth_output.output_token for nth_output in sequence_output.samples @@ -265,11 +266,13 @@ def test_sampling(model_runner: ModelRunner): continue for n, nth_output in enumerate(sequence_output.samples): - if metadata.sampling_params.temperature == 0 or metadata.sampling_params.seed is not None: + if (metadata.sampling_params.temperature == 0 + or metadata.sampling_params.seed is not None): # Ensure exact matches for greedy or random with seed assert nth_output.output_token == expected_tokens[i][n] else: - # For non-seeded random check that one of the high-logit tokens were chosen + # For non-seeded random check that one of the high-logit + # tokens were chosen assert nth_output.output_token in expected_tokens[i] # Test batch @@ -284,8 +287,8 @@ def test_sampling(model_runner: ModelRunner): input_tensor.data = input_tensor.index_select(0, target_index) fake_logits.data = fake_logits.index_select(0, target_index) - # This time, results of seeded random samples will be compared with the corresponding - # sample in the pre-shuffled batch + # This time, results of seeded random samples will be compared with + # the corresponding sample in the pre-shuffled batch test_sampling(model_runner) del model_runner diff --git a/tests/worker/spec_decode/__init__.py b/tests/spec_decode/__init__.py similarity index 100% rename from tests/worker/spec_decode/__init__.py rename to tests/spec_decode/__init__.py diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py new file mode 100644 index 000000000000..fddc3995452c --- /dev/null +++ b/tests/spec_decode/test_batch_expansion.py @@ -0,0 +1,95 @@ +import torch +import pytest + +from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer + +from .utils import mock_worker, create_seq_group_metadata_from_prompts + + +@pytest.mark.parametrize('num_target_seq_ids', [100]) +def test_create_target_seq_id_iterator(num_target_seq_ids: int): + """Verify all new sequence ids are greater than all input + seq ids. + """ + scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000) + + all_seq_ids = [ + [1, 3, 5, 7], + list(range(100)) + [0], + [100], + ] + + for seq_ids in all_seq_ids: + max_seq_id = max(seq_ids) + iterator = scorer._create_target_seq_id_iterator(seq_ids) # pylint: disable=protected-access + for _ in range(num_target_seq_ids): + assert next(iterator) > max_seq_id + + +@pytest.mark.parametrize('k', [1, 2, 6]) +def test_get_token_ids_to_score(k: int): + """Verify correct tokens are selected for scoring. + """ + proposal_token_ids = torch.tensor( + list(range(k)), + dtype=torch.int64, + device='cuda', + ) + + expected_output = [ + [], + ] + for i in range(proposal_token_ids.shape[0]): + expected_output.append(proposal_token_ids[:i + 1].tolist()) + + scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000) + actual_output = scorer._get_token_ids_to_score(proposal_token_ids) # pylint: disable=protected-access + + actual_output = [ + x.tolist() if isinstance(x, torch.Tensor) else x for x in actual_output + ] + + assert actual_output == expected_output + + +@pytest.mark.parametrize('k', [1, 2, 6]) +def test_create_single_target_seq_group_metadata(k: int): + """Verify correct creation of a batch-expanded seq group metadata. 
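+    The expanded copy should reuse the input sequence's prompt and block
+    table, with the proposal token ids appended to its output tokens.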
+ """ + + prompt_tokens = [1, 2, 3] + prev_output_tokens = [4, 5, 6] + + token_ids = list(range(k)) + + num_tokens_processed = len(prompt_tokens) + len(prev_output_tokens) - 1 + + final_seq_len = len(prompt_tokens) + len(prev_output_tokens) + len( + token_ids) + + block_size = 32 + input_seq_group_metadata = create_seq_group_metadata_from_prompts( + [prompt_tokens], 2048 // block_size, block_size, [final_seq_len], + [prev_output_tokens], [num_tokens_processed])[0] + + input_seq_id = list(input_seq_group_metadata.seq_data.keys())[0] + target_seq_id = 100 + + scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000) + output = scorer._create_single_target_seq_group_metadata( # pylint: disable=protected-access + input_seq_group_metadata, + input_seq_id, + target_seq_id, + token_ids, + ) + + assert output.request_id == input_seq_group_metadata.request_id + assert len(output.seq_data) == 1 + assert output.seq_data[target_seq_id].get_prompt_token_ids( + ) == prompt_tokens + assert output.seq_data[target_seq_id].get_output_token_ids( + ) == prev_output_tokens + token_ids + + assert len(output.block_tables) == 1 + assert output.block_tables[ + target_seq_id] == input_seq_group_metadata.block_tables[input_seq_id] diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py new file mode 100644 index 000000000000..09847136d13e --- /dev/null +++ b/tests/spec_decode/test_metrics.py @@ -0,0 +1,159 @@ +import torch +import math +import pytest + +from unittest.mock import MagicMock + +from vllm.spec_decode.metrics import AsyncMetricsCollector + + +def test_initial_call_returns_none(): + """Expect first call to get metrics to return None. + """ + rej_sampler = MagicMock() + rej_sampler.num_accepted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_emitted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_draft_tokens = 0 + + collector = AsyncMetricsCollector(rej_sampler) + collector.init_gpu_tensors(rank=0) + maybe_metrics = collector.maybe_collect_rejsample_metrics(k=5) + assert maybe_metrics is None + + +def test_second_call_returns_metrics(): + """Expect second call to not return None. + """ + rej_sampler = MagicMock() + rej_sampler.num_accepted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_emitted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_draft_tokens = 0 + + collect_interval_s = 5.0 + timer = MagicMock() + timer.side_effect = [ + 0.0, collect_interval_s + 0.1, collect_interval_s + 0.2 + ] + + collector = AsyncMetricsCollector(rejection_sampler=rej_sampler, + timer=timer, + collect_interval_s=collect_interval_s) + collector.init_gpu_tensors(rank=0) + _ = collector.maybe_collect_rejsample_metrics(k=5) + metrics = collector.maybe_collect_rejsample_metrics(k=5) + assert metrics is not None + + +@pytest.mark.parametrize("rank", [1, 2, 3, 4]) +def test_nonzero_rank_noop(rank): + """Verify nonzero ranks don't collect metrics. 
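+    Metrics are only gathered on rank 0; any other rank should always get
+    None back from maybe_collect_rejsample_metrics.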
+ """ + rej_sampler = MagicMock() + rej_sampler.num_accepted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_emitted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_draft_tokens = 0 + + collector = AsyncMetricsCollector(rej_sampler) + collector.init_gpu_tensors(rank=rank) + _ = collector.maybe_collect_rejsample_metrics(k=5) + metrics = collector.maybe_collect_rejsample_metrics(k=5) + assert metrics is None + + +def test_noop_until_time(): + """Verify metrics aren't collected until enough time passes. + """ + rej_sampler = MagicMock() + rej_sampler.num_accepted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_emitted_tokens = torch.tensor(0, + dtype=torch.long, + device='cuda') + rej_sampler.num_draft_tokens = 0 + + collect_interval_s = 5.0 + timer = MagicMock() + timer.side_effect = [ + 0.0, collect_interval_s - 0.1, collect_interval_s - 0.1, + collect_interval_s + 0.1, collect_interval_s + 0.1 + ] + + collector = AsyncMetricsCollector(rejection_sampler=rej_sampler, + timer=timer, + collect_interval_s=collect_interval_s) + collector.init_gpu_tensors(rank=0) + + _ = collector.maybe_collect_rejsample_metrics(k=5) + metrics = collector.maybe_collect_rejsample_metrics(k=5) + assert metrics is None + + _ = collector.maybe_collect_rejsample_metrics(k=5) + metrics = collector.maybe_collect_rejsample_metrics(k=5) + assert metrics is not None + + +@pytest.mark.parametrize("has_data", [True, False]) +def test_initial_metrics_has_correct_values(has_data: bool): + """Test correctness of metrics data. + """ + if has_data: + num_accepted_tokens = 103 + num_emitted_tokens = 104 + num_draft_tokens = 105 + else: + num_accepted_tokens = 0 + num_emitted_tokens = 0 + num_draft_tokens = 0 + k = 5 + + num_possible_tokens = AsyncMetricsCollector.get_max_num_accepted_tokens( + num_draft_tokens, k) + + rej_sampler = MagicMock() + rej_sampler.num_accepted_tokens = torch.tensor(num_accepted_tokens, + dtype=torch.long, + device='cuda') + rej_sampler.num_emitted_tokens = torch.tensor(num_emitted_tokens, + dtype=torch.long, + device='cuda') + rej_sampler.num_draft_tokens = num_draft_tokens + + collect_interval_s = 5.0 + timer = MagicMock() + timer.side_effect = [ + 0.0, collect_interval_s + 0.1, collect_interval_s + 0.2 + ] + + collector = AsyncMetricsCollector(rejection_sampler=rej_sampler, + timer=timer, + collect_interval_s=collect_interval_s) + collector.init_gpu_tensors(rank=0) + _ = collector.maybe_collect_rejsample_metrics(k) + metrics = collector.maybe_collect_rejsample_metrics(k) + + assert metrics.num_spec_tokens == k + assert metrics.accepted_tokens == num_accepted_tokens + assert metrics.draft_tokens == num_draft_tokens + assert metrics.emitted_tokens == num_emitted_tokens + + if has_data: + assert (metrics.draft_acceptance_rate == num_accepted_tokens / + num_draft_tokens) + assert (metrics.system_efficiency == num_emitted_tokens / + num_possible_tokens) + else: + assert math.isnan(metrics.draft_acceptance_rate) + assert math.isnan(metrics.system_efficiency) diff --git a/tests/worker/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py similarity index 61% rename from tests/worker/spec_decode/test_multi_step_worker.py rename to tests/spec_decode/test_multi_step_worker.py index ea5480290357..45b43ec59ee8 100644 --- a/tests/worker/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -3,14 +3,16 @@ import pytest from unittest.mock import MagicMock 
-from vllm.worker.spec_decode.multi_step_worker import MultiStepWorker +from vllm.spec_decode.multi_step_worker import (MultiStepWorker, + DraftModelTop1Proposer) from vllm.worker.worker import Worker from vllm.model_executor.utils import set_random_seed +from vllm.sequence import SamplerOutput from .utils import (create_execute_model_data, create_worker, create_seq_group_metadata_from_prompts, zero_kv_cache, patch_execute_model_with_seeds, - assert_logprobs_dict_allclose) + assert_logprobs_dict_allclose, create_batch) @pytest.mark.parametrize('num_steps', list(range(1, 17))) @@ -259,3 +261,160 @@ def test_same_output_for_multi_step(): multi_step_output_logprobs, single_step_output_logprobs): assert_logprobs_dict_allclose(multi_step_logprobs, single_step_logprobs) + + +@torch.inference_mode() +def test_draft_proposals_full_speculation_len(): + """Verify DraftModelTop1Proposer correctly handles case where all sequences + can speculate. + """ + k = 10 + batch_size = 32 + vocab_size = 32_000 + device = 'cuda:0' + + draft_worker = MagicMock() + proposer = DraftModelTop1Proposer( + draft_worker=draft_worker, + device=device, + max_model_len=2048, + vocab_size=vocab_size, + ) + draft_worker.execute_model_multi_step.return_value = [ + SamplerOutput( + outputs=[], + sampled_token_probs=torch.rand(batch_size, + vocab_size, + device=device, + dtype=torch.float32), + sampled_token_ids=torch.randint(low=0, + high=vocab_size, + size=(batch_size, ), + device=device, + dtype=torch.long), + ) for _ in range(k) + ] + + execute_model_data, _, _ = create_batch(batch_size, k) + + proposals = proposer.get_proposals( + **execute_model_data.to_dict(), + max_proposal_len=k, + ) + + assert torch.is_tensor(proposals.proposal_token_ids) + assert torch.is_tensor(proposals.proposal_probs) + + assert proposals.proposal_token_ids.shape == torch.Size([batch_size, k]) + assert proposals.proposal_probs.shape[:-1] == torch.Size([batch_size, k]) + + assert proposals.proposal_lens.shape == torch.Size([batch_size]) + assert proposals.proposal_lens.tolist() == [k for _ in range(batch_size)] + + +@torch.inference_mode() +def test_draft_proposals_no_speculations(): + """Verify DraftModelTop1Proposer correctly handles case where no sequences + can speculate. + """ + k = 10 + batch_size = 32 + vocab_size = 32_000 + device = 'cuda:0' + prompt_len = 10 + + draft_worker = MagicMock() + proposer = DraftModelTop1Proposer( + draft_worker=draft_worker, + device=device, + max_model_len=prompt_len + k - 1, + vocab_size=vocab_size, + ) + + execute_model_data, _, _ = create_batch(batch_size, + k, + prompt_len=prompt_len) + + proposals = proposer.get_proposals( + **execute_model_data.to_dict(), + max_proposal_len=k, + ) + + assert torch.is_tensor(proposals.proposal_token_ids) + assert torch.is_tensor(proposals.proposal_probs) + + assert proposals.proposal_token_ids.shape == torch.Size([0, k]) + assert proposals.proposal_probs.shape[:-1] == torch.Size([0, k]) + + assert proposals.proposal_lens.shape == torch.Size([batch_size]) + assert proposals.proposal_lens.tolist() == [0 for _ in range(batch_size)] + + +@torch.inference_mode() +def test_draft_proposals_mixed_k(): + """Verify DraftModelTop1Proposer correctly handles case some sequences can + speculate and some can't. 
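+    Sequences whose context would exceed max_model_len within k steps get a
+    proposal length of zero, while the rest keep the full k.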
+ """ + k = 10 + batch_size = 32 + vocab_size = 32_000 + device = 'cuda:0' + + small_prompt_len = 5 + long_prompt_len = 10 + prev_output_token_len = 20 + + expected_num_proposal_seqs = 6 + expected_num_no_proposal_seqs = batch_size - expected_num_proposal_seqs + + prompt_len = [ + small_prompt_len for _ in range(expected_num_proposal_seqs - 1) + ] + [long_prompt_len + for _ in range(expected_num_no_proposal_seqs)] + [small_prompt_len] + + draft_worker = MagicMock() + proposer = DraftModelTop1Proposer( + draft_worker=draft_worker, + device=device, + max_model_len=long_prompt_len + prev_output_token_len + k - 1, + vocab_size=vocab_size, + ) + + draft_worker.execute_model_multi_step.return_value = [ + SamplerOutput( + outputs=[], + sampled_token_probs=torch.rand(expected_num_proposal_seqs, + vocab_size, + device=device, + dtype=torch.float32), + sampled_token_ids=torch.randint( + low=0, + high=vocab_size, + size=(expected_num_proposal_seqs, ), + device=device, + dtype=torch.long), + ) for _ in range(k) + ] + + execute_model_data, _, _ = create_batch( + batch_size, + k, + prompt_len=prompt_len, + prev_output_token_len=prev_output_token_len, + ) + + proposals = proposer.get_proposals( + **execute_model_data.to_dict(), + max_proposal_len=k, + ) + + assert torch.is_tensor(proposals.proposal_token_ids) + assert torch.is_tensor(proposals.proposal_probs) + + assert proposals.proposal_token_ids.shape == torch.Size([batch_size, k]) + assert proposals.proposal_probs.shape[:-1] == torch.Size([batch_size, k]) + + assert proposals.proposal_lens.shape == torch.Size([batch_size]) + assert proposals.proposal_lens.tolist() == [ + k for _ in range(expected_num_proposal_seqs - 1) + ] + [0 for _ in range(expected_num_no_proposal_seqs)] + [k] diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py new file mode 100644 index 000000000000..bfc69e01e3eb --- /dev/null +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -0,0 +1,597 @@ +import torch +import random +import pytest +from unittest.mock import MagicMock + +from vllm.spec_decode.multi_step_worker import MultiStepWorker +from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker, + split_num_cache_blocks_evenly) +from vllm.spec_decode.interfaces import SpeculativeProposals +from vllm.model_executor.utils import set_random_seed +from vllm.model_executor.layers.rejection_sampler import RejectionSampler +from .utils import (mock_worker, create_batch, ExecuteModelData, + create_sampler_output_list) +from vllm.spec_decode.metrics import (SpecDecodeWorkerMetrics, + AsyncMetricsCollector) + + +@pytest.mark.parametrize('k', [1, 2, 6]) +@pytest.mark.parametrize('batch_size', [1, 2, 32]) +@torch.inference_mode() +def test_correctly_calls_draft_model(k: int, batch_size: int): + """Verify SpecDecodeWorker calls the draft worker with correct + inputs. Everything else is mocked out. 
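+    The mocked draft worker raises a sentinel error from get_spec_proposals
+    so the test can stop before any target-model scoring happens.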
+ """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + exception_secret = 'artifical stop' + draft_worker.get_spec_proposals.side_effect = ValueError(exception_secret) + + execute_model_data, _, _ = create_batch(batch_size, k) + + with pytest.raises(ValueError, match=exception_secret): + worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k) + + call_args_list = draft_worker.get_spec_proposals.call_args_list + assert len(call_args_list) == 1 + + for args, _ in call_args_list: + (seq_group_metadata_list, blocks_to_swap_in, blocks_to_swap_out, + blocks_to_copy, actual_k) = args + actual_execute_model_data = ExecuteModelData(seq_group_metadata_list, + blocks_to_swap_in, + blocks_to_swap_out, + blocks_to_copy) + assert actual_execute_model_data == execute_model_data + assert actual_k == k + + +@pytest.mark.parametrize('k', [1, 2, 6]) +@pytest.mark.parametrize('batch_size', [1, 2, 32]) +@torch.inference_mode() +def test_correctly_calls_target_model(k: int, batch_size: int): + """Verify SpecDecodeWorker calls the target model with correct + inputs. Everything else is mocked out. + """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + worker.init_model() + + vocab_size = 32_000 + + proposal_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64, + device='cuda') + proposal_probs = torch.rand(batch_size, + k, + vocab_size, + dtype=torch.float32, + device='cuda') + proposal_lens = torch.ones(batch_size, dtype=torch.int64, + device='cuda') * k + + execute_model_data, prompts, prev_output_tokens = create_batch( + batch_size, k) + + draft_worker.get_spec_proposals.return_value = SpeculativeProposals( + proposal_token_ids=proposal_token_ids, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens) + + exception_secret = 'artifical stop' + target_worker.execute_model.side_effect = ValueError(exception_secret) + + with pytest.raises(ValueError, match=exception_secret): + worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k) + + seen_contexts = [] + + call_args_list = target_worker.execute_model.call_args_list + assert len(call_args_list) == 1 + for args, kwargs in call_args_list: + target_execute_model_data = ExecuteModelData.from_dict(kwargs) + + assert len(target_execute_model_data.seq_group_metadata_list) == ( + k + 1) * batch_size + for seq_group_metadata in ( + target_execute_model_data.seq_group_metadata_list): + for seq_data in seq_group_metadata.seq_data.values(): + seen_contexts.append(seq_data.get_token_ids()) + + expected_seen_contexts = [] + + for prompt, prev_generated, draft_tokens in zip( + prompts, prev_output_tokens, proposal_token_ids.tolist()): + + for i in range(len(draft_tokens) + 1): + expected_seen_contexts.append(prompt + prev_generated + + draft_tokens[:i]) + + seen_contexts.sort() + expected_seen_contexts.sort() + assert expected_seen_contexts == seen_contexts + + 
+@pytest.mark.parametrize('k', [1, 2, 6]) +@pytest.mark.parametrize('batch_size', [1, 2, 32]) +@torch.inference_mode() +def test_correctly_calls_rejection_sampler(k: int, batch_size: int): + """Verify SpecDecodeWorker calls the rejection sampler with + correct inputs. Everything else is mocked out. + """ + vocab_size = 32_000 + + draft_worker = mock_worker(cls=MultiStepWorker, vocab_size=vocab_size) + target_worker = mock_worker(vocab_size=vocab_size) + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + worker.init_model() + + proposal_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64, + device='cuda') + proposal_probs = torch.rand(batch_size, + k, + vocab_size, + dtype=torch.float32, + device='cuda') + + proposal_lens = torch.ones(batch_size, dtype=torch.int64, + device='cuda') * k + + execute_model_data, _, _ = create_batch(batch_size, k) + + draft_worker.get_spec_proposals.return_value = SpeculativeProposals( + proposal_token_ids=proposal_token_ids, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens) + + target_token_ids = torch.randint(low=0, + high=vocab_size, + size=(1, batch_size * (k + 1)), + dtype=torch.int64, + device='cuda') + target_token_probs = torch.rand(1, + batch_size * (k + 1), + vocab_size, + dtype=torch.float32, + device='cuda') + target_output = create_sampler_output_list(target_token_ids, + target_token_probs) + + target_worker.execute_model.return_value = target_output[0] + + exception_secret = 'artifical stop' + rejection_sampler.side_effect = ValueError(exception_secret) + + with pytest.raises(ValueError, match=exception_secret): + worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k) + + assert len(rejection_sampler.call_args_list) == 1 + args, _ = rejection_sampler.call_args_list[0] + (actual_proposal_scores, actual_bonus_token_ids, actual_proposal_probs, + actual_proposal_token_ids) = args + + assert torch.equal(actual_bonus_token_ids, + target_token_ids.reshape(batch_size, k + 1)[:, -1:]) + assert torch.equal( + actual_proposal_scores, + target_token_probs.reshape(batch_size, k + 1, -1)[:, :-1]) + assert torch.equal(actual_proposal_token_ids, proposal_token_ids) + assert torch.equal(actual_proposal_probs, proposal_probs) + + +@pytest.mark.parametrize('k', [1, 2, 6]) +@pytest.mark.parametrize('batch_size', [1, 2, 32]) +@torch.inference_mode() +def test_correctly_formats_output(k: int, batch_size: int): + """Verify SpecDecodeWorker formats sampler output correctly. + Everything else is mocked out. 
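+    The mocked rejection sampler marks rejected positions with -1; emitted
+    samples must match the accepted tokens for every sequence.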
+ """ + vocab_size = 32_000 + + draft_worker = mock_worker(cls=MultiStepWorker, vocab_size=vocab_size) + target_worker = mock_worker(vocab_size=vocab_size) + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + worker.init_model() + + proposal_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64, + device='cuda') + proposal_probs = torch.rand(batch_size, + k, + vocab_size, + dtype=torch.float32, + device='cuda') + + proposal_lens = torch.ones(batch_size, dtype=torch.int64, + device='cuda') * k + + execute_model_data, _, _ = create_batch(batch_size, k) + + draft_worker.get_spec_proposals.return_value = SpeculativeProposals( + proposal_token_ids=proposal_token_ids, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens) + + target_token_ids = torch.randint(low=0, + high=vocab_size, + size=(1, batch_size * (k + 1)), + dtype=torch.int64, + device='cuda') + target_token_probs = torch.rand(1, + batch_size * (k + 1), + vocab_size, + dtype=torch.float32, + device='cuda') + target_output = create_sampler_output_list(target_token_ids, + target_token_probs) + + target_worker.execute_model.return_value = target_output[0] + + rejection_sampler_output = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k + 1), + dtype=torch.int64, + device='cuda') + for i in range(batch_size): + minimum_accepted_tokens = 1 + rejection_sampler_output[i][ + -random.randint(minimum_accepted_tokens, k + 1):] = -1 + + rejection_sampler.return_value = rejection_sampler_output + + output = worker.execute_model(**execute_model_data.to_dict(), + num_spec_tokens=k) + + expected_output = create_sampler_output_list( + rejection_sampler_output.transpose(0, 1), [None for _ in range(k + 1)]) + + seq_ids = [ + next(iter(seq_group_metadata.seq_data.keys())) + for seq_group_metadata in execute_model_data.seq_group_metadata_list + ] + actual_output_by_seq = {seq_id: [] for seq_id in seq_ids} + expected_output_by_seq = {seq_id: [] for seq_id in seq_ids} + + for step in output: + for seq_group in step: + for sample in seq_group.samples: + seq_id = sample.parent_seq_id + actual_output_by_seq[seq_id].append(sample) + + for step in expected_output: + for seq_group in step: + for sample in seq_group.samples: + seq_id = sample.parent_seq_id + expected_output_by_seq[seq_id].append(sample) + + all_seen_seq_ids = set( + list(actual_output_by_seq.keys()) + + list(expected_output_by_seq.keys())) + for seq_id in all_seen_seq_ids: + actual_by_step = actual_output_by_seq[seq_id] + expected_by_step = expected_output_by_seq[seq_id] + + for i in range(k + 1): + if i >= len(actual_by_step): + assert expected_by_step[i].output_token == -1 + continue + assert actual_by_step[i].output_token == expected_by_step[ + i].output_token + assert actual_by_step[i].logprobs == expected_by_step[i].logprobs + + +@pytest.mark.parametrize('k', [1, 2]) +@pytest.mark.parametrize('batch_size', [1]) +@pytest.mark.parametrize('returns_metrics', [True, False]) +@torch.inference_mode() +def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool): + """Verify SpecDecodeWorker collects metrics. 
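+    The metrics collector is mocked to optionally return a metrics object;
+    whatever it returns must be attached to the sampler output.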
+ """ + vocab_size = 32_000 + + draft_worker = mock_worker(cls=MultiStepWorker, vocab_size=vocab_size) + target_worker = mock_worker(vocab_size=vocab_size) + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + worker.init_model() + + proposal_token_ids = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k), + dtype=torch.int64, + device='cuda') + proposal_probs = torch.rand(batch_size, + k, + vocab_size, + dtype=torch.float32, + device='cuda') + + proposal_lens = torch.ones(batch_size, dtype=torch.int64, + device='cuda') * k + + execute_model_data, _, _ = create_batch(batch_size, k) + + draft_worker.get_spec_proposals.return_value = SpeculativeProposals( + proposal_token_ids=proposal_token_ids, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens) + + target_token_ids = torch.randint(low=0, + high=vocab_size, + size=(1, batch_size * (k + 1)), + dtype=torch.int64, + device='cuda') + target_token_probs = torch.rand(1, + batch_size * (k + 1), + vocab_size, + dtype=torch.float32, + device='cuda') + target_output = create_sampler_output_list(target_token_ids, + target_token_probs) + + target_worker.execute_model.return_value = target_output[0] + + rejection_sampler_output = torch.randint(low=0, + high=vocab_size, + size=(batch_size, k + 1), + dtype=torch.int64, + device='cuda') + for i in range(batch_size): + minimum_accepted_tokens = 1 + rejection_sampler_output[i][ + -random.randint(minimum_accepted_tokens, k + 1):] = -1 + + rejection_sampler.return_value = rejection_sampler_output + + mock_rejsample_metrics = MagicMock( + spec=SpecDecodeWorkerMetrics) if returns_metrics else None + metrics_collector.maybe_collect_rejsample_metrics.return_value = ( + mock_rejsample_metrics) + + output = worker.execute_model(**execute_model_data.to_dict(), + num_spec_tokens=k) + assert output[0].spec_decode_worker_metrics == mock_rejsample_metrics + + call_args_list = ( + metrics_collector.maybe_collect_rejsample_metrics.call_args_list) + assert len(call_args_list) == 1 + args, kwargs = call_args_list[0] + assert args[0] == k or kwargs.get('k', -1) == k + + +@pytest.mark.parametrize('k', [0]) +@pytest.mark.parametrize('batch_size', [1, 2, 32]) +@torch.inference_mode() +def test_k_equals_zero(k: int, batch_size: int): + """Verify that the SpecDecodeWorker calls the draft and target workers + when k is zero. This happens during prefill. 
+ """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + execute_model_data, prompts, prev_output_tokens = create_batch( + batch_size, k, prev_output_token_len=0) + + out = worker.execute_model(**execute_model_data.to_dict(), + num_spec_tokens=k) + + assert len(out) == 1, f"expected only one token output when {k=}" + assert out[0].probs is None, "expect gpu tensor references to be None" + assert out[ + 0].sampled_tokens is None, "expect gpu tensor references to be None" + + draft_worker.execute_model.assert_called_once_with( + **execute_model_data.to_dict(), return_python_output=False) + target_worker.execute_model.assert_called_once_with( + **execute_model_data.to_dict()) + + +@pytest.mark.parametrize('k', [0, 5]) +@pytest.mark.parametrize('batch_size', [0]) +@torch.inference_mode() +def test_empty_input_batch(k: int, batch_size: int): + """Verify that the SpecDecodeWorker calls the draft and target workers + when the input batch is empty. This can happen if the engine communicates + to the workers information without scheduling a batch. + """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + draft_worker.device = 'cuda' + target_worker.device = 'cuda' + + set_random_seed(1) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + execute_model_data, prompts, prev_output_tokens = create_batch( + batch_size, k, prev_output_token_len=0) + + out = worker.execute_model(**execute_model_data.to_dict(), + num_spec_tokens=k) + + assert len(out) == 1, f"expected only one token output when {k=}" + assert out[0].probs is None, "expect gpu tensor references to be None" + assert out[ + 0].sampled_tokens is None, "expect gpu tensor references to be None" + + draft_worker.execute_model.assert_called_once_with( + **execute_model_data.to_dict(), return_python_output=False) + target_worker.execute_model.assert_called_once_with( + **execute_model_data.to_dict()) + + +@torch.inference_mode() +def test_init_model(): + """Verify SpecDecodeWorker invokes proposer/scorer worker init_model, as + well as other GPU initialization. + """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + worker.init_model() + + draft_worker.init_model.assert_called_once() + + target_worker.init_model.assert_called_once() + + metrics_collector.init_gpu_tensors.assert_called_once() + rejection_sampler.init_gpu_tensors.assert_called_once() + + +@torch.inference_mode() +def test_init_cache_engine(): + """Verify SpecDecodeWorker invokes init_cache_engine on proposer/scorer + workers. 
+ """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + cache_config = MagicMock() + + worker.init_cache_engine(cache_config) + + draft_worker.init_cache_engine.assert_called_once_with(cache_config) + target_worker.init_cache_engine.assert_called_once_with(cache_config) + + +@pytest.mark.parametrize('available_gpu_blocks', [1, 1024]) +@pytest.mark.parametrize('available_cpu_blocks', [500]) +@pytest.mark.parametrize('target_cache_block_size_bytes', [2 * 2 * 4096]) +@pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) +@torch.inference_mode() +def test_profile_num_available_blocks(available_gpu_blocks: int, + available_cpu_blocks: int, + target_cache_block_size_bytes: int, + draft_kv_size_bytes: int): + """Verify SpecDecodeWorker correctly profiles num available GPU blocks. + Specifically, it should run profiling in the scorer worker, and then evenly + split the blocks between proposer and scorer worker. + """ + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + rejection_sampler = MagicMock(spec=RejectionSampler) + rejection_sampler.token_id_dtype = torch.int64 + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + + target_worker.profile_num_available_blocks.return_value = ( + available_gpu_blocks, available_cpu_blocks) + target_worker.get_cache_block_size_bytes.return_value = ( + target_cache_block_size_bytes) + draft_worker.get_cache_block_size_bytes.return_value = draft_kv_size_bytes + + worker = SpecDecodeWorker(draft_worker, target_worker, rejection_sampler, + metrics_collector) + + # These values do not directly impact the adjusted block size calculation, + # so they can be fixed. + gpu_memory_utilization = 0.9 + cpu_swap_space = 100 + block_size = 16 + + num_gpu_blocks, num_cpu_blocks = worker.profile_num_available_blocks( + block_size, gpu_memory_utilization, cpu_swap_space, cache_dtype="auto") + + target_worker.profile_num_available_blocks.assert_called_once_with( + block_size, gpu_memory_utilization, cpu_swap_space, "auto") + assert num_cpu_blocks == available_cpu_blocks + + assert num_gpu_blocks == split_num_cache_blocks_evenly( + target_cache_block_size_bytes, draft_kv_size_bytes, + available_gpu_blocks) + + +@pytest.mark.parametrize('available_gpu_blocks', + list(range(20)) + [1024, 1024**2]) +@pytest.mark.parametrize('target_cache_block_size_bytes', + [2 * 2 * 4096, 2 * 2 * 8192]) +@pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) +@torch.inference_mode() +def test_split_num_cache_blocks_evenly(available_gpu_blocks: int, + target_cache_block_size_bytes: int, + draft_kv_size_bytes: int): + """Verify split_num_cache_blocks_evenly does not exceed original memory + allocation in bytes. 
+ """ + num_blocks = split_num_cache_blocks_evenly(target_cache_block_size_bytes, + draft_kv_size_bytes, + available_gpu_blocks) + assert (num_blocks * target_cache_block_size_bytes) + ( + num_blocks * draft_kv_size_bytes) <= (available_gpu_blocks * + target_cache_block_size_bytes) diff --git a/tests/spec_decode/test_utils.py b/tests/spec_decode/test_utils.py new file mode 100644 index 000000000000..19833ddb0615 --- /dev/null +++ b/tests/spec_decode/test_utils.py @@ -0,0 +1,111 @@ +from vllm.spec_decode.util import get_all_seq_ids +from vllm.sequence import SequenceGroupMetadata +from vllm.spec_decode.util import split_batch_by_proposal_len + +import pytest +from unittest.mock import MagicMock + + +def test_get_all_seq_ids(): + """Verify get_all_seq_ids extracts all seq ids. + """ + expected_seq_ids = list(range(10)) + list(range(100, 110)) + + seq_group_metadata_list = [ + SequenceGroupMetadata( + request_id=str(seq_id), + is_prompt=True, + seq_data={ + seq_id: MagicMock(), + }, + sampling_params=MagicMock(), + block_tables={ + seq_id: MagicMock(), + }, + lora_request=None, + ) for seq_id in expected_seq_ids + ] + + actual_seq_ids = get_all_seq_ids(seq_group_metadata_list) + assert actual_seq_ids == expected_seq_ids + + +@pytest.fixture +def fake_sequence_group_metadata(): + seq_ids = list(range(3)) + return [ + SequenceGroupMetadata( + request_id=str(i), + is_prompt=True, + seq_data={ + i: MagicMock(), + }, + sampling_params=MagicMock(), + block_tables={ + i: MagicMock(), + }, + lora_request=None, + ) for i in seq_ids + ] + + +def test_filter_zero_length_proposals(fake_sequence_group_metadata): + proposal_lens = [0, 1, 0] + filtered_groups, indices = split_batch_by_proposal_len( + fake_sequence_group_metadata, + proposal_lens, + select_proposal_len_zero=True) + + expected_groups = [ + fake_sequence_group_metadata[0], fake_sequence_group_metadata[2] + ] + expected_indices = [0, 2] + + assert filtered_groups == expected_groups + assert indices == expected_indices + + +def test_filter_non_zero_length_proposals(fake_sequence_group_metadata): + proposal_lens = [0, 1, 2] + filtered_groups, indices = split_batch_by_proposal_len( + fake_sequence_group_metadata, + proposal_lens, + select_proposal_len_zero=False) + + expected_groups = [ + fake_sequence_group_metadata[1], fake_sequence_group_metadata[2] + ] + expected_indices = [1, 2] + + assert filtered_groups == expected_groups + assert indices == expected_indices + + +def test_empty_inputs(): + filtered_groups, indices = split_batch_by_proposal_len( + [], [], select_proposal_len_zero=True) + + assert filtered_groups == [] + assert indices == [] + + +def test_all_zero_with_non_zero_filter(fake_sequence_group_metadata): + proposal_lens = [0, 0, 0] + filtered_groups, indices = split_batch_by_proposal_len( + fake_sequence_group_metadata, + proposal_lens, + select_proposal_len_zero=False) + + assert filtered_groups == [] + assert indices == [] + + +def test_all_non_zero_with_zero_filter(fake_sequence_group_metadata): + proposal_lens = [1, 1, 1] + filtered_groups, indices = split_batch_by_proposal_len( + fake_sequence_group_metadata, + proposal_lens, + select_proposal_len_zero=True) + + assert filtered_groups == [] + assert indices == [] diff --git a/tests/worker/spec_decode/utils.py b/tests/spec_decode/utils.py similarity index 60% rename from tests/worker/spec_decode/utils.py rename to tests/spec_decode/utils.py index fa8767cf898a..997093988c0e 100644 --- a/tests/worker/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -1,13 +1,16 @@ import 
torch -from typing import List, Optional, Dict +from typing import List, Optional, Dict, Iterable, Union +from unittest.mock import MagicMock from vllm.worker.worker import Worker from vllm.utils import get_distributed_init_method, get_ip, get_open_port from vllm.engine.arg_utils import EngineArgs -from vllm.sequence import Logprob, SequenceGroupMetadata, SequenceData +from vllm.sequence import (Logprob, SequenceGroupMetadata, SequenceData, + SamplerOutput, SequenceGroupOutput, SequenceOutput) from vllm.sampling_params import SamplingParams from vllm.worker.cache_engine import CacheEngine from vllm.model_executor.utils import set_random_seed +from itertools import count from dataclasses import dataclass, fields @@ -24,6 +27,11 @@ def to_dict(self): return dict( (field.name, getattr(self, field.name)) for field in fields(self)) + @classmethod + def from_dict(cls, d): + cleaned = dict((field.name, d[field.name]) for field in fields(cls)) + return cls(**cleaned) + def round_up_to_next_block(seq_len: int, block_size: int) -> int: return (seq_len + block_size - 1) // block_size @@ -50,6 +58,21 @@ def create_execute_model_data( ) +def mock_worker(cls=None, + vocab_size: int = 30_000, + max_model_len: int = 2048, + rank: int = 0) -> MagicMock: + if cls is None: + cls = Worker + + worker = MagicMock(spec=cls) + worker.vocab_size = vocab_size + worker.max_model_len = max_model_len + worker.rank = rank + worker.device = 'cuda:0' + return worker + + def patch_execute_model_with_seeds(worker: Worker, rand_seeds: List[int]): seed_iter = iter(rand_seeds) original_execute_model = worker.execute_model @@ -117,25 +140,12 @@ def create_seq_group_metadata_from_prompts( block_size: int, final_seq_lens: List[int], continuations: Optional[List[List[int]]] = None, - num_tokens_processed: Optional[List[int]] = None, seq_ids: Optional[List[int]] = None, ) -> List[SequenceGroupMetadata]: if continuations is None: continuations = [[] for _ in prompts] - if num_tokens_processed is None: - # Default to 1 token missing from kv cache for generation sequences. - num_tokens_processed = [] - for continuation, prompt in zip(continuations, prompts): - # If prefill, then default to zero tokens processed. - if not continuation: - num_tokens_processed.append(0) - else: - # If generation, then default to all but one tokens processed. 
- num_tokens_processed.append( - len(continuation) + len(prompt) - 1) - if seq_ids is None: seq_ids = list(i for i, _ in enumerate(prompts)) @@ -155,13 +165,15 @@ def create_seq_group_metadata_from_prompts( is_prompt=len(cont_token_ids) == 0, seq_data={ i: - SequenceData(prompt_token_ids=prompt_token_ids[:] + - cont_token_ids[:]) + SequenceData( + prompt_token_ids=prompt_token_ids[:], + output_token_ids=cont_token_ids[:], + ), }, sampling_params=SamplingParams(temperature=0.0, ), block_tables={i: block_allocations[i][:]}, - ) for i, (prompt_token_ids, cont_token_ids, num_tokens_saved) in - enumerate(zip(prompts, continuations, num_tokens_processed)) + ) for i, (prompt_token_ids, + cont_token_ids) in enumerate(zip(prompts, continuations)) ] @@ -178,3 +190,68 @@ def assert_logprobs_dict_allclose( expected = torch.tensor( single_step_expected_logprobs[token_id].logprob) assert torch.allclose(actual, expected) + + +def create_sampler_output_list( + token_ids: torch.Tensor, + probs: Iterable[Optional[torch.Tensor]], + seq_ids: Optional[List[int]] = None) -> List[SamplerOutput]: + num_steps, batch_size = token_ids.shape + token_ids_by_step = token_ids.tolist() + + if seq_ids is None: + seq_ids = list(range(batch_size)) + + return [ + SamplerOutput(outputs=[ + SequenceGroupOutput( + samples=[ + SequenceOutput( + output_token=token_id, + parent_seq_id=seq_ids[seq_index], + logprobs={token_id: 0}, + ) + ], + prompt_logprobs=None, + ) for seq_index, token_id in enumerate(token_ids_by_step[step]) + ], + sampled_token_probs=probs[step], + sampled_token_ids=token_ids[step]) + for step in range(num_steps) + ] + + +def create_batch(batch_size, + k, + prompt_len: Union[int, List[int]] = 10, + prev_output_token_len: int = 10, + seq_ids: Optional[List[int]] = None, + num_gpu_blocks: Optional[int] = None, + block_size: Optional[int] = None): + if block_size is None: + block_size = 8 + + if num_gpu_blocks is None: + num_gpu_blocks = 2048 // block_size + + iterator = count() + + if isinstance(prompt_len, int): + prompt_lens = [prompt_len for _ in range(batch_size)] + else: + prompt_lens = prompt_len + + prompts = [[next(iterator) for _ in range(p_len)] for p_len in prompt_lens] + prev_output_tokens = [[ + next(iterator) for _ in range(prev_output_token_len) + ] for _ in range(batch_size)] + final_seq_lens = [ + len(prompt) + len(prev_output_token) + k + 1 + for prompt, prev_output_token in zip(prompts, prev_output_tokens) + ] + + execute_model_data = create_execute_model_data( + create_seq_group_metadata_from_prompts(prompts, num_gpu_blocks, + block_size, final_seq_lens, + prev_output_tokens, seq_ids), ) + return execute_model_data, prompts, prev_output_tokens diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 000000000000..13a9f7621267 --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,43 @@ +from vllm.config import ModelConfig + + +def test_get_sliding_window(): + TEST_SLIDING_WINDOW = 4096 + # Test that the sliding window is correctly computed. + # For Qwen1.5/Qwen2, get_sliding_window() should be None + # when use_sliding_window is False. 
+ qwen2_model_config = ModelConfig( + "Qwen/Qwen1.5-7B", + "Qwen/Qwen1.5-7B", + tokenizer_mode="auto", + trust_remote_code=False, + download_dir=None, + load_format="dummy", + seed=0, + dtype="float16", + revision=None, + ) + + qwen2_model_config.hf_config.use_sliding_window = False + qwen2_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW + assert qwen2_model_config.get_sliding_window() is None + + qwen2_model_config.hf_config.use_sliding_window = True + assert qwen2_model_config.get_sliding_window() == TEST_SLIDING_WINDOW + + mistral_model_config = ModelConfig( + "mistralai/Mistral-7B-v0.1", + "mistralai/Mistral-7B-v0.1", + tokenizer_mode="auto", + trust_remote_code=False, + download_dir=None, + load_format="dummy", + seed=0, + dtype="float16", + revision=None, + ) + mistral_model_config.hf_config.sliding_window = None + assert mistral_model_config.get_sliding_window() is None + + mistral_model_config.hf_config.sliding_window = TEST_SLIDING_WINDOW + assert mistral_model_config.get_sliding_window() == TEST_SLIDING_WINDOW \ No newline at end of file diff --git a/tests/test_sequence.py b/tests/test_sequence.py new file mode 100644 index 000000000000..e18df059d770 --- /dev/null +++ b/tests/test_sequence.py @@ -0,0 +1,50 @@ +import pytest + +from vllm.sequence import SequenceGroupOutput, SamplerOutput, SequenceOutput + + +@pytest.fixture +def sample_outputs(): + return [ + SequenceGroupOutput(samples=[ + SequenceOutput(parent_seq_id=0, output_token=i, logprobs={}) + ], + prompt_logprobs=None) for i in range(5) + ] + + +@pytest.fixture +def sampler_output(sample_outputs): + return SamplerOutput(outputs=sample_outputs) + + +def test_sampler_output_initialization(sampler_output, sample_outputs): + assert len(sampler_output) == len(sample_outputs) + assert sampler_output.sampled_token_probs is None + assert sampler_output.sampled_token_ids is None + assert sampler_output.spec_decode_worker_metrics is None + + +def test_sampler_output_getitem(sampler_output, sample_outputs): + assert sampler_output[2] == sample_outputs[2] + + +def test_sampler_output_setitem(sampler_output): + new_output = SequenceGroupOutput(samples=[ + SequenceOutput(parent_seq_id=0, output_token=99, logprobs={}) + ], + prompt_logprobs=None) + sampler_output[2] = new_output + assert sampler_output[2] == new_output + + +def test_sampler_output_len(sampler_output, sample_outputs): + assert len(sampler_output) == len(sample_outputs) + + +def test_sampler_output_eq(sample_outputs): + sampler_output1 = SamplerOutput(outputs=sample_outputs) + sampler_output2 = SamplerOutput(outputs=sample_outputs.copy()) + sampler_output3 = SamplerOutput(outputs=sample_outputs[:-1]) + assert sampler_output1 == sampler_output2 + assert sampler_output1 != sampler_output3 diff --git a/tests/tokenization/__init__.py b/tests/tokenization/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/tokenization/test_cached_tokenizer.py b/tests/tokenization/test_cached_tokenizer.py new file mode 100644 index 000000000000..181e80032512 --- /dev/null +++ b/tests/tokenization/test_cached_tokenizer.py @@ -0,0 +1,20 @@ +from copy import deepcopy +from vllm.transformers_utils.tokenizer import get_cached_tokenizer +from transformers import AutoTokenizer + + +def test_cached_tokenizer(): + reference_tokenizer = AutoTokenizer.from_pretrained("gpt2") + reference_tokenizer.add_special_tokens({"cls_token": ""}) + reference_tokenizer.add_special_tokens( + {"additional_special_tokens": [""]}) + cached_tokenizer = 
get_cached_tokenizer(deepcopy(reference_tokenizer)) + + assert reference_tokenizer.encode("prompt") == cached_tokenizer.encode( + "prompt") + assert set(reference_tokenizer.all_special_ids) == set( + cached_tokenizer.all_special_ids) + assert set(reference_tokenizer.all_special_tokens) == set( + cached_tokenizer.all_special_tokens) + assert set(reference_tokenizer.all_special_tokens_extended) == set( + cached_tokenizer.all_special_tokens_extended) diff --git a/tests/engine/test_detokenize.py b/tests/tokenization/test_detokenize.py similarity index 100% rename from tests/engine/test_detokenize.py rename to tests/tokenization/test_detokenize.py diff --git a/tests/tokenization/test_tokenizer_group.py b/tests/tokenization/test_tokenizer_group.py new file mode 100644 index 000000000000..d0788ee87563 --- /dev/null +++ b/tests/tokenization/test_tokenizer_group.py @@ -0,0 +1,100 @@ +import os +import pytest +import asyncio +from unittest.mock import patch + +from transformers import AutoTokenizer, PreTrainedTokenizerBase +from vllm.transformers_utils.tokenizer_group import get_tokenizer_group +from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import ( + RayTokenizerGroupPool) +from vllm.transformers_utils.tokenizer_group.tokenizer_group import ( + TokenizerGroup) +from ..conftest import get_tokenizer_pool_config + + +@pytest.mark.asyncio +@pytest.mark.parametrize("tokenizer_group_type", [None, "ray"]) +async def test_tokenizer_group(tokenizer_group_type): + reference_tokenizer = AutoTokenizer.from_pretrained("gpt2") + tokenizer_group = get_tokenizer_group( + get_tokenizer_pool_config(tokenizer_group_type), + tokenizer_id="gpt2", + enable_lora=False, + max_num_seqs=1, + max_input_length=None, + ) + assert reference_tokenizer.encode("prompt") == tokenizer_group.encode( + request_id="request_id", prompt="prompt", lora_request=None) + assert reference_tokenizer.encode( + "prompt") == await tokenizer_group.encode_async( + request_id="request_id", prompt="prompt", lora_request=None) + assert isinstance(tokenizer_group.get_lora_tokenizer(None), + PreTrainedTokenizerBase) + assert tokenizer_group.get_lora_tokenizer( + None) == await tokenizer_group.get_lora_tokenizer_async(None) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("tokenizer_group_type", ["ray"]) +async def test_tokenizer_group_pool(tokenizer_group_type): + reference_tokenizer = AutoTokenizer.from_pretrained("gpt2") + tokenizer_group_pool = get_tokenizer_group( + get_tokenizer_pool_config(tokenizer_group_type), + tokenizer_id="gpt2", + enable_lora=False, + max_num_seqs=1, + max_input_length=None, + ) + # Send multiple requests to the tokenizer group pool + # (more than the pool size) + # and check that all requests are processed correctly. 
+ num_requests = tokenizer_group_pool.pool_size * 5 + requests = [ + tokenizer_group_pool.encode_async(request_id=str(i), + prompt=f"prompt {i}", + lora_request=None) + for i in range(num_requests) + ] + results = await asyncio.gather(*requests) + expected_results = [ + reference_tokenizer.encode(f"prompt {i}") for i in range(num_requests) + ] + assert results == expected_results + + +@pytest.mark.asyncio +@pytest.mark.parametrize("tokenizer_group_type", ["ray"]) +async def test_tokenizer_group_ray_pool_env_var_propagation( + tokenizer_group_type): + """Test that env vars from caller process are propagated to + tokenizer Ray actors.""" + env_var = "MY_ENV_VAR" + + class EnvVarCheckerTokenizerGroup(TokenizerGroup): + + def ping(self): + assert os.environ.get(env_var) == "1" + return super().ping() + + class EnvVarCheckerRayTokenizerGroupPool(RayTokenizerGroupPool): + _worker_cls = EnvVarCheckerTokenizerGroup + + tokenizer_pool_config = get_tokenizer_pool_config(tokenizer_group_type) + tokenizer_pool = EnvVarCheckerRayTokenizerGroupPool.from_config( + tokenizer_pool_config, + tokenizer_id="gpt2", + enable_lora=False, + max_num_seqs=1, + max_input_length=None) + with pytest.raises(AssertionError): + tokenizer_pool.ping() + + with patch.dict(os.environ, {env_var: "1"}): + tokenizer_pool_config = get_tokenizer_pool_config(tokenizer_group_type) + tokenizer_pool = EnvVarCheckerRayTokenizerGroupPool.from_config( + tokenizer_pool_config, + tokenizer_id="gpt2", + enable_lora=False, + max_num_seqs=1, + max_input_length=None) + tokenizer_pool.ping() diff --git a/vllm/__init__.py b/vllm/__init__.py index f1e30f5eb6e6..5e40c3c20fcd 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -3,7 +3,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.llm_engine import LLMEngine -from vllm.engine.ray_utils import initialize_cluster +from vllm.engine.ray_utils import initialize_ray_cluster from vllm.entrypoints.llm import LLM from vllm.outputs import CompletionOutput, RequestOutput from vllm.sampling_params import SamplingParams @@ -19,5 +19,5 @@ "EngineArgs", "AsyncLLMEngine", "AsyncEngineArgs", - "initialize_cluster", + "initialize_ray_cluster", ] diff --git a/vllm/config.py b/vllm/config.py index ef9a920f29c2..f792e8909524 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1,8 +1,9 @@ -from typing import Optional, Union, ClassVar +from typing import TYPE_CHECKING, Optional, Union, ClassVar from dataclasses import dataclass import os from packaging.version import Version +import json import torch from transformers import PretrainedConfig @@ -10,6 +11,9 @@ from vllm.transformers_utils.config import get_config from vllm.utils import get_cpu_memory, is_hip, is_neuron, get_nvcc_cuda_version +if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + logger = init_logger(__name__) _GB = 1 << 30 @@ -45,7 +49,7 @@ class ModelConfig: a tag name, or a commit id. If unspecified, will use the default version. code_revision: The specific revision to use for the model code on - Hugging Face Hub. It can be a branch name, a tag name, or a + Hugging Face Hub. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. tokenizer_revision: The specific tokenizer version to use. It can be a branch name, a tag name, or a commit id. 
If unspecified, will use @@ -100,6 +104,7 @@ def __init__( # download model from ModelScope hub, # lazy import so that modelscope is not required for normal use. from modelscope.hub.snapshot_download import snapshot_download # pylint: disable=C + if not os.path.exists(model): model_path = snapshot_download(model_id=model, cache_dir=download_dir, @@ -136,7 +141,7 @@ def _verify_load_format(self) -> None: if (f not in rocm_not_supported_load_format) ] raise ValueError( - f"load format \'{load_format}\' is not supported in ROCm. " + f"load format '{load_format}' is not supported in ROCm. " f"Supported load format are " f"{rocm_supported_load_format}") @@ -165,13 +170,18 @@ def _verify_quantization(self) -> None: # Parse quantization method from the HF model config, if available. hf_quant_config = getattr(self.hf_config, "quantization_config", None) if hf_quant_config is not None: - hf_quant_method = str(hf_quant_config["quant_method"]).lower() + # If the GPTQ model is serialized in marlin format, use marlin. if (hf_quant_method == "gptq" and "is_marlin_format" in hf_quant_config and hf_quant_config["is_marlin_format"]): + logger.info("The model is serialized in Marlin format. " + "Using Marlin kernel.") hf_quant_method = "marlin" + if self.quantization == "gptq": + self.quantization = hf_quant_method + if self.quantization is None: self.quantization = hf_quant_method elif self.quantization != hf_quant_method: @@ -189,8 +199,8 @@ def _verify_quantization(self) -> None: if is_hip( ) and self.quantization in rocm_not_supported_quantization: raise ValueError( - f"{self.quantization} quantization is currently not supported " - f"in ROCm.") + f"{self.quantization} quantization is currently not " + f"supported in ROCm.") if self.quantization != "marlin": logger.warning( f"{self.quantization} quantization is not fully " @@ -224,6 +234,15 @@ def verify_with_parallel_config( f"({pipeline_parallel_size}).") def get_sliding_window(self) -> Optional[int]: + """Get the sliding window size, or None if disabled. + """ + + # Some models, like Qwen2 and Qwen1.5, use `use_sliding_window` in + # addition to sliding window size. We check if that field is present + # and if it's False, return None. + if (hasattr(self.hf_config, "use_sliding_window") + and not self.hf_config.use_sliding_window): + return None return getattr(self.hf_config, "sliding_window", None) def get_vocab_size(self) -> int: @@ -321,7 +340,8 @@ def __init__( self.num_cpu_blocks = None def metrics_info(self): - # convert cache_config to dict(key: str, value: str) for prometheus metrics info + # convert cache_config to dict(key: str, value: str) for prometheus + # metrics info return {key: str(value) for key, value in self.__dict__.items()} def _verify_args(self) -> None: @@ -370,6 +390,58 @@ def verify_with_parallel_config( logger.warning("Possibly too large swap space. " + msg) +@dataclass +class TokenizerPoolConfig: + """Configuration for the tokenizer pool. + + Args: + pool_size: Number of tokenizer workers in the pool. + pool_type: Type of the pool. + extra_config: Additional config for the pool. + The way the config will be used depends on the + pool type. 
+ """ + pool_size: int + pool_type: str + extra_config: dict + + def __post_init__(self): + if self.pool_type not in ("ray", ): + raise ValueError(f"Unknown pool type: {self.pool_type}") + if not isinstance(self.extra_config, dict): + raise ValueError("extra_config must be a dictionary.") + + @classmethod + def create_config( + cls, tokenizer_pool_size: int, tokenizer_pool_type: str, + tokenizer_pool_extra_config: Optional[Union[str, dict]] + ) -> Optional["TokenizerPoolConfig"]: + """Create a TokenizerPoolConfig from the given parameters. + + If tokenizer_pool_size is 0, return None. + + Args: + tokenizer_pool_size: Number of tokenizer workers in the pool. + tokenizer_pool_type: Type of the pool. + tokenizer_pool_extra_config: Additional config for the pool. + The way the config will be used depends on the + pool type. This can be a JSON string (will be parsed). + """ + if tokenizer_pool_size: + if isinstance(tokenizer_pool_extra_config, str): + tokenizer_pool_extra_config_parsed = json.loads( + tokenizer_pool_extra_config) + else: + tokenizer_pool_extra_config_parsed = ( + tokenizer_pool_extra_config or {}) + tokenizer_pool_config = cls(tokenizer_pool_size, + tokenizer_pool_type, + tokenizer_pool_extra_config_parsed) + else: + tokenizer_pool_config = None + return tokenizer_pool_config + + class ParallelConfig: """Configuration for the distributed execution. @@ -384,6 +456,8 @@ class ParallelConfig: parallel and large models. disable_custom_all_reduce: Disable the custom all-reduce kernel and fall back to NCCL. + tokenizer_pool_config: Config for the tokenizer pool. + If None, will use synchronous tokenization. ray_workers_use_nsight: Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler. """ @@ -395,12 +469,15 @@ def __init__( worker_use_ray: bool, max_parallel_loading_workers: Optional[int] = None, disable_custom_all_reduce: bool = False, + tokenizer_pool_config: Optional[TokenizerPoolConfig] = None, ray_workers_use_nsight: bool = False, + placement_group: Optional["PlacementGroup"] = None, ) -> None: self.pipeline_parallel_size = pipeline_parallel_size if is_neuron(): - # For Neuron device support, here we assign TP=1 to avoid sharding within vLLM directly. - # Transformer-neuronx would take neuron_tp_degree attribute, and distribute the workload + # For Neuron device support, here we assign TP=1 to avoid sharding + # within vLLM directly. Transformer-neuronx would take + # neuron_tp_degree attribute, and distribute the workload # to multiple NeuronCores. self.tensor_parallel_size = 1 self.neuron_tp_degree = tensor_parallel_size @@ -409,7 +486,9 @@ def __init__( self.worker_use_ray = worker_use_ray self.max_parallel_loading_workers = max_parallel_loading_workers self.disable_custom_all_reduce = disable_custom_all_reduce + self.tokenizer_pool_config = tokenizer_pool_config self.ray_workers_use_nsight = ray_workers_use_nsight + self.placement_group = placement_group self.world_size = pipeline_parallel_size * self.tensor_parallel_size # Ray worker is not supported for Neuron backend. @@ -612,7 +691,7 @@ def _get_and_verify_dtype( k for k, v in _STR_DTYPE_TO_TORCH_DTYPE.items() if (k not in _ROCM_NOT_SUPPORTED_DTYPE) ] - raise ValueError(f"dtype \'{dtype}\' is not supported in ROCm. " + raise ValueError(f"dtype '{dtype}' is not supported in ROCm. " f"Supported dtypes are {rocm_supported_dtypes}") # Verify the dtype. 
diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 52b120f227ed..8b089a5650f4 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -95,13 +95,15 @@ def free(self, block: PhysicalTokenBlock) -> None: del self.cached_blocks[block.block_hash] def get_num_free_blocks(self) -> int: - return self.num_blocks - self.current_num_blocks + self.evictor.num_blocks + return (self.num_blocks - self.current_num_blocks + + self.evictor.num_blocks) def contains_block(self, block_hash: int) -> bool: return block_hash in self.cached_blocks or block_hash in self.evictor def update_hash(self, block_hash: int, block: PhysicalTokenBlock): - # If caching is enabled, update the hash of block and the cached_blocks dictionary. + # If caching is enabled, update the hash of block and the + # cached_blocks dictionary. if self.enable_caching: assert not self.contains_block(block_hash) old_hash = block.block_hash @@ -218,10 +220,12 @@ def _promote_last_block( seq: Sequence, last_block: PhysicalTokenBlock, ) -> PhysicalTokenBlock: - # Compute a new hash for the block so that it can be shared by other Sequences + # Compute a new hash for the block so that it can be shared by + # other Sequences new_hash = seq.hash_of_block(len(seq.logical_token_blocks) - 1) - # if new_hash is already in the cached table, then free last_block and return the cached version + # if new_hash is already in the cached table, then free last_block + # and return the cached version if self.gpu_allocator.contains_block(new_hash): self.gpu_allocator.free(last_block) return self.gpu_allocator.allocate(new_hash) @@ -289,7 +293,8 @@ def append_slot( assert last_block.device == Device.GPU if last_block.ref_count == 1: # Not shared with other sequences. Appendable. - # If the last block is now complete, promote it to a full block so that it can be shared + # If the last block is now complete, promote it to a full block so + # that it can be shared new_block = self._maybe_promote_last_block(seq, last_block) block_table[-1] = new_block return None @@ -307,7 +312,12 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: # Thus, it is always safe from OOM. src_block_table = self.block_tables[parent_seq.seq_id] self.block_tables[child_seq.seq_id] = src_block_table.copy() - for block in src_block_table: + # When using a sliding window, blocks will be eventually reused. + # In this case the block tables will contain repeated blocks. + # When forking, we must make sure that each block's `ref_count` + # is only incremented by one, so we deduplicate them by wrapping + # them in a set. + for block in set(src_block_table): block.ref_count += 1 def _get_physical_blocks( @@ -388,7 +398,15 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: return block_number_mapping def _free_block_table(self, block_table: BlockTable) -> None: - for block in set(block_table): + # when using a sliding window, each seq will only use up + # to `self.block_sliding_window` blocks. When freeing + # the block table, we must make sure to not free blocks more + # than once. If no sliding window is used, there is no block + # reuse in the block table, so we must free all blocks. 
+ blocks_to_free = (block_table[-self.block_sliding_window:] + if self.block_sliding_window is not None else + block_table) + for block in set(blocks_to_free): if block.device == Device.GPU: self.gpu_allocator.free(block) else: diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index b538ea574b60..1d81f5a97d71 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -39,9 +39,9 @@ def add(self, block: PhysicalTokenBlock): @abstractmethod def remove(self, block_hash: int) -> PhysicalTokenBlock: """Simply removes the block with the hash value block_hash from the - evictor. Caller is responsible for making sure that block_hash is contained - in the evictor before calling remove. Should be used to "bring back" blocks - that have been freed but not evicted yet. + evictor. Caller is responsible for making sure that block_hash is + contained in the evictor before calling remove. Should be used to + "bring back" blocks that have been freed but not evicted yet. """ pass diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index c96c6d62ef19..c3f93a2928df 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -160,7 +160,7 @@ def _schedule(self) -> SchedulerOutputs: blocks_to_copy: Dict[int, List[int]] = {} # Fix the current time. - now = time.monotonic() + now = time.time() # Join waiting sequences if possible. if not self.swapped: @@ -214,8 +214,8 @@ def _schedule(self) -> SchedulerOutputs: lora_int_id = 0 if self.lora_enabled: lora_int_id = seq_group.lora_int_id - if lora_int_id > 0 and lora_int_id not in curr_loras and len( - curr_loras) >= self.lora_config.max_loras: + if (lora_int_id > 0 and lora_int_id not in curr_loras + and len(curr_loras) >= self.lora_config.max_loras): # We don't have a space for another LoRA, so # we ignore this request for now. leftover_waiting_sequences.appendleft(seq_group) @@ -309,8 +309,8 @@ def _schedule(self) -> SchedulerOutputs: lora_int_id = 0 if self.lora_enabled: lora_int_id = seq_group.lora_int_id - if lora_int_id > 0 and lora_int_id not in curr_loras and len( - curr_loras) >= self.lora_config.max_loras: + if (lora_int_id > 0 and lora_int_id not in curr_loras + and len(curr_loras) >= self.lora_config.max_loras): # We don't have a space for another LoRA, so # we ignore this request for now. leftover_swapped.appendleft(seq_group) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c3dccdd5bb50..3e146d2e6c0c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -4,7 +4,8 @@ from typing import Optional, Tuple from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, - ParallelConfig, SchedulerConfig, LoRAConfig) + ParallelConfig, SchedulerConfig, LoRAConfig, + TokenizerPoolConfig) @dataclass @@ -40,6 +41,9 @@ class EngineArgs: enforce_eager: bool = False max_context_len_to_capture: int = 8192 disable_custom_all_reduce: bool = False + tokenizer_pool_size: int = 0 + tokenizer_pool_type: str = "ray" + tokenizer_pool_extra_config: Optional[dict] = None enable_lora: bool = False max_loras: int = 1 max_lora_rank: int = 16 @@ -249,6 +253,25 @@ def add_cli_args( action='store_true', default=EngineArgs.disable_custom_all_reduce, help='See ParallelConfig') + parser.add_argument('--tokenizer-pool-size', + type=int, + default=EngineArgs.tokenizer_pool_size, + help='Size of tokenizer pool to use for ' + 'asynchronous tokenization. 
If 0, will ' + 'use synchronous tokenization.') + parser.add_argument('--tokenizer-pool-type', + type=str, + default=EngineArgs.tokenizer_pool_type, + help='Type of tokenizer pool to use for ' + 'asynchronous tokenization. Ignored ' + 'if tokenizer_pool_size is 0.') + parser.add_argument('--tokenizer-pool-extra-config', + type=str, + default=EngineArgs.tokenizer_pool_extra_config, + help='Extra config for tokenizer pool. ' + 'This should be a JSON string that will be ' + 'parsed into a dictionary. Ignored if ' + 'tokenizer_pool_size is 0.') # LoRA related configs parser.add_argument('--enable-lora', action='store_true', @@ -312,14 +335,16 @@ def create_engine_configs( cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype, - model_config.get_sliding_window(), - self.enable_prefix_caching) - parallel_config = ParallelConfig(self.pipeline_parallel_size, - self.tensor_parallel_size, - self.worker_use_ray, - self.max_parallel_loading_workers, - self.disable_custom_all_reduce, - self.ray_workers_use_nsight) + model_config.get_sliding_window()) + parallel_config = ParallelConfig( + self.pipeline_parallel_size, self.tensor_parallel_size, + self.worker_use_ray, self.max_parallel_loading_workers, + self.disable_custom_all_reduce, + TokenizerPoolConfig.create_config( + self.tokenizer_pool_size, + self.tokenizer_pool_type, + self.tokenizer_pool_extra_config, + ), self.ray_workers_use_nsight) scheduler_config = SchedulerConfig(self.max_num_batched_tokens, self.max_num_seqs, model_config.max_model_len, diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 65ab0c063417..8bcd1e0ede6e 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -2,14 +2,16 @@ import os import time from functools import partial -from typing import (Any, Dict, Iterable, List, Optional, Set, Tuple, Type, - Union, AsyncIterator, Callable) +from typing import (Callable, Dict, Iterable, List, Optional, Set, Tuple, Type, + Union, AsyncIterator) + +from transformers import PreTrainedTokenizer from vllm.lora.request import LoRARequest from vllm.config import ModelConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.llm_engine import LLMEngine -from vllm.engine.ray_utils import initialize_cluster, ray +from vllm.engine.ray_utils import initialize_ray_cluster, ray from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams @@ -206,17 +208,10 @@ async def step_async(self) -> List[RequestOutput]: if not scheduler_outputs.is_empty(): # Execute the model. - all_outputs = await self._run_workers_async( - "execute_model", - driver_kwargs={ - "seq_group_metadata_list": seq_group_metadata_list, - "blocks_to_swap_in": scheduler_outputs.blocks_to_swap_in, - "blocks_to_swap_out": scheduler_outputs.blocks_to_swap_out, - "blocks_to_copy": scheduler_outputs.blocks_to_copy, - }) - - # Only the driver worker returns the sampling results. 
- output = all_outputs[0] + output = await self.model_executor.execute_model_async( + seq_group_metadata_list, scheduler_outputs.blocks_to_swap_in, + scheduler_outputs.blocks_to_swap_out, + scheduler_outputs.blocks_to_copy) else: output = [] @@ -266,37 +261,8 @@ async def add_request_async( lora_request=lora_request, ) - async def _run_workers_async( - self, - method: str, - *args, - driver_args: Optional[List[Any]] = None, - driver_kwargs: Optional[Dict[str, Any]] = None, - **kwargs, - ) -> Any: - """Runs the given method on all workers.""" - coros = [] - - if driver_args is None: - driver_args = args - if driver_kwargs is None: - driver_kwargs = kwargs - - # Run the driver worker asynchronously. - driver_executor = getattr(self.driver_worker, method) - coros.append(asyncio.get_event_loop().run_in_executor( - None, partial(driver_executor, *driver_args, **driver_kwargs))) - - # Run the ray workers asynchronously. - for worker in self.workers: - coros.append(worker.execute_method.remote(method, *args, **kwargs)) - - all_outputs = await asyncio.gather(*coros) - return all_outputs - - async def check_health_async(self): - """Raises an error if engine is unhealthy.""" - self._check_if_any_actor_is_dead() + async def check_health_async(self) -> None: + self.model_executor.check_health() class AsyncLLMEngine: @@ -351,6 +317,34 @@ def __init__(self, self._request_tracker: Optional[RequestTracker] = None self._errored_with: Optional[BaseException] = None + @classmethod + def from_engine_args(cls, + engine_args: AsyncEngineArgs, + start_engine_loop: bool = True) -> "AsyncLLMEngine": + """Creates an async LLM engine from the engine arguments.""" + # Create the engine configs. + engine_configs = engine_args.create_engine_configs() + parallel_config = engine_configs[2] + if parallel_config.worker_use_ray or engine_args.engine_use_ray: + initialize_ray_cluster(parallel_config) + from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync + executor_class = RayGPUExecutorAsync + else: + assert parallel_config.world_size == 1, ( + "Ray is required if parallel_config.world_size > 1.") + from vllm.executor.gpu_executor import GPUExecutorAsync + executor_class = GPUExecutorAsync + # Create the async LLM engine. + engine = cls(parallel_config.worker_use_ray, + engine_args.engine_use_ray, + *engine_configs, + executor_class, + log_requests=not engine_args.disable_log_requests, + log_stats=not engine_args.disable_log_stats, + max_log_len=engine_args.max_log_len, + start_engine_loop=start_engine_loop) + return engine + @property def is_running(self) -> bool: return (self.background_loop is not None @@ -372,8 +366,11 @@ def _error_callback(self, exc: Exception) -> None: self.set_errored(exc) self._request_tracker.propagate_exception(exc) - def get_tokenizer(self): - return self.engine.tokenizer.tokenizer + async def get_tokenizer(self) -> "PreTrainedTokenizer": + if self.engine_use_ray: + return await self.engine.get_tokenizer.remote() + else: + return self.engine.get_tokenizer() def start_background_loop(self) -> None: """Start the background loop.""" @@ -607,8 +604,7 @@ async def generate( >>> ... """ # Preprocess the request. - # This should not be used for logging, as it is monotonic time. 
- arrival_time = time.monotonic() + arrival_time = time.time() try: stream = await self.add_request( @@ -665,35 +661,13 @@ async def get_model_config(self) -> ModelConfig: else: return self.engine.get_model_config() - @classmethod - def from_engine_args(cls, - engine_args: AsyncEngineArgs, - start_engine_loop: bool = True) -> "AsyncLLMEngine": - """Creates an async LLM engine from the engine arguments.""" - # Create the engine configs. - engine_configs = engine_args.create_engine_configs() - parallel_config = engine_configs[2] - # Initialize the cluster. - placement_group = initialize_cluster(parallel_config, - engine_args.engine_use_ray) - # Create the async LLM engine. - engine = cls(parallel_config.worker_use_ray, - engine_args.engine_use_ray, - *engine_configs, - placement_group, - log_requests=not engine_args.disable_log_requests, - log_stats=not engine_args.disable_log_stats, - max_log_len=engine_args.max_log_len, - start_engine_loop=start_engine_loop) - return engine - async def do_log_stats(self) -> None: if self.engine_use_ray: await self.engine.do_log_stats.remote() else: self.engine.do_log_stats() - async def check_health(self): + async def check_health(self) -> None: """Raises an error if engine is unhealthy.""" t = time.perf_counter() logger.debug("Starting health check...") diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 8484014c9a13..71798ab7d17c 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1,11 +1,7 @@ -import copy -from collections import defaultdict -import os import time -import pickle -import importlib -from typing import (TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, - Union) +from typing import Dict, Iterable, List, Optional, Tuple, Type, Union + +from transformers import PreTrainedTokenizer import vllm from vllm.lora.request import LoRARequest @@ -13,38 +9,22 @@ ParallelConfig, SchedulerConfig, LoRAConfig) from vllm.core.scheduler import Scheduler, SchedulerOutputs from vllm.engine.arg_utils import EngineArgs +from vllm.executor.executor_base import ExecutorBase from vllm.engine.metrics import StatLogger, Stats -from vllm.engine.ray_utils import RayWorkerVllm, initialize_cluster, ray +from vllm.engine.ray_utils import initialize_ray_cluster from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams from vllm.sequence import (Logprob, SamplerOutput, Sequence, SequenceGroup, SequenceGroupOutput, SequenceOutput, SequenceStatus) -from vllm.transformers_utils.tokenizer import (detokenize_incrementally, - TokenizerGroup) -from vllm.utils import (Counter, set_cuda_visible_devices, get_ip, - get_open_port, get_distributed_init_method) - -if ray: - from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy - -if TYPE_CHECKING: - from ray.util.placement_group import PlacementGroup +from vllm.transformers_utils.tokenizer import detokenize_incrementally +from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup, + get_tokenizer_group) +from vllm.utils import Counter logger = init_logger(__name__) _LOCAL_LOGGING_INTERVAL_SEC = 5 -# A map between the device type (in device config) to its worker module. -DEVICE_TO_WORKER_MODULE_MAP = { - "cuda": "vllm.worker.worker", - "neuron": "vllm.worker.neuron_worker", -} - -# If the env var is set, it uses the Ray's compiled DAG API -# which optimizes the control plane overhead. -# Run VLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. 
-USE_RAY_COMPILED_DAG = bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0)) - class LLMEngine: """An LLM engine that receives requests and generates texts. @@ -69,8 +49,8 @@ class LLMEngine: parallel_config: The configuration related to distributed execution. scheduler_config: The configuration related to the request scheduler. device_config: The configuration related to the device. - placement_group: Ray placement group for distributed execution. - Required for distributed execution. + executor_class: The model executor class for managing distributed + execution. log_stats: Whether to log statistics. """ @@ -82,7 +62,7 @@ def __init__( scheduler_config: SchedulerConfig, device_config: DeviceConfig, lora_config: Optional[LoRAConfig], - placement_group: Optional["PlacementGroup"], + executor_class: Type[ExecutorBase], log_stats: bool, ) -> None: logger.info( @@ -98,7 +78,8 @@ def __init__( f"download_dir={model_config.download_dir!r}, " f"load_format={model_config.load_format}, " f"tensor_parallel_size={parallel_config.tensor_parallel_size}, " - f"disable_custom_all_reduce={parallel_config.disable_custom_all_reduce}, " + f"disable_custom_all_reduce=" + f"{parallel_config.disable_custom_all_reduce}, " f"quantization={model_config.quantization}, " f"enforce_eager={model_config.enforce_eager}, " f"kv_cache_dtype={cache_config.cache_dtype}, " @@ -118,33 +99,17 @@ def __init__( self._init_tokenizer() self.seq_counter = Counter() - # Create the parallel GPU workers. - if self.parallel_config.worker_use_ray: - # Disable Ray usage stats collection. - ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") - if ray_usage != "1": - os.environ["RAY_USAGE_STATS_ENABLED"] = "0" - # Pass additional arguments to initialize the worker - additional_ray_args = {} - if self.parallel_config.ray_workers_use_nsight: - logger.info("Configuring Ray workers to use nsight.") - additional_ray_args = { - "runtime_env": { - "nsight": { - "t": "cuda,cudnn,cublas", - "o": "'worker_process_%p'", - "cuda-graph-trace": "node", - } - } - } - self._init_workers_ray(placement_group, **additional_ray_args) - else: - self._init_workers() + self.model_executor = executor_class(model_config, cache_config, + parallel_config, scheduler_config, + device_config, lora_config) - # Profile the memory usage and initialize the cache. - self._init_cache() + # Ping the tokenizer to ensure liveness if it runs in a + # different process. + self.tokenizer.ping() # Create the scheduler. + # NOTE: the cache_config here have been updated with the numbers of + # GPU and CPU blocks, which are profiled in the distributed executor. self.scheduler = Scheduler(scheduler_config, cache_config, lora_config) # Metric Logging. @@ -154,53 +119,45 @@ def __init__( labels=dict(model_name=model_config.model)) self.stat_logger.info("cache_config", self.cache_config) - self.forward_dag = None - if USE_RAY_COMPILED_DAG: - self.forward_dag = self._compiled_ray_dag() + @classmethod + def from_engine_args(cls, engine_args: EngineArgs) -> "LLMEngine": + """Creates an LLM engine from the engine arguments.""" + # Create the engine configs. + engine_configs = engine_args.create_engine_configs() + parallel_config = engine_configs[2] + + # Initialize the cluster and specify the executor class. 
+ if parallel_config.worker_use_ray: + initialize_ray_cluster(parallel_config) + from vllm.executor.ray_gpu_executor import RayGPUExecutor + executor_class = RayGPUExecutor + else: + assert parallel_config.world_size == 1, ( + "Ray is required if parallel_config.world_size > 1.") + from vllm.executor.gpu_executor import GPUExecutor + executor_class = GPUExecutor + + # Create the LLM engine. + engine = cls(*engine_configs, + executor_class=executor_class, + log_stats=not engine_args.disable_log_stats) + return engine def __reduce__(self): # This is to ensure that the LLMEngine is not referenced in # the closure used to initialize Ray worker actors raise RuntimeError("LLMEngine should not be pickled!") - def get_tokenizer_for_seq(self, sequence: Sequence): - return self.tokenizer.get_lora_tokenizer(sequence.lora_request) + def get_tokenizer(self) -> "PreTrainedTokenizer": + return self.tokenizer.get_lora_tokenizer() - def _dispatch_worker(self): - worker_module = DEVICE_TO_WORKER_MODULE_MAP[ - self.device_config.device_type] - imported_worker = importlib.import_module(worker_module) - Worker = imported_worker.Worker - return Worker - - def _init_workers(self): - # Lazy import the Worker to avoid importing torch.cuda/xformers - # before CUDA_VISIBLE_DEVICES is set in the Worker - Worker = self._dispatch_worker() - - assert self.parallel_config.world_size == 1, ( - "Ray is required if parallel_config.world_size > 1.") - - self.workers: List[Worker] = [] - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - self.driver_worker = Worker( - self.model_config, - self.parallel_config, - self.scheduler_config, - self.device_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, - lora_config=self.lora_config, - kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=True, - ) - self._run_workers("init_model") - self._run_workers("load_model") + def get_tokenizer_for_seq(self, + sequence: Sequence) -> "PreTrainedTokenizer": + return self.tokenizer.get_lora_tokenizer(sequence.lora_request) def _init_tokenizer(self, **tokenizer_init_kwargs): init_kwargs = dict( + tokenizer_id=self.model_config.tokenizer, enable_lora=bool(self.lora_config), max_num_seqs=self.scheduler_config.max_num_seqs, max_input_length=None, @@ -208,128 +165,9 @@ def _init_tokenizer(self, **tokenizer_init_kwargs): trust_remote_code=self.model_config.trust_remote_code, revision=self.model_config.tokenizer_revision) init_kwargs.update(tokenizer_init_kwargs) - self.tokenizer: TokenizerGroup = TokenizerGroup( - self.model_config.tokenizer, **init_kwargs) - - def _init_workers_ray(self, placement_group: "PlacementGroup", - **ray_remote_kwargs): - if self.parallel_config.tensor_parallel_size == 1: - num_gpus = self.cache_config.gpu_memory_utilization - else: - num_gpus = 1 - self.driver_dummy_worker: RayWorkerVllm = None - self.workers: List[RayWorkerVllm] = [] - - driver_ip = get_ip() - for bundle_id, bundle in enumerate(placement_group.bundle_specs): - if not bundle.get("GPU", 0): - continue - scheduling_strategy = PlacementGroupSchedulingStrategy( - placement_group=placement_group, - placement_group_capture_child_tasks=True, - placement_group_bundle_index=bundle_id, - ) - worker = ray.remote( - num_cpus=0, - num_gpus=num_gpus, - scheduling_strategy=scheduling_strategy, - **ray_remote_kwargs, - )(RayWorkerVllm).remote(self.model_config.trust_remote_code) - - worker_ip = ray.get(worker.get_node_ip.remote()) - if worker_ip == driver_ip and self.driver_dummy_worker is 
None: - # If the worker is on the same node as the driver, we use it - # as the resource holder for the driver process. - self.driver_dummy_worker = worker - else: - self.workers.append(worker) - - if self.driver_dummy_worker is None: - raise ValueError( - "Ray does not allocate any GPUs on the driver node. Consider " - "adjusting the Ray placement group or running the driver on a " - "GPU node.") - - driver_node_id, driver_gpu_ids = ray.get( - self.driver_dummy_worker.get_node_and_gpu_ids.remote()) - worker_node_and_gpu_ids = ray.get( - [worker.get_node_and_gpu_ids.remote() for worker in self.workers]) - - node_workers = defaultdict(list) - node_gpus = defaultdict(list) - - node_workers[driver_node_id].append(0) - node_gpus[driver_node_id].extend(driver_gpu_ids) - for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids, - start=1): - node_workers[node_id].append(i) - node_gpus[node_id].extend(gpu_ids) - for node_id, gpu_ids in node_gpus.items(): - node_gpus[node_id] = sorted(gpu_ids) - - # Set CUDA_VISIBLE_DEVICES for the driver. - set_cuda_visible_devices(node_gpus[driver_node_id]) - for worker, (node_id, _) in zip(self.workers, worker_node_and_gpu_ids): - worker.set_cuda_visible_devices.remote(node_gpus[node_id]) - - distributed_init_method = get_distributed_init_method( - driver_ip, get_open_port()) - - # Lazy import the Worker to avoid importing torch.cuda/xformers - # before CUDA_VISIBLE_DEVICES is set in the Worker - Worker = self._dispatch_worker() - - # Initialize torch distributed process group for the workers. - model_config = copy.deepcopy(self.model_config) - parallel_config = copy.deepcopy(self.parallel_config) - scheduler_config = copy.deepcopy(self.scheduler_config) - device_config = copy.deepcopy(self.device_config) - lora_config = copy.deepcopy(self.lora_config) - kv_cache_dtype = self.cache_config.cache_dtype - - for rank, (worker, (node_id, - _)) in enumerate(zip(self.workers, - worker_node_and_gpu_ids), - start=1): - local_rank = node_workers[node_id].index(rank) - worker.init_worker.remote( - lambda rank=rank, local_rank=local_rank: Worker( - model_config, - parallel_config, - scheduler_config, - device_config, - local_rank, - rank, - distributed_init_method, - lora_config=lora_config, - kv_cache_dtype=kv_cache_dtype, - )) - - driver_rank = 0 - driver_local_rank = node_workers[driver_node_id].index(driver_rank) - self.driver_worker = Worker( - self.model_config, - self.parallel_config, - self.scheduler_config, - self.device_config, - driver_local_rank, - driver_rank, - distributed_init_method, - lora_config=self.lora_config, - kv_cache_dtype=kv_cache_dtype, - is_driver_worker=True, - ) - - # don't use cupy for eager mode - self._run_workers("init_model", - cupy_port=get_open_port() - if not model_config.enforce_eager else None) - self._run_workers( - "load_model", - max_concurrent_workers=self.parallel_config. - max_parallel_loading_workers, - ) + self.tokenizer: BaseTokenizerGroup = get_tokenizer_group( + self.parallel_config.tokenizer_pool_config, **init_kwargs) def _verify_args(self) -> None: self.model_config.verify_with_parallel_config(self.parallel_config) @@ -339,81 +177,6 @@ def _verify_args(self) -> None: self.lora_config.verify_with_scheduler_config( self.scheduler_config) - def _init_cache(self) -> None: - """Profiles the memory usage and initializes the KV cache. - - The engine will first conduct a profiling of the existing memory usage. 
- Then, it calculate the maximum possible number of GPU and CPU blocks - that can be allocated with the remaining free memory. - More details can be found in the - :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method - from class :class:`~vllm.worker.Worker`. - - Afterwards, as there may be multiple workers, - we take the minimum number of blocks across all workers - to ensure this can be applied to all of them. - - Finally, the engine will initialize the KV cache - with the calculated number of blocks. - - .. tip:: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameters. - """ - # Get the maximum number of blocks that can be allocated on GPU and CPU. - num_blocks = self._run_workers( - "profile_num_available_blocks", - block_size=self.cache_config.block_size, - gpu_memory_utilization=self.cache_config.gpu_memory_utilization, - cpu_swap_space=self.cache_config.swap_space_bytes, - cache_dtype=self.cache_config.cache_dtype, - ) - - # Since we use a shared centralized controller, we take the minimum - # number of blocks across all workers to make sure all the memory - # operators can be applied to all workers. - num_gpu_blocks = min(b[0] for b in num_blocks) - num_cpu_blocks = min(b[1] for b in num_blocks) - # FIXME(woosuk): Change to debug log. - logger.info(f"# GPU blocks: {num_gpu_blocks}, " - f"# CPU blocks: {num_cpu_blocks}") - - if num_gpu_blocks <= 0: - raise ValueError("No available memory for the cache blocks. " - "Try increasing `gpu_memory_utilization` when " - "initializing the engine.") - max_seq_len = self.cache_config.block_size * num_gpu_blocks - if self.model_config.max_model_len > max_seq_len: - raise ValueError( - f"The model's max seq len ({self.model_config.max_model_len}) " - "is larger than the maximum number of tokens that can be " - f"stored in KV cache ({max_seq_len}). Try increasing " - "`gpu_memory_utilization` or decreasing `max_model_len` when " - "initializing the engine.") - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - # Initialize the cache. - self._run_workers("init_cache_engine", cache_config=self.cache_config) - # Warm up the model. This includes capturing the model into CUDA graph - # if enforce_eager is False. - self._run_workers("warm_up_model") - - @classmethod - def from_engine_args(cls, engine_args: EngineArgs) -> "LLMEngine": - """Creates an LLM engine from the engine arguments.""" - # Create the engine configs. - engine_configs = engine_args.create_engine_configs() - parallel_config = engine_configs[2] - # Initialize the cluster. - placement_group = initialize_cluster(parallel_config) - # Create the LLM engine. - engine = cls(*engine_configs, - placement_group, - log_stats=not engine_args.disable_log_stats) - return engine - def encode_request( self, request_id: str, # pylint: disable=unused-argument @@ -488,7 +251,7 @@ def add_request( raise ValueError(f"Cannot request more than " f"{max_logprobs} logprobs.") if arrival_time is None: - arrival_time = time.monotonic() + arrival_time = time.time() prompt_token_ids = self.encode_request( request_id=request_id, prompt=prompt, @@ -819,7 +582,7 @@ def step(self) -> List[RequestOutput]: - A Sequence Group (SG) refer to a group of sequences that are generated from the same prompt. - - Step 2: Calls the workers to execute the model. + - Step 2: Calls the distributed executor to execute the model. - Step 3: Processes the model output. This mainly includes: - Decodes the relevant outputs. 
@@ -855,19 +618,10 @@ def step(self) -> List[RequestOutput]: seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule() if not scheduler_outputs.is_empty(): - # Execute the model. - all_outputs = self._run_workers( - "execute_model", - driver_kwargs={ - "seq_group_metadata_list": seq_group_metadata_list, - "blocks_to_swap_in": scheduler_outputs.blocks_to_swap_in, - "blocks_to_swap_out": scheduler_outputs.blocks_to_swap_out, - "blocks_to_copy": scheduler_outputs.blocks_to_copy, - }, - use_ray_compiled_dag=USE_RAY_COMPILED_DAG) - - # Only the driver worker returns the sampling results. - output = all_outputs[0] + output = self.model_executor.execute_model( + seq_group_metadata_list, scheduler_outputs.blocks_to_swap_in, + scheduler_outputs.blocks_to_swap_out, + scheduler_outputs.blocks_to_copy) else: output = [] @@ -881,7 +635,7 @@ def do_log_stats(self) -> None: def _get_stats(self, scheduler_outputs: Optional[SchedulerOutputs]) -> Stats: """Get Stats to be Logged to Prometheus.""" - now = time.monotonic() + now = time.time() # KV Cache Usage in %. num_total_gpu = self.cache_config.num_gpu_blocks @@ -923,7 +677,8 @@ def _get_stats(self, # Latency Timings. time_last_iters = [] for seq_group in scheduler_outputs.scheduled_seq_groups: - # Time since last token. (n.b. updates seq_group.metrics.last_token_time) + # Time since last token. + # (n.b. updates seq_group.metrics.last_token_time) time_last_iters.append(seq_group.get_last_latency(now)) # Time since arrival for all finished requests. if seq_group.is_finished(): @@ -955,16 +710,17 @@ def _decode_logprobs(self, seq: Sequence, prms: SamplingParams, for token_id, sample_logprob in logprobs.items(): if (sample_logprob.decoded_token is None and token_id != -1): all_input_ids_with_logprob = all_input_ids[:-1] + [token_id] - _, new_text, prefix_offset, read_offset = detokenize_incrementally( - self.get_tokenizer_for_seq(seq), - all_input_ids=all_input_ids_with_logprob, - prev_tokens=seq.tokens, - prefix_offset=seq.prefix_offset, - read_offset=seq.read_offset, - skip_special_tokens=prms.skip_special_tokens, - spaces_between_special_tokens=prms. - spaces_between_special_tokens, - ) + (_, new_text, prefix_offset, + read_offset) = detokenize_incrementally( + self.get_tokenizer_for_seq(seq), + all_input_ids=all_input_ids_with_logprob, + prev_tokens=seq.tokens, + prefix_offset=seq.prefix_offset, + read_offset=seq.read_offset, + skip_special_tokens=prms.skip_special_tokens, + spaces_between_special_tokens=prms. + spaces_between_special_tokens, + ) sample_logprob.decoded_token = new_text def _decode_sequence(self, seq: Sequence, prms: SamplingParams) -> None: @@ -1034,111 +790,13 @@ def _finalize_sequence(self, seq: Sequence, seq.output_text = seq.output_text[:-len(stop_string)] def add_lora(self, lora_request: LoRARequest) -> bool: - assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." - return self._run_workers( - "add_lora", - lora_request=lora_request, - ) + return self.model_executor.add_lora(lora_request) def remove_lora(self, lora_id: int) -> bool: - assert lora_id > 0, "lora_id must be greater than 0." 
- return self._run_workers( - "remove_lora", - lora_id=lora_id, - ) + return self.model_executor.remove_lora(lora_id) def list_loras(self) -> List[int]: - return self._run_workers("list_loras") - - def _run_workers( - self, - method: str, - *args, - driver_args: Optional[List[Any]] = None, - driver_kwargs: Optional[Dict[str, Any]] = None, - max_concurrent_workers: Optional[int] = None, - use_ray_compiled_dag: bool = False, - **kwargs, - ) -> Any: - """Runs the given method on all workers.""" - - if max_concurrent_workers: - raise NotImplementedError( - "max_concurrent_workers is not supported yet.") - - if use_ray_compiled_dag: - # Right now, compiled DAG can only accept a single - # input. TODO(sang): Fix it. - output_channels = self.forward_dag.execute(1) - else: - # Start the ray workers first. - ray_worker_outputs = [ - worker.execute_method.remote(method, *args, **kwargs) - for worker in self.workers - ] - - if driver_args is None: - driver_args = args - if driver_kwargs is None: - driver_kwargs = kwargs - - # Start the driver worker after all the ray workers. - driver_worker_output = getattr(self.driver_worker, - method)(*driver_args, **driver_kwargs) - - # Get the results of the ray workers. - if self.workers: - if use_ray_compiled_dag: - try: - ray_worker_outputs = [ - pickle.loads(chan.begin_read()) - for chan in output_channels - ] - finally: - # Has to call end_read in order to reuse the DAG. - for chan in output_channels: - chan.end_read() - else: - ray_worker_outputs = ray.get(ray_worker_outputs) - - return [driver_worker_output] + ray_worker_outputs - - def _compiled_ray_dag(self): - import pkg_resources - required_version = "2.9" - current_version = pkg_resources.get_distribution("ray").version - if current_version < required_version: - raise ValueError(f"Ray version {required_version} or greater is " - f"required, but found {current_version}") - - from ray.dag import MultiOutputNode, InputNode - assert self.parallel_config.worker_use_ray - - # Right now, compiled DAG requires at least 1 arg. We send - # a dummy value for now. It will be fixed soon. - with InputNode() as input_data: - forward_dag = MultiOutputNode([ - worker.execute_model_compiled_dag_remote.bind(input_data) - for worker in self.workers - ]) - return forward_dag.experimental_compile() + return self.model_executor.list_loras() def check_health(self) -> None: - """Raises an error if engine is unhealthy.""" - self._check_if_any_actor_is_dead() - - def _check_if_any_actor_is_dead(self): - if not self.parallel_config.worker_use_ray: - return - - if not self.workers: - return - - dead_actors = [] - for actor in self.workers: - actor_state = ray.state.actors(actor._ray_actor_id.hex()) # pylint: disable=protected-access - if actor_state["State"] == "DEAD": - dead_actors.append(actor) - if dead_actors: - raise RuntimeError("At least one Worker is dead. " - f"Dead Workers: {dead_actors}. 
") + self.model_executor.check_health() diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index d31542159e4a..17b1852f5b0a 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -1,5 +1,6 @@ from vllm.logger import init_logger -from prometheus_client import Counter, Gauge, Histogram, Info, REGISTRY, disable_created_metrics +from prometheus_client import (Counter, Gauge, Histogram, Info, REGISTRY, + disable_created_metrics) import time import numpy as np @@ -177,10 +178,12 @@ def _log_prometheus(self, stats: Stats) -> None: def _log_prometheus_interval(self, prompt_throughput: float, generation_throughput: float) -> None: # Logs metrics to prometheus that are computed every logging_interval. - # Support legacy gauge metrics that make throughput calculations on the vLLM side. - # Moving forward, we should use counters like counter_prompt_tokens, counter_generation_tokens - # Which log raw data and calculate summaries using rate() on the grafana/prometheus side. - # See https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666 + # Support legacy gauge metrics that make throughput calculations on + # the vLLM side. Moving forward, we should use counters like + # counter_prompt_tokens, counter_generation_tokens + # Which log raw data and calculate summaries using rate() on the + # grafana/prometheus side. See + # https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666 self.metrics.gauge_avg_prompt_throughput.labels( **self.labels).set(prompt_throughput) self.metrics.gauge_avg_generation_throughput.labels( @@ -188,7 +191,7 @@ def _log_prometheus_interval(self, prompt_throughput: float, def log(self, stats: Stats) -> None: """Called by LLMEngine. - Logs to prometheus and tracked stats every iteration. + Logs to prometheus and tracked stats every iteration. Logs to Stdout every self.local_interval seconds.""" # Log to prometheus. @@ -200,8 +203,8 @@ def log(self, stats: Stats) -> None: # Log locally every local_interval seconds. if self._local_interval_elapsed(stats.now): - - # Compute summary metrics for tracked stats (and log them to promethus if applicable). + # Compute summary metrics for tracked stats (and log them + # to promethus if applicable). prompt_throughput = self._get_throughput(self.num_prompt_tokens, now=stats.now) generation_throughput = self._get_throughput( @@ -213,7 +216,8 @@ def log(self, stats: Stats) -> None: # Log to stdout. 
logger.info( f"Avg prompt throughput: {prompt_throughput:.1f} tokens/s, " - f"Avg generation throughput: {generation_throughput:.1f} tokens/s, " + f"Avg generation throughput: " + f"{generation_throughput:.1f} tokens/s, " f"Running: {stats.num_running} reqs, " f"Swapped: {stats.num_swapped} reqs, " f"Pending: {stats.num_waiting} reqs, " diff --git a/vllm/engine/ray_utils.py b/vllm/engine/ray_utils.py index bbcbbdfea2f0..742f3dc57519 100644 --- a/vllm/engine/ray_utils.py +++ b/vllm/engine/ray_utils.py @@ -1,6 +1,6 @@ import pickle -from typing import Optional, List, Tuple, TYPE_CHECKING +from typing import Optional, List, Tuple from vllm.config import ParallelConfig from vllm.logger import init_logger @@ -65,45 +65,38 @@ def execute_model_compiled_dag_remote(self, ignored): ray = None RayWorkerVllm = None -if TYPE_CHECKING: - from ray.util.placement_group import PlacementGroup - -def initialize_cluster( +def initialize_ray_cluster( parallel_config: ParallelConfig, - engine_use_ray: bool = False, ray_address: Optional[str] = None, -) -> Optional["PlacementGroup"]: - """Initialize the distributed cluster probably with Ray. +): + """Initialize the distributed cluster with Ray. + + it will connect to the Ray cluster and create a placement group + for the workers, which includes the specification of the resources + for each distributed worker. Args: parallel_config: The configurations for parallel execution. - engine_use_ray: Whether to use Ray for async engine. ray_address: The address of the Ray cluster. If None, uses the default Ray cluster address. - - Returns: - An optional `PlacementGroup`. It includes the specification - of the resources for each distributed worker. None if Ray is - not used. """ - if parallel_config.worker_use_ray or engine_use_ray: - if ray is None: - raise ImportError( - "Ray is not installed. Please install Ray to use distributed " - "serving.") - # Connect to a ray cluster. - if is_hip(): - ray.init(address=ray_address, - ignore_reinit_error=True, - num_gpus=parallel_config.world_size) - else: - ray.init(address=ray_address, ignore_reinit_error=True) - - if not parallel_config.worker_use_ray: - assert parallel_config.world_size == 1, ( - "Ray is required if parallel_config.world_size > 1.") - return None + if ray is None: + raise ImportError( + "Ray is not installed. Please install Ray to use distributed " + "serving.") + + # Connect to a ray cluster. + if is_hip(): + ray.init(address=ray_address, + ignore_reinit_error=True, + num_gpus=parallel_config.world_size) + else: + ray.init(address=ray_address, ignore_reinit_error=True) + + if parallel_config.placement_group: + # Placement group is already set. + return # Create placement group for worker processes current_placement_group = ray.util.get_current_placement_group() @@ -138,4 +131,5 @@ def initialize_cluster( # if they cannot be provisioned. ray.get(current_placement_group.ready(), timeout=1800) - return current_placement_group + # Set the placement group in the parallel config + parallel_config.placement_group = current_placement_group diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index 1eb4ab8b06b6..ba93b1beb2aa 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -1,11 +1,14 @@ """ -NOTE: This API server is used only for demonstrating usage of AsyncEngine and simple performance benchmarks. -It is not intended for production use. For production use, we recommend using our OpenAI compatible server. 
-We are also not going to accept PRs modifying this file, please change `vllm/entrypoints/openai/api_server.py` instead. +NOTE: This API server is used only for demonstrating usage of AsyncEngine +and simple performance benchmarks. It is not intended for production use. +For production use, we recommend using our OpenAI compatible server. +We are also not going to accept PRs modifying this file, please +change `vllm/entrypoints/openai/api_server.py` instead. """ import argparse import json +import ssl from typing import AsyncGenerator from fastapi import FastAPI, Request @@ -80,6 +83,16 @@ async def stream_results() -> AsyncGenerator[bytes, None]: parser.add_argument("--port", type=int, default=8000) parser.add_argument("--ssl-keyfile", type=str, default=None) parser.add_argument("--ssl-certfile", type=str, default=None) + parser.add_argument("--ssl-ca-certs", + type=str, + default=None, + help="The CA certificates file") + parser.add_argument( + "--ssl-cert-reqs", + type=int, + default=int(ssl.CERT_NONE), + help="Whether client certificate is required (see stdlib ssl module's)" + ) parser.add_argument( "--root-path", type=str, @@ -98,4 +111,6 @@ async def stream_results() -> AsyncGenerator[bytes, None]: log_level="debug", timeout_keep_alive=TIMEOUT_KEEP_ALIVE, ssl_keyfile=args.ssl_keyfile, - ssl_certfile=args.ssl_certfile) + ssl_certfile=args.ssl_certfile, + ssl_ca_certs=args.ssl_ca_certs, + ssl_cert_reqs=args.ssl_cert_reqs) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 9f29b4ac92f4..e0626ca4e9da 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -5,6 +5,7 @@ import os import importlib import inspect +import ssl from prometheus_client import make_asgi_app import fastapi @@ -18,7 +19,9 @@ import vllm from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.entrypoints.openai.protocol import CompletionRequest, ChatCompletionRequest, ErrorResponse +from vllm.entrypoints.openai.protocol import (CompletionRequest, + ChatCompletionRequest, + ErrorResponse) from vllm.logger import init_logger from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion @@ -84,13 +87,11 @@ def parse_args(): type=json.loads, default=["*"], help="allowed headers") - parser.add_argument( - "--api-key", - type=str, - default=None, - help= - "If provided, the server will require this key to be presented in the header." - ) + parser.add_argument("--api-key", + type=str, + default=None, + help="If provided, the server will require this key " + "to be presented in the header.") parser.add_argument("--served-model-name", type=str, default=None, @@ -103,9 +104,8 @@ def parse_args(): default=None, nargs='+', action=LoRAParserAction, - help= - "LoRA module configurations in the format name=path. Multiple modules can be specified." - ) + help="LoRA module configurations in the format name=path. 
" + "Multiple modules can be specified.") parser.add_argument("--chat-template", type=str, default=None, @@ -125,6 +125,16 @@ def parse_args(): type=str, default=None, help="The file path to the SSL cert file") + parser.add_argument("--ssl-ca-certs", + type=str, + default=None, + help="The CA certificates file") + parser.add_argument( + "--ssl-cert-reqs", + type=int, + default=int(ssl.CERT_NONE), + help="Whether client certificate is required (see stdlib ssl module's)" + ) parser.add_argument( "--root-path", type=str, @@ -138,9 +148,10 @@ def parse_args(): help="Additional ASGI middleware to apply to the app. " "We accept multiple --middleware arguments. " "The value should be an import path. " - "If a function is provided, vLLM will add it to the server using @app.middleware('http'). " - "If a class is provided, vLLM will add it to the server using app.add_middleware(). " - ) + "If a function is provided, vLLM will add it to the server " + "using @app.middleware('http'). " + "If a class is provided, vLLM will add it to the server " + "using app.add_middleware(). ") parser = AsyncEngineArgs.add_cli_args(parser) return parser.parse_args() @@ -235,9 +246,8 @@ async def authentication(request: Request, call_next): elif inspect.iscoroutinefunction(imported): app.middleware("http")(imported) else: - raise ValueError( - f"Invalid middleware {middleware}. Must be a function or a class." - ) + raise ValueError(f"Invalid middleware {middleware}. " + f"Must be a function or a class.") logger.info(f"vLLM API server version {vllm.__version__}") logger.info(f"args: {args}") @@ -263,4 +273,6 @@ async def authentication(request: Request, call_next): log_level=args.uvicorn_log_level, timeout_keep_alive=TIMEOUT_KEEP_ALIVE, ssl_keyfile=args.ssl_keyfile, - ssl_certfile=args.ssl_certfile) + ssl_certfile=args.ssl_certfile, + ssl_ca_certs=args.ssl_ca_certs, + ssl_cert_reqs=args.ssl_cert_reqs) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index ba352f18f645..bfdfe39f210e 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -12,7 +12,8 @@ UsageInfo) from vllm.outputs import RequestOutput from vllm.entrypoints.openai.serving_engine import OpenAIServing, LoRA -from vllm.model_executor.guided_decoding import get_guided_decoding_logits_processor +from vllm.model_executor.guided_decoding import ( + get_guided_decoding_logits_processor) logger = init_logger(__name__) @@ -37,8 +38,9 @@ async def create_chat_completion( ChatCompletionResponse]: """Completion API similar to OpenAI's API. - See https://platform.openai.com/docs/api-reference/chat/create - for the API specification. This API mimics the OpenAI ChatCompletion API. + See https://platform.openai.com/docs/api-reference/chat/create + for the API specification. This API mimics the OpenAI + ChatCompletion API. 
NOTE: Currently we do not support the following feature: - function_call (Users should implement this by themselves) @@ -65,7 +67,7 @@ async def create_chat_completion( lora_request = self._maybe_get_lora(request) guided_decode_logits_processor = ( await get_guided_decoding_logits_processor( - request, self.engine.get_tokenizer())) + request, await self.engine.get_tokenizer())) if guided_decode_logits_processor: if sampling_params.logits_processors is None: sampling_params.logits_processors = [] @@ -101,7 +103,7 @@ async def chat_completion_stream_generator( ) -> Union[ErrorResponse, AsyncGenerator[str, None]]: model_name = request.model - created_time = int(time.monotonic()) + created_time = int(time.time()) chunk_object_type = "chat.completion.chunk" first_iteration = True @@ -116,7 +118,8 @@ async def chat_completion_stream_generator( # the result_generator, it needs to be sent as the FIRST # response (by the try...catch). if first_iteration: - # Send first response for each request.n (index) with the role + # Send first response for each request.n (index) with + # the role role = self.get_chat_request_role(request) for i in range(request.n): choice_data = ChatCompletionResponseStreamChoice( @@ -133,7 +136,8 @@ async def chat_completion_stream_generator( data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" - # Send response to echo the input portion of the last message + # Send response to echo the input portion of the + # last message if request.echo: last_msg_content = "" if request.messages and isinstance( @@ -145,11 +149,12 @@ async def chat_completion_stream_generator( if last_msg_content: for i in range(request.n): - choice_data = ChatCompletionResponseStreamChoice( - index=i, - delta=DeltaMessage( - content=last_msg_content), - finish_reason=None) + choice_data = ( + ChatCompletionResponseStreamChoice( + index=i, + delta=DeltaMessage( + content=last_msg_content), + finish_reason=None)) chunk = ChatCompletionStreamResponse( id=request_id, object=chunk_object_type, @@ -239,7 +244,7 @@ async def chat_completion_full_generator( request_id: str) -> Union[ErrorResponse, ChatCompletionResponse]: model_name = request.model - created_time = int(time.monotonic()) + created_time = int(time.time()) final_res: RequestOutput = None async for res in result_generator: diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index a8244fd15075..bfd7c9b50cf3 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -1,7 +1,8 @@ import asyncio import time from fastapi import Request -from typing import AsyncGenerator, AsyncIterator, Callable, List, Optional, Dict, Tuple +from typing import (AsyncGenerator, AsyncIterator, Callable, List, Optional, + Dict, Tuple) from vllm.logger import init_logger from vllm.utils import random_uuid from vllm.engine.async_llm_engine import AsyncLLMEngine @@ -16,7 +17,8 @@ ) from vllm.outputs import RequestOutput from vllm.entrypoints.openai.serving_engine import OpenAIServing, LoRA -from vllm.model_executor.guided_decoding import get_guided_decoding_logits_processor +from vllm.model_executor.guided_decoding import ( + get_guided_decoding_logits_processor) logger = init_logger(__name__) @@ -44,9 +46,8 @@ def parse_prompt_format(prompt) -> Tuple[bool, list]: prompt_is_tokens = True prompts = prompt # case 4: array of token arrays else: - raise ValueError( - "prompt must be a string, array of strings, array of tokens, or array of token arrays" 
- ) + raise ValueError("prompt must be a string, array of strings, " + "array of tokens, or array of token arrays") return prompt_is_tokens, prompts @@ -117,7 +118,7 @@ async def create_completion(self, request: CompletionRequest, model_name = request.model request_id = f"cmpl-{random_uuid()}" - created_time = int(time.monotonic()) + created_time = int(time.time()) # Schedule the request and get the result generator. generators = [] @@ -126,7 +127,7 @@ async def create_completion(self, request: CompletionRequest, lora_request = self._maybe_get_lora(request) guided_decode_logit_processor = ( await get_guided_decoding_logits_processor( - request, self.engine.get_tokenizer())) + request, await self.engine.get_tokenizer())) if guided_decode_logit_processor is not None: if sampling_params.logits_processors is None: sampling_params.logits_processors = [] @@ -156,7 +157,8 @@ async def create_completion(self, request: CompletionRequest, int, RequestOutput]] = merge_async_iterators(*generators) # Similar to the OpenAI API, when n != best_of, we do not stream the - # results. In addition, we do not stream the results when use beam search. + # results. In addition, we do not stream the results when use + # beam search. stream = (request.stream and (request.best_of is None or request.n == request.best_of) and not request.use_beam_search) @@ -223,7 +225,8 @@ async def completion_stream_generator( for output in res.outputs: i = output.index + prompt_idx * request.n - # TODO(simon): optimize the performance by avoiding full text O(n^2) sending. + # TODO(simon): optimize the performance by avoiding full + # text O(n^2) sending. if request.echo and request.max_tokens == 0: # only return the prompt @@ -231,11 +234,12 @@ async def completion_stream_generator( delta_token_ids = res.prompt_token_ids top_logprobs = res.prompt_logprobs has_echoed[i] = True - elif request.echo and request.max_tokens > 0 and not has_echoed[ - i]: + elif (request.echo and request.max_tokens > 0 + and not has_echoed[i]): # echo the prompt and first token delta_text = res.prompt + output.text - delta_token_ids = res.prompt_token_ids + output.token_ids + delta_token_ids = (res.prompt_token_ids + + output.token_ids) top_logprobs = res.prompt_logprobs + (output.logprobs or []) has_echoed[i] = True @@ -248,7 +252,9 @@ async def completion_stream_generator( i]:] if output.logprobs else None if request.logprobs is not None: - assert top_logprobs is not None, "top_logprobs must be provided when logprobs is requested" + assert top_logprobs is not None, ( + "top_logprobs must be provided when logprobs " + "is requested") logprobs = self._create_logprobs( token_ids=delta_token_ids, top_logprobs=top_logprobs, diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 230d13d97dbb..2db884945c49 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -50,10 +50,12 @@ def __init__(self, except RuntimeError: event_loop = None - if event_loop is not None and event_loop.is_running( - ): # If the current is instanced by Ray Serve, there is already a running event loop + if event_loop is not None and event_loop.is_running(): + # If the current is instanced by Ray Serve, + # there is already a running event loop event_loop.create_task(self._post_init()) - else: # When using single vLLM without engine_use_ray + else: + # When using single vLLM without engine_use_ray asyncio.run(self._post_init()) async def _post_init(self): @@ -178,8 +180,9 @@ def 
_validate_prompt_and_tokenize( if token_num + request.max_tokens > self.max_model_len: raise ValueError( - f"This model's maximum context length is {self.max_model_len} tokens. " - f"However, you requested {request.max_tokens + token_num} tokens " + f"This model's maximum context length is " + f"{self.max_model_len} tokens. However, you requested " + f"{request.max_tokens + token_num} tokens " f"({token_num} in the messages, " f"{request.max_tokens} in the completion). " f"Please reduce the length of the messages or completion.", ) diff --git a/vllm/executor/__init__.py b/vllm/executor/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py new file mode 100644 index 000000000000..30717e8a8735 --- /dev/null +++ b/vllm/executor/executor_base.py @@ -0,0 +1,75 @@ +from abc import ABC, abstractmethod +from typing import Dict, List, Optional + +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, SchedulerConfig, LoRAConfig) +from vllm.lora.request import LoRARequest +from vllm.sequence import SamplerOutput, SequenceGroupMetadata + + +class ExecutorBase(ABC): + """Base class for all executors. + + An executor is responsible for executing the model on a specific device + type (e.g., CPU, GPU, Neuron, etc.). Or it can be a distributed executor + that can execute the model on multiple devices. + """ + + @abstractmethod + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + lora_config: Optional[LoRAConfig], + ) -> None: + raise NotImplementedError + + @abstractmethod + def execute_model(self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + """Executes one model step on the given sequences.""" + raise NotImplementedError + + @abstractmethod + def add_lora(self, lora_request: LoRARequest) -> bool: + raise NotImplementedError + + @abstractmethod + def remove_lora(self, lora_id: int) -> bool: + raise NotImplementedError + + @abstractmethod + def list_loras(self) -> List[int]: + raise NotImplementedError + + @abstractmethod + def check_health(self) -> None: + """Checks if the executor is healthy. If not, it should raise an + exception.""" + raise NotImplementedError + + +class ExecutorAsyncBase(ExecutorBase): + + @abstractmethod + async def execute_model_async( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> SamplerOutput: + """Executes one model step on the given sequences.""" + raise NotImplementedError + + @abstractmethod + async def check_health_async(self) -> None: + """Checks if the executor is healthy. 
If not, it should raise an + exception.""" + raise NotImplementedError diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py new file mode 100644 index 000000000000..9019ee7763c7 --- /dev/null +++ b/vllm/executor/gpu_executor.py @@ -0,0 +1,163 @@ +import importlib +from typing import Dict, List, Optional + +from vllm.lora.request import LoRARequest +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, SchedulerConfig, LoRAConfig) +from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase +from vllm.executor.utils import check_block_size_valid +from vllm.logger import init_logger +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.utils import (get_ip, get_open_port, get_distributed_init_method, + make_async) + +logger = init_logger(__name__) + +# A map between the device type (in device config) to its worker module. +DEVICE_TO_WORKER_MODULE_MAP = { + "cuda": "vllm.worker.worker", + "neuron": "vllm.worker.neuron_worker", +} + + +class GPUExecutor(ExecutorBase): + + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + lora_config: Optional[LoRAConfig], + ) -> None: + self.model_config = model_config + self.cache_config = cache_config + self.lora_config = lora_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + + # Instantiate the worker and load the model to GPU. + self._init_worker() + + # Profile the memory usage and initialize the cache. + self._init_cache() + + def _dispatch_worker(self): + worker_module = DEVICE_TO_WORKER_MODULE_MAP[ + self.device_config.device_type] + imported_worker = importlib.import_module(worker_module) + Worker = imported_worker.Worker + return Worker + + def _init_worker(self): + # Lazy import the Worker to avoid importing torch.cuda/xformers + # before CUDA_VISIBLE_DEVICES is set in the Worker + Worker = self._dispatch_worker() + + assert self.parallel_config.world_size == 1, ( + "GPUExecutor only supports single GPU.") + + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + self.driver_worker = Worker( + self.model_config, + self.parallel_config, + self.scheduler_config, + self.device_config, + local_rank=0, + rank=0, + distributed_init_method=distributed_init_method, + lora_config=self.lora_config, + kv_cache_dtype=self.cache_config.cache_dtype, + is_driver_worker=True, + ) + self.driver_worker.init_model() + self.driver_worker.load_model() + + def _init_cache(self) -> None: + """Profiles the memory usage and initializes the KV cache. + + The engine first profiles the existing memory usage. + Then, it allocates the remaining memory for KV blocks. + + .. tip:: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. + """ + # Get the maximum number of blocks that can be allocated on GPU and CPU. + num_gpu_blocks, num_cpu_blocks = ( + self.driver_worker.profile_num_available_blocks( + block_size=self.cache_config.block_size, + gpu_memory_utilization=self.cache_config. 
+ gpu_memory_utilization, + cpu_swap_space=self.cache_config.swap_space_bytes, + cache_dtype=self.cache_config.cache_dtype, + )) + + logger.info(f"# GPU blocks: {num_gpu_blocks}, " + f"# CPU blocks: {num_cpu_blocks}") + + check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, + self.model_config.max_model_len) + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + # Initialize the cache. + self.driver_worker.init_cache_engine(cache_config=self.cache_config) + # Warm up the model. This includes capturing the model into CUDA graph + # if enforce_eager is False. + self.driver_worker.warm_up_model() + + def execute_model(self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + output = self.driver_worker.execute_model( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ) + return output + + def add_lora(self, lora_request: LoRARequest) -> bool: + assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." + return self.driver_worker.add_lora(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + assert lora_id > 0, "lora_id must be greater than 0." + return self.driver_worker.remove_lora(lora_id) + + def list_loras(self) -> List[int]: + return self.driver_worker.list_loras() + + def check_health(self) -> None: + # GPUExecutor will always be healthy as long as + # it's running. + return + + +class GPUExecutorAsync(GPUExecutor, ExecutorAsyncBase): + + async def execute_model_async( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> SamplerOutput: + output = await make_async(self.driver_worker.execute_model)( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy) + return output + + async def check_health_async(self) -> None: + # GPUExecutor will always be healthy as long as + # it's running. + return diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py new file mode 100644 index 000000000000..82a2b456895e --- /dev/null +++ b/vllm/executor/ray_gpu_executor.py @@ -0,0 +1,441 @@ +import asyncio +import copy +from collections import defaultdict +import os +import pickle +import importlib +from typing import TYPE_CHECKING, Any, Dict, List, Optional + +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, SchedulerConfig, LoRAConfig) +from vllm.engine.ray_utils import RayWorkerVllm, ray +from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase +from vllm.executor.utils import check_block_size_valid +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.utils import (set_cuda_visible_devices, get_ip, get_open_port, + get_distributed_init_method, make_async) + +if ray is not None: + from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + +if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + +logger = init_logger(__name__) + +# A map between the device type (in device config) to its worker module. 
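`GPUExecutor` above and `RayGPUExecutor` below both implement the `ExecutorBase` interface added in vllm/executor/executor_base.py earlier in this diff. As a shape check only, a hypothetical do-nothing executor that satisfies that interface might look like this (not part of the PR):

```python
from typing import Dict, List, Optional

from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
                         ParallelConfig, SchedulerConfig, LoRAConfig)
from vllm.executor.executor_base import ExecutorBase
from vllm.lora.request import LoRARequest
from vllm.sequence import SamplerOutput, SequenceGroupMetadata


class NoOpExecutor(ExecutorBase):
    """Illustrative executor that accepts work but never runs a model."""

    def __init__(self, model_config: ModelConfig, cache_config: CacheConfig,
                 parallel_config: ParallelConfig,
                 scheduler_config: SchedulerConfig,
                 device_config: DeviceConfig,
                 lora_config: Optional[LoRAConfig]) -> None:
        self.model_config = model_config
        self.cache_config = cache_config

    def execute_model(
            self, seq_group_metadata_list: List[SequenceGroupMetadata],
            blocks_to_swap_in: Dict[int, int],
            blocks_to_swap_out: Dict[int, int],
            blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput:
        raise NotImplementedError("This sketch does not execute a model.")

    def add_lora(self, lora_request: LoRARequest) -> bool:
        return False

    def remove_lora(self, lora_id: int) -> bool:
        return False

    def list_loras(self) -> List[int]:
        return []

    def check_health(self) -> None:
        return
```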
+DEVICE_TO_WORKER_MODULE_MAP = { + "cuda": "vllm.worker.worker", + "neuron": "vllm.worker.neuron_worker", +} + +# If the env var is set, it uses the Ray's compiled DAG API +# which optimizes the control plane overhead. +# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. +USE_RAY_COMPILED_DAG = bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0)) + + +class RayGPUExecutor(ExecutorBase): + + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + lora_config: Optional[LoRAConfig], + ) -> None: + self.model_config = model_config + self.cache_config = cache_config + self.lora_config = lora_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + + assert self.parallel_config.worker_use_ray + placement_group = self.parallel_config.placement_group + + # Disable Ray usage stats collection. + ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") + if ray_usage != "1": + os.environ["RAY_USAGE_STATS_ENABLED"] = "0" + + # Create the parallel GPU workers. + self._init_workers_ray(placement_group) + + # Profile the memory usage and initialize the cache. + self._init_cache() + + self.forward_dag = None + if USE_RAY_COMPILED_DAG: + self.forward_dag = self._compiled_ray_dag() + + def _dispatch_worker(self): + worker_module = DEVICE_TO_WORKER_MODULE_MAP[ + self.device_config.device_type] + imported_worker = importlib.import_module(worker_module) + Worker = imported_worker.Worker + return Worker + + def _init_workers_ray(self, placement_group: "PlacementGroup", + **ray_remote_kwargs): + if self.parallel_config.tensor_parallel_size == 1: + # For single GPU case, we use a ray worker with constrained memory. + num_gpus = self.cache_config.gpu_memory_utilization + else: + # Otherwise, the ray workers are allocated with a full GPU. + num_gpus = 1 + + # The driver dummy worker does not actually use any resources. + # It holds the resource for the driver worker. + self.driver_dummy_worker: RayWorkerVllm = None + # The remaining workers are the actual ray actors. + self.workers: List[RayWorkerVllm] = [] + + # Create the workers. + driver_ip = get_ip() + for bundle_id, bundle in enumerate(placement_group.bundle_specs): + if not bundle.get("GPU", 0): + continue + scheduling_strategy = PlacementGroupSchedulingStrategy( + placement_group=placement_group, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=bundle_id, + ) + worker = ray.remote( + num_cpus=0, + num_gpus=num_gpus, + scheduling_strategy=scheduling_strategy, + **ray_remote_kwargs, + )(RayWorkerVllm).remote(self.model_config.trust_remote_code) + + worker_ip = ray.get(worker.get_node_ip.remote()) + if worker_ip == driver_ip and self.driver_dummy_worker is None: + # If the worker is on the same node as the driver, we use it + # as the resource holder for the driver process. + self.driver_dummy_worker = worker + else: + # Else, added to the list of workers. + self.workers.append(worker) + + if self.driver_dummy_worker is None: + raise ValueError( + "Ray does not allocate any GPUs on the driver node. Consider " + "adjusting the Ray placement group or running the driver on a " + "GPU node.") + + # Get the set of GPU IDs used on each node. 
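`_init_workers_ray` above schedules one `RayWorkerVllm` actor per GPU bundle of the engine's placement group and keeps one co-located actor as a resource holder for the driver. A stripped-down sketch of that Ray scheduling pattern, assuming a running Ray cluster with at least two GPUs (the bundle shapes and the toy actor are illustrative):

```python
import ray
from ray.util.placement_group import placement_group
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

ray.init(ignore_reinit_error=True)

# One bundle per worker, each reserving a full GPU (the tensor-parallel case).
pg = placement_group([{"GPU": 1, "CPU": 1}] * 2, strategy="PACK")
ray.get(pg.ready())


@ray.remote(num_cpus=0, num_gpus=1)
class ToyWorker:

    def node_ip(self) -> str:
        return ray.util.get_node_ip_address()


# Pin each actor to a specific bundle, mirroring the loop above.
workers = [
    ToyWorker.options(scheduling_strategy=PlacementGroupSchedulingStrategy(
        placement_group=pg,
        placement_group_bundle_index=i,
    )).remote() for i in range(2)
]
print(ray.get([w.node_ip.remote() for w in workers]))
```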
+ driver_node_id, driver_gpu_ids = ray.get( + self.driver_dummy_worker.get_node_and_gpu_ids.remote()) + worker_node_and_gpu_ids = ray.get( + [worker.get_node_and_gpu_ids.remote() for worker in self.workers]) + + node_workers = defaultdict(list) + node_gpus = defaultdict(list) + + node_workers[driver_node_id].append(0) + node_gpus[driver_node_id].extend(driver_gpu_ids) + for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids, + start=1): + node_workers[node_id].append(i) + node_gpus[node_id].extend(gpu_ids) + for node_id, gpu_ids in node_gpus.items(): + node_gpus[node_id] = sorted(gpu_ids) + + # Set CUDA_VISIBLE_DEVICES for the driver and workers. + set_cuda_visible_devices(node_gpus[driver_node_id]) + for worker, (node_id, _) in zip(self.workers, worker_node_and_gpu_ids): + worker.set_cuda_visible_devices.remote(node_gpus[node_id]) + + distributed_init_method = get_distributed_init_method( + driver_ip, get_open_port()) + + # Lazy import the Worker to avoid importing torch.cuda/xformers + # before CUDA_VISIBLE_DEVICES is set in the Worker + Worker = self._dispatch_worker() + + model_config = copy.deepcopy(self.model_config) + parallel_config = copy.deepcopy(self.parallel_config) + scheduler_config = copy.deepcopy(self.scheduler_config) + device_config = copy.deepcopy(self.device_config) + lora_config = copy.deepcopy(self.lora_config) + kv_cache_dtype = self.cache_config.cache_dtype + + # Initialize the actual workers with the Worker class. + for rank, (worker, (node_id, _)) in enumerate( + zip(self.workers, worker_node_and_gpu_ids), + start=1, + ): + local_rank = node_workers[node_id].index(rank) + worker.init_worker.remote( + lambda rank=rank, local_rank=local_rank: Worker( + model_config, + parallel_config, + scheduler_config, + device_config, + local_rank, + rank, + distributed_init_method, + lora_config=lora_config, + kv_cache_dtype=kv_cache_dtype, + )) + + # Initialize the driver worker with the Worker class. + driver_rank = 0 + driver_local_rank = node_workers[driver_node_id].index(driver_rank) + self.driver_worker = Worker( + self.model_config, + self.parallel_config, + self.scheduler_config, + self.device_config, + driver_local_rank, + driver_rank, + distributed_init_method, + lora_config=self.lora_config, + kv_cache_dtype=kv_cache_dtype, + is_driver_worker=True, + ) + + # FIXME(woosuk): We are not properly initializing cupy NCCL when + # we have multiple nodes. + self._run_workers("init_model", + cupy_port=get_open_port() + if not model_config.enforce_eager else None) + self._run_workers( + "load_model", + max_concurrent_workers=self.parallel_config. + max_parallel_loading_workers, + ) + + def _init_cache(self) -> None: + """Profiles the memory usage and initializes the KV cache. + + The engine will first conduct a profiling of the existing memory usage. + Then, it calculate the maximum possible number of GPU and CPU blocks + that can be allocated with the remaining free memory. + More details can be found in the + :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method + from class :class:`~vllm.worker.Worker`. + + Afterwards, as there may be multiple workers, + we take the minimum number of blocks across all workers + to ensure this can be applied to all of them. + + Finally, the engine will initialize the KV cache + with the calculated number of blocks. + + .. tip:: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. + """ + # Get the maximum number of blocks that can be allocated on GPU and CPU. 
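The `_init_cache` docstring above describes taking the minimum profiled block count across workers and bounding the model's context length by the resulting cache capacity. A back-of-the-envelope illustration with made-up numbers:

```python
block_size = 16                     # tokens stored per KV-cache block
profiled_gpu_blocks = [2048, 1900]  # hypothetical per-worker profiling results

# A single centralized scheduler needs one cache layout that fits every
# worker, so the minimum across workers is used.
num_gpu_blocks = min(profiled_gpu_blocks)            # 1900
max_cacheable_tokens = block_size * num_gpu_blocks   # 30400
print(num_gpu_blocks, max_cacheable_tokens)

# check_block_size_valid() in vllm/executor/utils.py (added later in this
# diff) rejects configurations where max_model_len exceeds this capacity.
```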
+ num_blocks = self._run_workers( + "profile_num_available_blocks", + block_size=self.cache_config.block_size, + gpu_memory_utilization=self.cache_config.gpu_memory_utilization, + cpu_swap_space=self.cache_config.swap_space_bytes, + cache_dtype=self.cache_config.cache_dtype, + ) + + # Since we use a shared centralized controller, we take the minimum + # number of blocks across all workers to make sure all the memory + # operators can be applied to all workers. + num_gpu_blocks = min(b[0] for b in num_blocks) + num_cpu_blocks = min(b[1] for b in num_blocks) + logger.info(f"# GPU blocks: {num_gpu_blocks}, " + f"# CPU blocks: {num_cpu_blocks}") + + check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, + self.model_config.max_model_len) + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + # Initialize the cache. + self._run_workers("init_cache_engine", cache_config=self.cache_config) + # Warm up the model. This includes capturing the model into CUDA graph + # if enforce_eager is False. + self._run_workers("warm_up_model") + + def execute_model(self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + all_outputs = self._run_workers( + "execute_model", + driver_kwargs={ + "seq_group_metadata_list": seq_group_metadata_list, + "blocks_to_swap_in": blocks_to_swap_in, + "blocks_to_swap_out": blocks_to_swap_out, + "blocks_to_copy": blocks_to_copy, + }, + use_ray_compiled_dag=USE_RAY_COMPILED_DAG) + + # Only the driver worker returns the sampling results. + output = all_outputs[0] + return output + + def add_lora(self, lora_request: LoRARequest) -> bool: + assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." + return self._run_workers( + "add_lora", + lora_request=lora_request, + ) + + def remove_lora(self, lora_id: int) -> bool: + assert lora_id > 0, "lora_id must be greater than 0." + return self._run_workers( + "remove_lora", + lora_id=lora_id, + ) + + def list_loras(self) -> List[int]: + return self._run_workers("list_loras") + + def _run_workers( + self, + method: str, + *args, + driver_args: Optional[List[Any]] = None, + driver_kwargs: Optional[Dict[str, Any]] = None, + max_concurrent_workers: Optional[int] = None, + use_ray_compiled_dag: bool = False, + **kwargs, + ) -> Any: + """Runs the given method on all workers.""" + + if max_concurrent_workers: + raise NotImplementedError( + "max_concurrent_workers is not supported yet.") + + if use_ray_compiled_dag: + # Right now, compiled DAG can only accept a single + # input. TODO(sang): Fix it. + output_channels = self.forward_dag.execute(1) + else: + # Start the ray workers first. + ray_worker_outputs = [ + worker.execute_method.remote(method, *args, **kwargs) + for worker in self.workers + ] + + if driver_args is None: + driver_args = args + if driver_kwargs is None: + driver_kwargs = kwargs + + # Start the driver worker after all the ray workers. + driver_worker_output = getattr(self.driver_worker, + method)(*driver_args, **driver_kwargs) + + # Get the results of the ray workers. + if self.workers: + if use_ray_compiled_dag: + try: + ray_worker_outputs = [ + pickle.loads(chan.begin_read()) + for chan in output_channels + ] + finally: + # Has to call end_read in order to reuse the DAG. 
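`_run_workers` above issues the remote Ray calls first and only then runs the same method on the in-process driver worker, so remote and local work overlap before the results are gathered. A reduced, self-contained sketch of that fan-out ordering (the toy actor and method below are illustrative, unrelated to vLLM's Worker):

```python
import ray

ray.init(ignore_reinit_error=True)


@ray.remote
class EchoWorker:

    def execute_method(self, method: str, *args):
        return f"remote {method}{args}"


class DriverWorker:

    def ping(self, payload: int) -> str:
        return f"driver ping({payload})"


workers = [EchoWorker.remote() for _ in range(2)]
driver_worker = DriverWorker()

# Launch the remote calls first so they run while the driver works locally.
remote_refs = [w.execute_method.remote("ping", 7) for w in workers]
driver_output = driver_worker.ping(7)

# Driver output first, then the gathered worker outputs, as in _run_workers.
print([driver_output] + ray.get(remote_refs))
```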
+ for chan in output_channels: + chan.end_read() + else: + ray_worker_outputs = ray.get(ray_worker_outputs) + + return [driver_worker_output] + ray_worker_outputs + + def _compiled_ray_dag(self): + import pkg_resources + required_version = "2.9" + current_version = pkg_resources.get_distribution("ray").version + if current_version < required_version: + raise ValueError(f"Ray version {required_version} or greater is " + f"required, but found {current_version}") + + from ray.dag import MultiOutputNode, InputNode + assert self.parallel_config.worker_use_ray + + # Right now, compiled DAG requires at least 1 arg. We send + # a dummy value for now. It will be fixed soon. + with InputNode() as input_data: + forward_dag = MultiOutputNode([ + worker.execute_model_compiled_dag_remote.bind(input_data) + for worker in self.workers + ]) + return forward_dag.experimental_compile() + + def check_health(self) -> None: + """Raises an error if engine is unhealthy.""" + self._check_if_any_actor_is_dead() + + def _check_if_any_actor_is_dead(self): + if not self.workers: + return + + dead_actors = [] + for actor in self.workers: + actor_state = ray.state.actors(actor._ray_actor_id.hex()) # pylint: disable=protected-access + if actor_state["State"] == "DEAD": + dead_actors.append(actor) + if dead_actors: + raise RuntimeError("At least one Worker is dead. " + f"Dead Workers: {dead_actors}. ") + + +class RayGPUExecutorAsync(RayGPUExecutor, ExecutorAsyncBase): + + async def _run_workers_async( + self, + method: str, + *args, + driver_args: Optional[List[Any]] = None, + driver_kwargs: Optional[Dict[str, Any]] = None, + **kwargs, + ) -> Any: + """Runs the given method on all workers.""" + coros = [] + + if driver_args is None: + driver_args = args + if driver_kwargs is None: + driver_kwargs = kwargs + + # Run the driver worker asynchronously. + driver_executor = make_async(getattr(self.driver_worker, method)) + coros.append(driver_executor(*driver_args, **driver_kwargs)) + + # Run the ray workers asynchronously. + for worker in self.workers: + coros.append(worker.execute_method.remote(method, *args, **kwargs)) + + all_outputs = await asyncio.gather(*coros) + return all_outputs + + async def execute_model_async( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> SamplerOutput: + all_outputs = await self._run_workers_async( + "execute_model", + driver_kwargs={ + "seq_group_metadata_list": seq_group_metadata_list, + "blocks_to_swap_in": blocks_to_swap_in, + "blocks_to_swap_out": blocks_to_swap_out, + "blocks_to_copy": blocks_to_copy, + }) + + # Only the driver worker returns the sampling results. + output = all_outputs[0] + return output + + async def check_health_async(self) -> None: + """Raises an error if engine is unhealthy.""" + self._check_if_any_actor_is_dead() diff --git a/vllm/executor/utils.py b/vllm/executor/utils.py new file mode 100644 index 000000000000..44976696a77c --- /dev/null +++ b/vllm/executor/utils.py @@ -0,0 +1,13 @@ +def check_block_size_valid(num_gpu_blocks, block_size, max_model_len) -> None: + if num_gpu_blocks <= 0: + raise ValueError("No available memory for the cache blocks. 
" + "Try increasing `gpu_memory_utilization` when " + "initializing the engine.") + max_seq_len = block_size * num_gpu_blocks + if max_model_len > max_seq_len: + raise ValueError( + f"The model's max seq len ({max_model_len}) " + "is larger than the maximum number of tokens that can be " + f"stored in KV cache ({max_seq_len}). Try increasing " + "`gpu_memory_utilization` or decreasing `max_model_len` when " + "initializing the engine.") diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index e667d70f71e3..99e6cdeee636 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -20,10 +20,12 @@ RowParallelLinear, QKVParallelLinear, MergedColumnParallelLinear) -from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding, ParallelLMHead) from vllm.model_executor.parallel_utils.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) -from vllm.model_executor.parallel_utils.utils import split_tensor_along_last_dim +from vllm.model_executor.parallel_utils.utils import ( + split_tensor_along_last_dim) if TYPE_CHECKING: pass @@ -84,7 +86,8 @@ def _apply_lora_packed_nslice( lora_b_stacked: 3 element tuple of (num_loras, output_dim, lora_rank) indices: (batch_size) output: (batch_size, q_slice_size + 2*kv_slice_size) - output_slices: n-1 element tuple of (slice_size...), where n is number of slices + output_slices: n-1 element tuple of (slice_size...), + where n is number of slices """ org_output = output x = x.view(-1, x.shape[-1]) @@ -819,9 +822,8 @@ def create_lora_weights( ) -> None: # Keep this in sync with csrc/punica/bgmv/bgmv_config.h if 32000 < self.base_layer.vocab_size > 33024: - raise ValueError( - "When using LoRA, vocab size must be 32000 >= vocab_size <= 33024" - ) + raise ValueError("When using LoRA, vocab size must be " + "32000 >= vocab_size <= 33024") self.lora_a_stacked = torch.zeros( ( max_loras, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 7386d21c58e4..238da256b7cd 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -13,7 +13,8 @@ from vllm.config import LoRAConfig from vllm.utils import LRUCache, in_wsl -from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping, from_layer, from_layer_sampler +from vllm.lora.layers import (BaseLayerWithLoRA, LoRAMapping, from_layer, + from_layer_sampler) from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.utils import parse_fine_tuned_lora_name, replace_submodule diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 7e92bc93ab47..911115d63a63 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -154,10 +154,9 @@ def _load_lora(self, lora_request: LoRARequest) -> LoRAModel: f"LoRA rank {lora.rank} is greater than max_lora_rank " f"{self.lora_config.max_lora_rank}.") if lora.extra_vocab_size > self.lora_config.lora_extra_vocab_size: - raise ValueError( - f"LoRA added vocab size {lora.extra_vocab_size} is greater than " - f"lora_extra_vocab_size {self.lora_config.lora_extra_vocab_size}." 
- ) + raise ValueError(f"LoRA added vocab size {lora.extra_vocab_size} " + f"is greater than lora_extra_vocab_size " + f"{self.lora_config.lora_extra_vocab_size}.") return lora def add_dummy_lora(self, lora_request: LoRARequest, rank: int) -> bool: diff --git a/vllm/model_executor/guided_decoding.py b/vllm/model_executor/guided_decoding.py index a8573f8bdc6c..00984460d79a 100644 --- a/vllm/model_executor/guided_decoding.py +++ b/vllm/model_executor/guided_decoding.py @@ -8,8 +8,10 @@ from typing import Union, Tuple from pydantic import BaseModel -from vllm.entrypoints.openai.protocol import CompletionRequest, ChatCompletionRequest -from vllm.model_executor.guided_logits_processors import JSONLogitsProcessor, RegexLogitsProcessor +from vllm.entrypoints.openai.protocol import (CompletionRequest, + ChatCompletionRequest) +from vllm.model_executor.guided_logits_processors import (JSONLogitsProcessor, + RegexLogitsProcessor) class GuidedDecodingMode(Enum): diff --git a/vllm/model_executor/guided_logits_processors.py b/vllm/model_executor/guided_logits_processors.py index 1b3e5e71a591..76d41aa37dd7 100644 --- a/vllm/model_executor/guided_logits_processors.py +++ b/vllm/model_executor/guided_logits_processors.py @@ -107,12 +107,15 @@ def __init__(self, Parameters ---------- schema - A JSON schema that encodes the structure we want the model to generate + A JSON schema that encodes the structure we want the model to + generate tokenizer The model's tokenizer whitespace_pattern - Pattern to use for JSON syntactic whitespace (doesn't impact string literals) - Example: allow only a single space or newline with `whitespace_pattern=r"[\n ]?"` + Pattern to use for JSON syntactic whitespace (doesn't impact + string literals) + Example: allow only a single space or newline with + `whitespace_pattern=r"[\n ]?"` """ if isinstance(schema, type(BaseModel)): schema_str = json.dumps(schema.model_json_schema()) @@ -122,8 +125,8 @@ def __init__(self, schema_str = schema else: raise ValueError( - f"Cannot parse schema {schema}. The schema must be either " + - "a Pydantic object, a dictionary or a string that contains the JSON " - + "Schema specification") + f"Cannot parse schema {schema}. 
The schema must be either " + f"a Pydantic object, a dictionary or a string that contains " + f"the JSON Schema specification") regex_string = build_regex_from_schema(schema_str, whitespace_pattern) super().__init__(regex_string, tokenizer) diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 5a3a7b2dbaee..3eb73ee109f5 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -47,16 +47,25 @@ class GeluAndMul(nn.Module): return: (batch_size, seq_len, d) or (num_tokens, d) """ + def __init__(self, approximate: str = "none"): + super().__init__() + self.approximate = approximate + if approximate not in ("none", "tanh"): + raise ValueError(f"Unknown approximate mode: {approximate}") + def _forward(self, x: torch.Tensor) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" d = x.shape[-1] // 2 - return F.gelu(x[..., :d]) * x[..., d:] + return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:] def forward(self, x: torch.Tensor) -> torch.Tensor: d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - ops.gelu_and_mul(out, x) + if self.approximate == "none": + ops.gelu_and_mul(out, x) + elif self.approximate == "tanh": + ops.gelu_tanh_and_mul(out, x) return out diff --git a/vllm/model_executor/layers/attention/attention.py b/vllm/model_executor/layers/attention/attention.py index 724dd0511c5a..4b63b9eaf59a 100644 --- a/vllm/model_executor/layers/attention/attention.py +++ b/vllm/model_executor/layers/attention/attention.py @@ -35,12 +35,12 @@ def __init__( ) -> None: super().__init__() if _use_flash_attn(): - from vllm.model_executor.layers.attention.backends.flash_attn import FlashAttentionBackend + from vllm.model_executor.layers.attention.backends.flash_attn import FlashAttentionBackend # noqa: E501 self.backend = FlashAttentionBackend(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window) else: - from vllm.model_executor.layers.attention.backends.xformers import XFormersBackend + from vllm.model_executor.layers.attention.backends.xformers import XFormersBackend # noqa: E501 self.backend = XFormersBackend(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window) diff --git a/vllm/model_executor/layers/attention/backends/flash_attn.py b/vllm/model_executor/layers/attention/backends/flash_attn.py index 4abe195f274a..58ccd461b993 100644 --- a/vllm/model_executor/layers/attention/backends/flash_attn.py +++ b/vllm/model_executor/layers/attention/backends/flash_attn.py @@ -103,8 +103,6 @@ def forward( key_cache, value_cache, input_metadata, - self.num_heads, - self.num_kv_heads, self.alibi_slopes, ) else: diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 1391d43c8abe..299ab44f8f3d 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -1,5 +1,9 @@ -from vllm.model_executor.layers.fused_moe.fused_moe import fused_moe +from vllm.model_executor.layers.fused_moe.fused_moe import ( + fused_moe, + get_config_file_name, +) __all__ = [ "fused_moe", + "get_config_file_name", ] diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 000000000000..5c8185cfdeec --- /dev/null 
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 000000000000..97c9f4445b16 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + 
"GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json index 1fefb5ff7e42..edf2a38d12ad 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json @@ -1,20 +1,146 @@ { - "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 7}, - "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 6}, - "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 7}, - "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 7}, - "24": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "64": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "96": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, 
"BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}, - "128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 6}, - "192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 6}, - "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}, - "512": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 8, "num_stages": 4}, - "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, - "1536": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4}, - "2048": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}, - "3072": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 1, "num_warps": 8, "num_stages": 4}, - "4096": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 16, "num_warps": 8, "num_stages": 4} + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + 
"num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + } } diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json new file mode 100644 index 000000000000..b2100cebb7f5 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json new file mode 100644 index 000000000000..f578c8d0160a --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json @@ -0,0 +1,146 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 
64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "128": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json index 64d49ca66c1c..e341a67917d5 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json @@ -1,24 +1,146 @@ { - "1": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, - "2": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "4": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "8": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 8, "num_stages": 4}, - "16": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, 
"num_stages": 4}, - "24": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, "num_stages": 4}, - "32": {"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "80": {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "96": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "128": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "192": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, "num_stages": 4}, - "200": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, - "208": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 2, "num_warps": 4, "num_stages": 4}, - "216": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 16, "num_warps": 4, "num_stages": 4}, - "224": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}, - "256": {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 32, "num_warps": 4, "num_stages": 4}, - "512": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, - "1024": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, - "1536": {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, - "2048": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, - "3072": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4}, - "4096": {"BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 64, "num_warps": 8, "num_stages": 4} + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4, + "num_stages": 4 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + 
"128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 4 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 64, + "num_warps": 4, + "num_stages": 4 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 4 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 32, + "num_warps": 8, + "num_stages": 4 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 64, + "num_warps": 8, + "num_stages": 4 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 4 + } } diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 08e3c2d5b706..1ec09f0cd4c2 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -30,9 +30,10 @@ def fused_moe_kernel( K, EM, num_valid_tokens, - # The stride variables represent how much to increase the ptr by when moving by 1 - # element in a particular dimension. E.g. `stride_am` is how much to increase `a_ptr` - # by to get the element one row down (A has M rows). + # The stride variables represent how much to increase the ptr by when + # moving by 1 element in a particular dimension. E.g. `stride_am` is + # how much to increase `a_ptr` by to get the element one row down + # (A has M rows). stride_am, stride_ak, stride_be, @@ -50,17 +51,30 @@ def fused_moe_kernel( compute_type: tl.constexpr, ): """ - Implements the fused computation for a Mixture of Experts (MOE) using token and expert matrices. + Implements the fused computation for a Mixture of Experts (MOE) using + token and expert matrices. Key Parameters: - - A: The input tensor representing tokens with shape (*, K), where '*' can be any shape representing batches and K is the feature dimension of each token. - - B: The stacked MOE weight tensor with shape (E, N, K), where E is the number of experts, K is the input feature dimension, and N is the output feature dimension. - - C: The output cache tensor with shape (M, topk, N), where M is the total number of tokens post padding, topk is the number of times each token is repeated, - and N is the output feature dimension. - - sorted_token_ids: A tensor containing the sorted indices of tokens, repeated topk times and arranged by the expert index they are assigned to. - - expert_ids: A tensor containing the indices of the expert for each block. It determines which expert matrix from B should be used for each block in A. - This kernel performs the multiplication of a token by its corresponding expert matrix as determined by `expert_ids`. The sorting of `sorted_token_ids` - by expert index and padding ensures divisibility by BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix multiplication across different blocks processed by the same expert. 
+ - A: The input tensor representing tokens with shape (*, K), where '*' can + be any shape representing batches and K is the feature dimension of + each token. + - B: The stacked MOE weight tensor with shape (E, N, K), where E is + the number of experts, K is the input feature dimension, and N is + the output feature dimension. + - C: The output cache tensor with shape (M, topk, N), where M is the + total number of tokens post padding, topk is the number of times + each token is repeated, and N is the output feature dimension. + - sorted_token_ids: A tensor containing the sorted indices of tokens, + repeated topk times and arranged by the expert index they are + assigned to. + - expert_ids: A tensor containing the indices of the expert for each + block. It determines which expert matrix from B should be used for + each block in A. + This kernel performs the multiplication of a token by its corresponding + expert matrix as determined by `expert_ids`. The sorting of + `sorted_token_ids` by expert index and padding ensures divisibility by + BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix + multiplication across different blocks processed by the same expert. """ # ----------------------------------------------------------- # Map program ids `pid` to the block of C it should compute. @@ -105,7 +119,8 @@ def fused_moe_kernel( accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): - # Load the next block of A and B, generate a mask by checking the K dimension. + # Load the next block of A and B, generate a mask by checking the + # K dimension. a = tl.load(a_ptrs, mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K), @@ -139,30 +154,41 @@ def moe_align_block_size( topk_ids: torch.Tensor, block_size: int, num_experts: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ - Aligns the token distribution across experts to be compatible with block size for matrix multiplication. + Aligns the token distribution across experts to be compatible with block + size for matrix multiplication. Parameters: - - topk_ids: A tensor of shape [total_tokens, top_k] representing the top-k expert indices for each token. + - topk_ids: A tensor of shape [total_tokens, top_k] representing the + top-k expert indices for each token. - block_size: The block size used in block matrix multiplication. - num_experts: The total number of experts. Returns: - - sorted_token_ids: A tensor containing the sorted token indices according to their allocated expert. + - sorted_token_ids: A tensor containing the sorted token indices according + to their allocated expert. - expert_ids: A tensor indicating the assigned expert index for each block. - - num_tokens_post_padded: The total number of tokens after padding, ensuring divisibility by block_size. + - num_tokens_post_padded: The total number of tokens after padding, + ensuring divisibility by block_size. - This function pads the number of tokens that each expert needs to process so that it is divisible by block_size. - Padding ensures that during block matrix multiplication, the dimensions align correctly. + This function pads the number of tokens that each expert needs to process + so that it is divisible by block_size. + Padding ensures that during block matrix multiplication, the dimensions + align correctly. 
Example: - Given topk_ids = [[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]], block_size = 4, and num_experts = 4: - - We initially have 12 tokens (after repeating 'top_k' times) and 4 experts, with each expert needing to process 3 tokens. + Given topk_ids = [[2, 3, 4], [1, 2, 4], [1, 3, 4], [1, 2, 3]], + block_size = 4, and num_experts = 4: + - We initially have 12 tokens (after repeating 'top_k' times) and 4 experts, + with each expert needing to process 3 tokens. - As block_size is 4, we pad 1 token for each expert. - First, flatten topk_ids to [2, 3, 4, 1, 2, 4, 1, 3, 4, 1, 2, 3]. - Then append padding tokens [12, 12, 12, 12] for each block. - - After sorting by expert index, we obtain token_ids [3, 6, 9, 12, 0, 4, 10, 12, 1, 7, 11, 12, 2, 5, 8, 12]. - Tokens 12 are non-existent (padding) and are ignored in the subsequent matrix multiplication. - - The padding ensures that the total number of tokens is now divisible by block_size for proper block matrix operations. + - After sorting by expert index, we obtain token_ids + [3, 6, 9, 12, 0, 4, 10, 12, 1, 7, 11, 12, 2, 5, 8, 12]. + Tokens 12 are non-existent (padding) and are ignored in + the subsequent matrix multiplication. + - The padding ensures that the total number of tokens is now divisible + by block_size for proper block matrix operations. """ sorted_ids = torch.empty( (topk_ids.numel() + num_experts * (block_size - 1), ), @@ -219,23 +245,28 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, ) +def get_config_file_name(E: int, N: int) -> str: + device_name = torch.cuda.get_device_name().replace(" ", "_") + return f"E={E},N={N},device_name={device_name}.json" + + @functools.lru_cache def get_moe_configs(E: int, N: int) -> Optional[Dict[int, Any]]: """ Return optimized configurations for the fused MoE kernel. - The return value will be a dictionary that maps an irregular grid of batch sizes - to configurations of the fused_moe kernel. To evaluate the kernel on a given batch - size bs, the closest batch size in the grid should be picked and the associated - configuration chosen to invoke the kernel. + The return value will be a dictionary that maps an irregular grid of + batch sizes to configurations of the fused_moe kernel. To evaluate the + kernel on a given batch size bs, the closest batch size in the grid should + be picked and the associated configuration chosen to invoke the kernel. 
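The lookup strategy described in this docstring is a nearest-key search over the batch sizes the JSON files were tuned for, keyed by the file name that get_config_file_name builds. A small standalone sketch (the config values are placeholders):

```python
# Placeholder configs keyed by the batch sizes they were tuned for.
configs = {1: {"BLOCK_SIZE_M": 16}, 64: {"BLOCK_SIZE_M": 32}, 512: {"BLOCK_SIZE_M": 64}}

M = 100                                  # actual number of tokens in this call
best_bs = min(configs.keys(), key=lambda bs: abs(bs - M))
config = configs[best_bs]                # 64 is closest to 100, so its config is used

# File-name convention used by the JSON files added in this diff, e.g.
# "E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json".
E, N, device_name = 8, 3584, "NVIDIA_A100-SXM4-80GB"
file_name = f"E={E},N={N},device_name={device_name}.json"
```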
""" - # First look up if an optimized configuration is available in the configs directory - device_name = torch.cuda.get_device_name().replace(" ", "_") + # First look up if an optimized configuration is available in the configs + # directory + json_file_name = get_config_file_name(E, N) config_file_path = os.path.join( - os.path.dirname(os.path.realpath(__file__)), "configs", - f"E={E},N={N},device_name={device_name}.json") + os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name) if os.path.exists(config_file_path): with open(config_file_path) as f: logger.info( @@ -243,7 +274,8 @@ def get_moe_configs(E: int, N: int) -> Optional[Dict[int, Any]]: # If a configuration has been found, return it return {int(key): val for key, val in json.load(f).items()} - # If no optimized configuration is available, we will use the default configuration + # If no optimized configuration is available, we will use the default + # configuration return None @@ -258,18 +290,22 @@ def fused_moe( override_config: Optional[Dict[str, Any]] = None, ) -> torch.Tensor: """ - This function computes a Mixture of Experts (MoE) layer using two sets of weights, w1 and w2, and top-k gating mechanism. - + This function computes a Mixture of Experts (MoE) layer using two sets of + weights, w1 and w2, and top-k gating mechanism. + Parameters: - hidden_states (torch.Tensor): The input tensor to the MoE layer. - w1 (torch.Tensor): The first set of expert weights. - w2 (torch.Tensor): The second set of expert weights. - - gating_output (torch.Tensor): The output of the gating operation (before softmax). + - gating_output (torch.Tensor): The output of the gating operation + (before softmax). - topk (int): The number of top-k experts to select. - renormalize (bool): If True, renormalize the top-k weights to sum to 1. - - inplace (bool): If True, perform the operation in-place. Defaults to False. - - override_config (Optional[Dict[str, Any]]): Optional override for the kernel configuration. - + - inplace (bool): If True, perform the operation in-place. + Defaults to False. + - override_config (Optional[Dict[str, Any]]): Optional override + for the kernel configuration. + Returns: - torch.Tensor: The output tensor after applying the MoE layer. """ @@ -325,7 +361,8 @@ def fused_moe( configs = get_moe_configs(E, w2.shape[2]) if configs: - # If an optimal configuration map has been found, look up the optimal config + # If an optimal configuration map has been found, look up the + # optimal config config = configs[min(configs.keys(), key=lambda x: abs(x - M))] else: # Else use the default config diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index b2396a1d6f14..40e681df48f8 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -73,7 +73,7 @@ def apply_weights(self, bias: Optional[torch.Tensor] = None) -> torch.Tensor: weight = weights["weight"] if self.separate_bias_add: - if bias: + if bias is not None: return F.linear(x, weight) + bias return F.linear(x, weight) return F.linear(x, weight, bias) @@ -285,7 +285,8 @@ def weight_loader(self, shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - # If marlin, we need to adjust the offset and size to account for the tiling. + # If marlin, we need to adjust the offset and size to + # account for the tiling. 
shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) @@ -307,7 +308,8 @@ def weight_loader(self, shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - # If marlin, we need to adjust the offset and size to account for the tiling. + # If marlin, we need to adjust the offset and size to + # account for the tiling. shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) @@ -413,7 +415,8 @@ def weight_loader(self, shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - # If marlin, we need to adjust the offset and size to account for the tiling. + # If marlin, we need to adjust the offset and size to + # account for the tiling. shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) @@ -442,7 +445,8 @@ def weight_loader(self, shard_size = shard_size // param.pack_factor shard_offset = shard_offset // param.pack_factor - # If marlin, we need to adjust the offset and size to account for the tiling. + # If marlin, we need to adjust the offset and size to + # account for the tiling. shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index dc54641878c6..af27b1844cea 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -1,6 +1,7 @@ from typing import Type -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.layers.quantization.gptq import GPTQConfig from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 3e1c814dd233..2caef5f1ebf5 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -6,7 +6,8 @@ from vllm._C import ops from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) class AWQConfig(QuantizationConfig): @@ -50,7 +51,8 @@ def get_min_capability(self) -> int: def get_config_filenames() -> List[str]: return [ "quant_config.json", # E.g., casperhansen/vicuna-7b-v1.5-awq - "quantize_config.json", # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq + # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq + "quantize_config.json", ] @classmethod diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 2e6aabb23267..bb69c7235a13 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -31,8 +31,8 @@ def __init__( self.pack_factor = Fraction(32, self.weight_bits) if self.weight_bits not in [2, 3, 4, 8]: raise ValueError( - "Currently, only 2/3/4/8-bit weight quantization is supported for " - f"GPTQ, but got {self.weight_bits} bits.") + "Currently, only 2/3/4/8-bit weight quantization is " + f"supported for GPTQ, but got {self.weight_bits} bits.") def __repr__(self) -> str: return 
(f"GPTQConfig(weight_bits={self.weight_bits}, " @@ -101,7 +101,8 @@ def create_weights( "The input size is not aligned with the quantized " "weight shape. This can be caused by too large " "tensor parallel size.") - if output_size_per_partition % self.quant_config.pack_factor.numerator != 0: + if (output_size_per_partition % self.quant_config.pack_factor.numerator + != 0): raise ValueError( "The output size is not aligned with the quantized " "weight shape. This can be caused by too large " @@ -114,7 +115,8 @@ def create_weights( exllama_state = ExllamaState.UNINITIALIZED scale_and_zero_size = input_size // group_size scale_and_zero_input_dim = None - if input_size != input_size_per_partition and self.quant_config.group_size != -1: + if (input_size != input_size_per_partition + and self.quant_config.group_size != -1): # For act-order models, we cannot use Exllama for row parallel layer if self.quant_config.desc_act: exllama_state = ExllamaState.UNUSED diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py index 7566d78a8aba..48e44445a4a2 100644 --- a/vllm/model_executor/layers/quantization/marlin.py +++ b/vllm/model_executor/layers/quantization/marlin.py @@ -5,7 +5,8 @@ from vllm._C import ops from vllm.model_executor.layers.linear import LinearMethodBase, set_weight_attrs -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) class MarlinConfig(QuantizationConfig): @@ -22,8 +23,9 @@ def __init__( self.group_size = group_size if self.group_size != 128 and self.group_size != -1: raise ValueError( - "Currently, only group size 128 and -1 (channelwise) is supported for " - f"Marlin, but got group_size of {self.group_size}") + "Currently, only group size 128 and -1 (channelwise) " + "is supported for Marlin, but got group_size of " + f"{self.group_size}") # 4 Bits packed into 32 bit datatype. self.pack_factor = 32 // 4 @@ -37,14 +39,15 @@ def __init__( # Min in_features dim self.min_k_threads = 128 - # Max parallel problems to solve at once (improves large batch performance) + # Max parallel problems to solve at once (improves large + # batch performance) self.max_parallel = 16 # Permutation length used by the marlin kernels. self.perm_len = 1024 def __repr__(self) -> str: - return f"MarlinConfig(group_size={self.group_size}" + return f"MarlinConfig(group_size={self.group_size})" @classmethod def get_name(cls) -> str: @@ -102,22 +105,26 @@ def create_weights( # Validate output_size_per_partition if output_size_per_partition % self.quant_config.min_n_threads != 0: raise ValueError( - f"Weight output_size_per_partition = {output_size_per_partition} is not divisible by min_n_threads = {self.quant_config.min_n_threads}." - ) + f"Weight output_size_per_partition = " + f"{output_size_per_partition} is not divisible by " + f"min_n_threads = {self.quant_config.min_n_threads}.") if output_size_per_partition % self.quant_config.pack_factor != 0: raise ValueError( - f"Weight output_size_per_partition = {output_size_per_partition} is not divisible by pack_factor = {self.quant_config.pack_factor}." 
- ) + f"Weight output_size_per_partition = " + f"{output_size_per_partition} is not divisible by " + f"pack_factor = {self.quant_config.pack_factor}.") # Validate input_size_per_partition if input_size_per_partition % self.quant_config.min_k_threads != 0: raise ValueError( - f"Weight input_size_per_partition = {input_size_per_partition} is not divisible by min_k_threads = {self.quant_config.min_k_threads}." - ) - if self.quant_config.group_size != -1 and input_size_per_partition % self.quant_config.group_size != 0: - raise ValueError( - f"Weight input_size_per_partition = f{input_size_per_partition} is not divisible by group_size = {self.quant_config.group_size}." - ) + f"Weight input_size_per_partition = " + f"{input_size_per_partition} is not divisible by " + f"min_k_threads = {self.quant_config.min_k_threads}.") + if (self.quant_config.group_size != -1 and + input_size_per_partition % self.quant_config.group_size != 0): + raise ValueError(f"Weight input_size_per_partition = " + f"{input_size_per_partition} is not divisible by " + f"group_size = {self.quant_config.group_size}.") # Check that we have at least 4 tiles horizontally in the shard num_tiles_per_perm = self.quant_config.perm_len // ( @@ -149,7 +156,9 @@ def create_weights( ) # Determine if channelwise or not - input_groups = 1 if self.quant_config.group_size == -1 else input_size_per_partition // self.quant_config.group_size + input_groups = (1 if self.quant_config.group_size == -1 else + input_size_per_partition // + self.quant_config.group_size) scales = Parameter( torch.empty( diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py index 9244e8855275..ed25455e6ec1 100644 --- a/vllm/model_executor/layers/quantization/squeezellm.py +++ b/vllm/model_executor/layers/quantization/squeezellm.py @@ -6,7 +6,8 @@ from vllm._C import ops from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) -from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) from vllm.utils import is_hip diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index 3e1cfc783b8e..564345406025 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -21,8 +21,6 @@ def __init__(self, strict_mode: bool = False): nontrivial latency. """ super().__init__() - self.probs_dtype = torch.float32 - self.token_id_dtype = torch.int64 self._strict_mode = strict_mode # NOTE: A "bonus token" is accepted iff all proposal tokens are @@ -44,6 +42,14 @@ def init_gpu_tensors(self, rank: int) -> None: dtype=torch.long, device=device) + @property + def probs_dtype(self): + return torch.float32 + + @property + def token_id_dtype(self): + return torch.int64 + def forward( self, target_probs: torch.Tensor, diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 13749570f28a..71af9b26e2e9 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -22,7 +22,7 @@ # limitations under the License. 
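The RejectionSampler change above replaces per-instance dtype attributes with read-only properties. A minimal standalone illustration of the resulting behaviour (the class name here is illustrative, not vLLM's):

```python
import torch

class TinySampler:
    @property
    def probs_dtype(self) -> torch.dtype:
        return torch.float32

    @property
    def token_id_dtype(self) -> torch.dtype:
        return torch.int64

s = TinySampler()
assert s.probs_dtype is torch.float32
assert s.token_id_dtype is torch.int64
# s.probs_dtype = torch.float16  # would raise AttributeError: can't set attribute
```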
"""Rotary Positional Embeddings.""" import math -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import torch import torch.nn as nn @@ -96,6 +96,7 @@ def _forward( positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: """PyTorch-native implementation equivalent to forward().""" query = query.view(*query.shape[:-1], -1, self.head_size) @@ -107,7 +108,9 @@ def _forward( query_pass = query[..., self.rotary_dim:] key_pass = key[..., self.rotary_dim:] - cos_sin = self.cos_sin_cache[positions] + self.cos_sin_cache = self.cos_sin_cache.to(positions.get_device()) + cos_sin = self.cos_sin_cache[torch.add(positions, offsets) + if offsets is not None else positions] cos, sin = cos_sin.chunk(2, dim=-1) if self.is_neox_style: # NOTE(woosuk): Here we assume that the positions tensor has the @@ -137,11 +140,19 @@ def forward( positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: - # ops.rotary_embedding() is an in-place operation that - # updates the query and key tensors. - ops.rotary_embedding(positions, query, key, self.head_size, - self.cos_sin_cache, self.is_neox_style) + self.cos_sin_cache = self.cos_sin_cache.to(positions.get_device()) + # ops.rotary_embedding()/batched_rotary_embedding() + # are in-place operations that update the query and key tensors. + if offsets is not None: + ops.batched_rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, + self.is_neox_style, self.rotary_dim, + offsets) + else: + ops.rotary_embedding(positions, query, key, self.head_size, + self.cos_sin_cache, self.is_neox_style) return query, key @@ -158,27 +169,32 @@ def __init__( max_position_embeddings: int, base: int, is_neox_style: bool, - scaling_factor: float, + scaling_factors: Union[List[float], float], ) -> None: - self.scaling_factor = scaling_factor + if isinstance(scaling_factors, float): + scaling_factors = [scaling_factors] + self.scaling_factors = scaling_factors super().__init__(head_size, rotary_dim, max_position_embeddings, base, is_neox_style) def _compute_cos_sin_cache(self) -> torch.Tensor: inv_freq = self._compute_inv_freq(self.base) - # NOTE(woosuk): self.max_position_embeddings is the original - # maximum length before applying the rope scaling. - # Thus, the maximum length after applying the rope scaling is - # self.max_position_embeddings * self.scaling_factor. - max_len = self.max_position_embeddings * self.scaling_factor - t = torch.arange(max_len, dtype=torch.float) - t = t / self.scaling_factor - - freqs = torch.einsum("i,j -> ij", t, inv_freq) - cos = freqs.cos() - sin = freqs.sin() - cache = torch.cat((cos, sin), dim=-1) - return cache + cache_list = [] + for scaling_factor in self.scaling_factors: + # NOTE(woosuk): self.max_position_embeddings is the original + # maximum length before applying the rope scaling. + # Thus, the maximum length after applying the rope scaling is + # self.max_position_embeddings * self.scaling_factor. 
+ max_len = self.max_position_embeddings * scaling_factor + t = torch.arange(max_len, dtype=torch.float) + t = t / scaling_factor + + freqs = torch.einsum("i,j -> ij", t, inv_freq) + cos = freqs.cos() + sin = freqs.sin() + cache = torch.cat((cos, sin), dim=-1) + cache_list.append(cache) + return torch.cat(cache_list, dim=0) class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding): diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 320cb443524c..4377b845df62 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -6,7 +6,8 @@ from vllm.model_executor.parallel_utils.communication_op import ( tensor_model_parallel_gather) -from vllm.model_executor.sampling_metadata import SamplingMetadata, SamplingTensors +from vllm.model_executor.sampling_metadata import (SamplingMetadata, + SamplingTensors) from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import (Logprob, PromptLogprobs, SampleLogprobs, SamplerOutput, SequenceData, SequenceGroupOutput, @@ -587,4 +588,4 @@ def _build_sampler_output( SequenceOutput(seq_ids[parent_id], next_token_id, logprobs)) sampler_output.append( SequenceGroupOutput(seq_outputs, group_prompt_logprobs)) - return sampler_output + return SamplerOutput(outputs=sampler_output) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py old mode 100644 new mode 100755 index 75c2ae1e9f48..bc3b6a582d53 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -62,8 +62,11 @@ "Sliding window attention is not yet supported in ROCm's flash attention", } -# Models not supported by Neuron. -_NEURON_SUPPORTED_MODELS = {"LlamaForCausalLM": "neuron.llama"} +# Models supported by Neuron. +_NEURON_SUPPORTED_MODELS = { + "LlamaForCausalLM": "neuron.llama", + "MistralForCausalLM": "neuron.mistral" +} class ModelRegistry: diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 6da0082b9428..cbf472750e29 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -333,7 +333,8 @@ def load_weights(self, if "rotary_emb.inv_freq" in name: continue if name == "lm_head.weight": - # Unlike Baichuan, Baichuan2 normalizes the head weights. Refer to: + # Unlike Baichuan, Baichuan2 normalizes the head weights. + # Refer to: # https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/84603cde5ebffb6084e476cfaeceaf0b8b91fe54/modeling_baichuan.py#L508 # Distinguish between Baichuan and Baichuan2 by checking the # vocab size. 
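Returning to the LinearScalingRotaryEmbedding change above: when a list of scaling factors is given, the per-factor cos/sin caches are concatenated along the position axis, and the new `offsets` argument selects the right slice at query time. A hedged sketch of the resulting layout (head size, base, and lengths are invented):

```python
import torch

head_dim, base, max_pos = 128, 10000, 4096
scaling_factors = [1.0, 4.0]

inv_freq = 1.0 / (base**(torch.arange(0, head_dim, 2, dtype=torch.float) / head_dim))
caches = []
for s in scaling_factors:
    t = torch.arange(int(max_pos * s), dtype=torch.float) / s
    freqs = torch.einsum("i,j -> ij", t, inv_freq)
    caches.append(torch.cat((freqs.cos(), freqs.sin()), dim=-1))
cos_sin_cache = torch.cat(caches, dim=0)

# Rows [0, 4096) serve scaling factor 1.0; rows [4096, 20480) serve 4.0.
# At runtime, `positions + offsets` indexes into the appropriate slice.
print(cos_sin_cache.shape)  # torch.Size([20480, 128])
```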
This is suggested by diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index f2dca3df27cf..13c080cb0277 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -119,7 +119,8 @@ def __init__( linear_method=None) if config.n_shared_experts is not None: - intermediate_size = config.moe_intermediate_size * config.n_shared_experts + intermediate_size = (config.moe_intermediate_size * + config.n_shared_experts) self.shared_experts = DeepseekMLP( hidden_size=config.hidden_size, intermediate_size=intermediate_size, @@ -273,8 +274,9 @@ def __init__( max_position_embeddings=max_position_embeddings, linear_method=linear_method, ) - if (config.n_routed_experts is not None and \ - layer_idx >= config.first_k_dense_replace and layer_idx % config.moe_layer_freq == 0): + if (config.n_routed_experts is not None + and layer_idx >= config.first_k_dense_replace + and layer_idx % config.moe_layer_freq == 0): self.mlp = DeepseekMoE(config=config, linear_method=linear_method) else: self.mlp = DeepseekMLP( diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index b8c6822e9825..93dce7b67a7a 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -143,7 +143,8 @@ def __init__( linear_method: Optional[LinearMethodBase] = None, ): super().__init__() - inner_dim = 4 * config.n_embd if config.n_inner is None else config.n_inner + inner_dim = (4 * config.n_embd + if config.n_inner is None else config.n_inner) self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) self.attn = GPTJAttention(config, linear_method) self.mlp = GPTJMLP(inner_dim, config, linear_method) diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 0ae0a8564345..7b2215ef4bda 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -305,7 +305,8 @@ def load_weights(self, param = params_dict[name] if "wqkv" in name: config = self.config - kv_groups = config.num_attention_heads // config.num_key_value_heads + kv_groups = (config.num_attention_heads // + config.num_key_value_heads) head_dim = config.hidden_size // config.num_attention_heads loaded_weight = loaded_weight.view(-1, 2 + kv_groups, head_dim, diff --git a/vllm/model_executor/models/neuron/mistral.py b/vllm/model_executor/models/neuron/mistral.py new file mode 100755 index 000000000000..a302cce30aba --- /dev/null +++ b/vllm/model_executor/models/neuron/mistral.py @@ -0,0 +1,82 @@ +"""Inference-only Mistral model compatible with HuggingFace weights.""" +from typing import List, Optional, Tuple + +import torch +from torch import nn +from transformers import MistralConfig + +from vllm.model_executor.input_metadata import InputMetadata +from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import SamplerOutput +import os + +KVCache = Tuple[torch.Tensor, torch.Tensor] + + +class MistralForCausalLM(nn.Module): + + def __init__( + self, + config: MistralConfig, + linear_method=None, + ) -> None: + super().__init__() + self.config = config + self.linear_method = linear_method + self.model = None + self.lm_head = None + self.sampler = Sampler(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[KVCache], + input_metadata: InputMetadata, + ) -> SamplerOutput: + with torch.inference_mode(): + 
seq_ids = [] + block_size = self.model.context_buckets[-1] + if input_metadata.is_prompt: + seq_ids = input_metadata.slot_mapping[:, 0] // block_size + else: + seq_ids = input_metadata.block_tables + + logits = self.model(input_ids, + cache_ids=positions, + start_ids=seq_ids) + return logits + + def sample( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(self.model.chkpt_model.lm_head, + hidden_states, sampling_metadata) + return next_tokens + + def load_weights(self, + model_name_or_path: str, + cache_dir: Optional[str] = None, + load_format: str = "auto", + revision: Optional[str] = None, + **kwargs): + from transformers_neuronx.mistral.model import MistralForSampling + + split_model_dir = f"{model_name_or_path}-split" + if os.path.isdir(os.path.join(model_name_or_path, + "pytorch_model.bin")): + split_model_dir = model_name_or_path + elif not os.path.exists(f"{model_name_or_path}-split"): + from transformers import MistralForCausalLM + from transformers_neuronx.module import save_pretrained_split + + hf_model = MistralForCausalLM.from_pretrained( + model_name_or_path, low_cpu_mem_usage=True) + save_pretrained_split(hf_model, f"{model_name_or_path}-split") + + self.model = MistralForSampling.from_pretrained( + split_model_dir, **kwargs) + self.model.to_neuron() diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index fa7a6d850051..2b0a420e82fa 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -52,7 +52,8 @@ ) from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler -from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) from vllm.model_executor.parallel_utils.parallel_state import ( get_tensor_model_parallel_world_size, ) from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -81,7 +82,8 @@ def output_multiplier(self) -> float: class OlmoAttention(nn.Module): """ - This is the attention block where the output is computed as ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` + This is the attention block where the output is computed as + ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` (plus another skip connection). """ @@ -94,11 +96,12 @@ def __init__( self.config = config self.hidden_size = config.d_model assert config.d_model % config.n_heads == 0 - tensor_model_parallel_world_size = get_tensor_model_parallel_world_size( - ) + tensor_model_parallel_world_size = ( + get_tensor_model_parallel_world_size()) self.total_num_heads = self.config.n_heads assert self.total_num_heads % tensor_model_parallel_world_size == 0 - self.num_heads = self.total_num_heads // tensor_model_parallel_world_size + self.num_heads = (self.total_num_heads // + tensor_model_parallel_world_size) self.head_dim = self.hidden_size // self.total_num_heads # Layer norms. @@ -158,7 +161,8 @@ def forward( class OlmoMLP(nn.Module): """ - This is the MLP block where the output is computed as ``MLP(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` + This is the MLP block where the output is computed as + ``MLP(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` (plus another skip connection). 
""" @@ -217,7 +221,8 @@ def forward( class OlmoBlock(nn.Module): """ - This is a typical transformer block where the output is computed as ``MLP(LN(x + Attention(LN(x))))`` + This is a typical transformer block where the output is + computed as ``MLP(LN(x + Attention(LN(x))))`` (plus another skip connection). """ diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 4dd63f923e5f..12e0feddcb7f 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -170,7 +170,8 @@ def __init__( self.hidden_size = config.hidden_size # Requires transformers > 4.32.0 rope_theta = getattr(config, "rope_theta", 1000000) - use_sliding_window = config.use_sliding_window and layer_idx < config.max_window_layers + use_sliding_window = (config.use_sliding_window + and layer_idx < config.max_window_layers) self.self_attn = Qwen2Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, @@ -298,7 +299,11 @@ def __init__( self.config = config self.linear_method = linear_method self.model = Qwen2Model(config, linear_method) - self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + + if not config.tie_word_embeddings: + self.lm_head = ParallelLMHead(config.vocab_size, + config.hidden_size) + self.sampler = Sampler(config.vocab_size) def forward( @@ -317,7 +322,11 @@ def sample( hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(self.lm_head.weight, hidden_states, + if self.config.tie_word_embeddings: + lm_head_weight = self.model.embed_tokens.weight + else: + lm_head_weight = self.lm_head.weight + next_tokens = self.sampler(lm_head_weight, hidden_states, sampling_metadata) return next_tokens @@ -339,6 +348,8 @@ def load_weights(self, model_name_or_path, cache_dir, load_format, revision): if "rotary_emb.inv_freq" in name: continue + if self.config.tie_word_embeddings and "lm_head.weight" in name: + continue for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index d1a547f81561..c66f327beee7 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -1,5 +1,6 @@ # coding=utf-8 -# Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team. All rights reserved. +# Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team. +# All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -16,7 +17,8 @@ # This code is based off the following work: # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/modeling_stablelm_epoch.py # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/config.json -"""Inference-only StabeLM (https://github.com/Stability-AI/StableLM) model compatible with HuggingFace weights.""" +"""Inference-only StabeLM (https://github.com/Stability-AI/StableLM) +model compatible with HuggingFace weights.""" from typing import List, Optional, Tuple import torch @@ -102,9 +104,9 @@ def __init__(self, self.kv_size = self.num_key_value_heads * self.head_dim self.qkv_bias = getattr(config, "use_qkv_bias", False) if (self.head_dim * self.num_heads * tp_size) != self.hidden_size: - raise ValueError( - f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" - f" and `num_heads`: {self.num_heads}).") + raise ValueError(f"hidden_size must be divisible by num_heads " + f"(got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads}).") self.qkv_proj = QKVParallelLinear(self.hidden_size, self.head_dim, @@ -192,7 +194,6 @@ def __init__(self, config: PretrainedConfig, linear_method: Optional[LinearMethodBase] = None) -> None: super().__init__() - # self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id) self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index efa235233372..cfbb1bdb7909 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -35,7 +35,8 @@ from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding, ParallelLMHead, DEFAULT_VOCAB_PADDING_SIZE) -from vllm.model_executor.parallel_utils.parallel_state import get_tensor_model_parallel_world_size +from vllm.model_executor.parallel_utils.parallel_state import ( + get_tensor_model_parallel_world_size) from vllm.model_executor.weight_utils import (default_weight_loader, hf_model_weights_iterator) from vllm.sequence import SamplerOutput diff --git a/vllm/model_executor/neuron_model_loader.py b/vllm/model_executor/neuron_model_loader.py index b8d63d4ff12f..c434b270a556 100644 --- a/vllm/model_executor/neuron_model_loader.py +++ b/vllm/model_executor/neuron_model_loader.py @@ -34,7 +34,8 @@ def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]: def get_model(model_config: ModelConfig, device_config: DeviceConfig, **kwargs) -> nn.Module: - from transformers_neuronx.config import NeuronConfig, ContinuousBatchingConfig + from transformers_neuronx.config import (NeuronConfig, + ContinuousBatchingConfig) parallel_config = kwargs.get("parallel_config") scheduler_config = kwargs.get("scheduler_config") diff --git a/vllm/model_executor/parallel_utils/communication_op.py b/vllm/model_executor/parallel_utils/communication_op.py index cf805df892fd..6f00fd001d95 100644 --- a/vllm/model_executor/parallel_utils/communication_op.py +++ b/vllm/model_executor/parallel_utils/communication_op.py @@ -11,7 +11,8 @@ get_tensor_model_parallel_group, is_cupy_nccl_enabled_for_all_reduce, ) -from vllm.model_executor.parallel_utils.custom_all_reduce import custom_all_reduce +from vllm.model_executor.parallel_utils.custom_all_reduce import ( + custom_all_reduce) def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: @@ -24,7 +25,7 @@ def 
tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: and GPU topology. TLDR: always assume this function modifies its input, but use the return - value as the output. + value as the output. """ # Bypass the function if we are using only 1 GPU. if get_tensor_model_parallel_world_size() == 1: @@ -176,7 +177,7 @@ def broadcast_tensor_dict( for key, value in metadata_list: if isinstance(value, TensorMetadata): tensor = tensor_dict[key] - torch.distributed.broadcast(tensor, src=src) + torch.distributed.broadcast(tensor, src=src, group=group) else: recv_metadata_list = [None] torch.distributed.broadcast_object_list(recv_metadata_list, diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 7deb80801856..b23f0170a6ca 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -114,7 +114,8 @@ def from_sampling_metadata( do_penalties = True if (i < sampling_metadata.num_prompts and sampling_params.prompt_logprobs is not None): - # For tokens in the prompt that we only need to get their logprobs + # For tokens in the prompt that we only need to get + # their logprobs prompt_len = sampling_metadata.prompt_lens[i] temperatures += [temperature] * (prompt_len - 1) top_ps += [top_p] * (prompt_len - 1) diff --git a/vllm/outputs.py b/vllm/outputs.py index 4f9eddee11cd..b8173fd7a063 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -87,12 +87,12 @@ def __init__( @classmethod def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput": - # Get the top-n sequences. - n = seq_group.sampling_params.n seqs = seq_group.get_seqs() - if n == 1: + if len(seqs) == 1: top_n_seqs = seqs else: + # Get the top-n sequences. + n = seq_group.sampling_params.n if seq_group.sampling_params.use_beam_search: sorting_key = lambda seq: seq.get_beam_search_score( seq_group.sampling_params.length_penalty) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 8103f3c2b24b..4aa158878fb9 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -74,8 +74,8 @@ class SamplingParams: stop_token_ids: List of tokens that stop the generation when they are generated. The returned output will contain the stop tokens unless the stop tokens are special tokens. - include_stop_str_in_output: Whether to include the stop strings in output - text. Defaults to False. + include_stop_str_in_output: Whether to include the stop strings in + output text. Defaults to False. ignore_eos: Whether to ignore the EOS token and continue generating tokens after the EOS token is generated. max_tokens: Maximum number of tokens to generate per output sequence. diff --git a/vllm/sequence.py b/vllm/sequence.py index fee96a875dde..4a002edaf580 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -2,12 +2,16 @@ import copy import enum from dataclasses import dataclass -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Union, TYPE_CHECKING from vllm.block import LogicalTokenBlock from vllm.sampling_params import SamplingParams from vllm.lora.request import LoRARequest +if TYPE_CHECKING: + import torch + from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics + @dataclass class Logprob: @@ -81,6 +85,8 @@ class SequenceData: Args: prompt_token_ids: The token IDs of the prompt. + output_token_ids: The token IDs of the output. Set to an empty list if + None. Attributes: prompt_token_ids: The token IDs of the prompt. 
@@ -91,9 +97,13 @@ class SequenceData: def __init__( self, prompt_token_ids: List[int], + output_token_ids: Optional[List[int]] = None, ) -> None: + if output_token_ids is None: + output_token_ids = [] + self.prompt_token_ids = prompt_token_ids - self.output_token_ids: List[int] = [] + self.output_token_ids = output_token_ids self.cumulative_logprob = 0.0 def append_token_id(self, token_id: int, logprob: float) -> None: @@ -117,6 +127,12 @@ def get_last_token_id(self) -> int: return self.prompt_token_ids[-1] return self.output_token_ids[-1] + def get_prompt_token_ids(self) -> int: + return self.prompt_token_ids + + def get_output_token_ids(self) -> int: + return self.output_token_ids + def __repr__(self) -> str: return (f"SequenceData(" f"prompt_token_ids={self.prompt_token_ids}, " @@ -335,7 +351,8 @@ def maybe_set_first_token_time(self, time: float) -> None: self.metrics.first_token_time = time def maybe_set_first_scheduled_time(self, time: float) -> None: - """Sets the first scheduled time and time in queue for Request level timings.""" + """Sets the first scheduled time and time in queue for Request + level timings.""" if self.metrics.first_scheduled_time is None: self.metrics.first_scheduled_time = time self.metrics.time_in_queue = time - self.metrics.arrival_time @@ -506,6 +523,35 @@ def __eq__(self, other: object) -> bool: and self.prompt_logprobs == other.prompt_logprobs) -# For each sequence group, we generate a list of SequenceOutput object, -# each of which contains one possible candidate for the next token. -SamplerOutput = List[SequenceGroupOutput] +@dataclass +class SamplerOutput: + """For each sequence group, we generate a list of SequenceOutput object, + each of which contains one possible candidate for the next token. + + This datastructure implements methods so it can be used like a list, but + also has optional fields for device tensors. + """ + + outputs: List[SequenceGroupOutput] + + # On-device tensor containing probabilities of each token. + sampled_token_probs: Optional["torch.Tensor"] = None + + # On-device tensor containing the sampled token ids. + sampled_token_ids: Optional["torch.Tensor"] = None + + # Spec decode metrics populated by workers. + spec_decode_worker_metrics: Optional["SpecDecodeWorkerMetrics"] = None + + def __getitem__(self, idx: int): + return self.outputs[idx] + + def __setitem__(self, idx: int, value): + self.outputs[idx] = value + + def __len__(self): + return len(self.outputs) + + def __eq__(self, other: object): + return isinstance(other, + self.__class__) and self.outputs == other.outputs diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py new file mode 100644 index 000000000000..0f698fa34601 --- /dev/null +++ b/vllm/spec_decode/batch_expansion.py @@ -0,0 +1,358 @@ +from typing import Iterator, List, Tuple, Optional, Dict +from itertools import chain, count + +import torch + +from vllm.sequence import (SamplerOutput, SequenceGroupMetadata, SequenceData) +from vllm.worker.worker import Worker +from vllm.spec_decode.util import (nvtx_range, sampler_output_to_torch, + get_all_seq_ids, + split_batch_by_proposal_len) +from vllm.spec_decode.interfaces import (SpeculativeScorer, + SpeculativeProposals, + SpeculativeScores) + +SeqId = int +TargetSeqId = int +TokenId = int + + +class BatchExpansionTop1Scorer(SpeculativeScorer): + """Implements a speculative scorer that uses batch expansion to get + probabilities of speculative tokens according to the scoring model. 
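A small usage sketch for the reworked `SamplerOutput` in the vllm/sequence.py hunk above: it is now a dataclass wrapping `outputs: List[SequenceGroupOutput]` plus optional device tensors, while `__getitem__`, `__len__`, and `__eq__` keep the old list-style access working. The token ids and logprobs below are placeholders.

```python
from vllm.sequence import SamplerOutput, SequenceGroupOutput, SequenceOutput

# Hypothetical ids/tokens purely for illustration.
step = SequenceGroupOutput(
    samples=[
        SequenceOutput(parent_seq_id=0, output_token=42, logprobs={42: 0.0}),
    ],
    prompt_logprobs=None,
)

sampler_output = SamplerOutput(outputs=[step])

# The wrapper still behaves like the old List[SequenceGroupOutput].
assert len(sampler_output) == 1
assert sampler_output[0] is step

# Equality only compares the CPU-side outputs, so the optional device
# tensors (sampled_token_ids / sampled_token_probs) do not affect it.
assert sampler_output == SamplerOutput(outputs=[step])
```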
+ + Batch expansion converts a list of sequences and multiple query positions + to a new batch of sequences, each with a single query position. This allows + for MQA-like scoring in speculative decoding without requiring an MQA + kernel. + + It is strictly less efficient than MQA scoring. + + It only supports scoring the top1 proposal tokens of the proposer, instead + of topk/tree. + """ + + def __init__(self, scorer_worker: Worker, device: str, vocab_size: int): + self._scorer_worker = scorer_worker + self._device = device + self._vocab_size = vocab_size + + @nvtx_range("BatchExpansionTop1Scorer.score_proposals") + def score_proposals( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Optional[Dict[int, int]], + blocks_to_swap_out: Optional[Dict[int, int]], + blocks_to_copy: Optional[Dict[int, List[int]]], + k: int, + proposals: SpeculativeProposals, + ) -> SpeculativeScores: + """Score the proposed tokens via the scorer model. + + This converts each input sequence to a set of k+1 target sequences. The + target sequences have the unique continuations to be scored and a + unique sequence ID that is different from all input sequence ids. + + If a speculative sequence length would exceed the max model length, then + no speculation is produced for that sequence. + + Args: + seq_group_metadata_list: The input sequence group metadata. + blocks_to_swap_in: This is passed to the worker during scoring. + blocks_to_swap_out: This is passed to the worker during scoring. + blocks_to_copy: This is passed to the worker during scoring. + k: The fixed proposal length. + proposals: The speculative proposals to score. + Returns: + SpeculativeScores: The scores of each speculative token, along with + which sequences were ignored during scoring. + """ + + # TODO(cade) perform this on GPU to remove blocking call. + proposal_lens_list = proposals.proposal_lens.tolist() + proposal_token_ids_list = proposals.proposal_token_ids.tolist() + + (spec_indices, non_spec_indices, target_seq_group_metadata_list, + num_scoring_tokens) = self._expand_batch( + seq_group_metadata_list=seq_group_metadata_list, + proposal_token_ids_list=proposal_token_ids_list, + proposal_lens_list=proposal_lens_list, + ) + + target_sampler_output = self._scorer_worker.execute_model( + seq_group_metadata_list=target_seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + return_python_output=False) + + all_tokens, all_probs = self._contract_batch( + original_bs=len(seq_group_metadata_list), + target_sampler_output=target_sampler_output, + proposals=proposals, + num_scoring_tokens=num_scoring_tokens, + non_spec_indices=non_spec_indices, + spec_indices=spec_indices, + k=k, + ) + + return SpeculativeScores( + probs=all_probs, + token_ids=all_tokens, + ) + + def _expand_batch( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + proposal_token_ids_list: List[TokenId], + proposal_lens_list: List[int], + ) -> Tuple[List[int], List[int], List[SequenceGroupMetadata], int]: + """Given the input sequences and potentially multiple corresponding + proposal tokens, create a new batch where each sequence has a single + query token. + """ + + # vLLM currently only supports proposal lens equal to zero or the batch + # proposal len. This adds some complexity (splitting the batch into spec + # and non spec sequences) and should be removed in the future. It can be + # done by supporting per-sequence proposal lens. 
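The two `split_batch_by_proposal_len` calls that follow (the helper itself appears in the vllm/spec_decode/util.py hunk later in this diff) partition the batch into speculated and non-speculated sequences. A self-contained illustration of that selection logic, with made-up proposal lengths:

```python
# Hypothetical per-sequence proposal lengths: sequences 0 and 2 were
# speculated on (k = 3 drafts each), sequence 1 was skipped (length 0).
seq_group_metadata_list = ["seq_group_0", "seq_group_1", "seq_group_2"]  # stand-ins
proposal_lens = [3, 0, 3]

def split(select_proposal_len_zero: bool):
    # Mirrors split_batch_by_proposal_len in vllm/spec_decode/util.py.
    if select_proposal_len_zero:
        predicate = lambda proposal_len: proposal_len == 0
    else:
        predicate = lambda proposal_len: proposal_len != 0
    indices = [i for i, plen in enumerate(proposal_lens) if predicate(plen)]
    groups = [group for group, plen in zip(seq_group_metadata_list, proposal_lens)
              if predicate(plen)]
    return groups, indices

spec_seqs, spec_indices = split(select_proposal_len_zero=False)
non_spec_seqs, non_spec_indices = split(select_proposal_len_zero=True)

print(spec_indices)      # [0, 2] -> go through batch expansion and scoring
print(non_spec_indices)  # [1]    -> normal single-token decoding
```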
+ spec_seqs, spec_indices = split_batch_by_proposal_len( + seq_group_metadata_list, + proposal_lens_list, + select_proposal_len_zero=False) + non_spec_seqs, non_spec_indices = split_batch_by_proposal_len( + seq_group_metadata_list, + proposal_lens_list, + select_proposal_len_zero=True) + + target_seq_group_metadata_list = self._create_scoring_model_input( + spec_seqs, proposal_token_ids_list) + num_scoring_tokens = len(target_seq_group_metadata_list) + target_seq_group_metadata_list.extend(non_spec_seqs) + + return (spec_indices, non_spec_indices, target_seq_group_metadata_list, + num_scoring_tokens) + + def _contract_batch(self, original_bs: int, + target_sampler_output: List[SamplerOutput], + proposals: SpeculativeProposals, + num_scoring_tokens: int, non_spec_indices: List[int], + spec_indices: List[int], + k: int) -> Tuple[torch.Tensor, torch.Tensor]: + """Contract the expanded batch back into its original size. + This maps the scores of speculative tokens back to their original + sequences. + """ + (target_token_ids, target_probs, non_spec_target_token_ids, + non_spec_target_probs) = self._split_scoring_output( + target_sampler_output, num_scoring_tokens) + + # Map distinct sequences used to score each token + # of shape [batch_size * k + 1] back to [batch_size, k + 1]. + batch_size, k = proposals.proposal_token_ids.shape + + target_token_ids = target_token_ids.squeeze().reshape( + batch_size, k + 1) + target_probs = target_probs.squeeze().reshape(batch_size, k + 1, + self._vocab_size) + + all_tokens = torch.full(size=(original_bs, k + 1), + fill_value=-1, + device=self._device, + dtype=torch.long) + all_probs = torch.zeros(original_bs, + k + 1, + self._vocab_size, + device=self._device, + dtype=torch.float32) + + if non_spec_indices: + all_tokens[non_spec_indices, 0] = non_spec_target_token_ids + all_probs[non_spec_indices, :1, :] = non_spec_target_probs + + if spec_indices: + all_tokens[spec_indices] = target_token_ids + all_probs[spec_indices] = target_probs + + return all_tokens, all_probs + + def _create_scoring_model_input( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + proposal_token_ids: List[List[TokenId]], # shape: [batch_size, k] + ) -> List[SequenceGroupMetadata]: + """Given the original input sequences and proposed tokens from the draft + model, create a list of target sequences that can be used for scoring. + """ + + if not seq_group_metadata_list: + return [] + + target_seq_ids_iter = self._create_target_seq_id_iterator( + get_all_seq_ids(seq_group_metadata_list)) + + target_seq_group_metadata = list( + chain.from_iterable( + self._create_target_seq_group_metadata( + seq_group_metadata, + proposal_token_ids, + i, + target_seq_ids_iter, + ) for i, seq_group_metadata in enumerate( + seq_group_metadata_list))) + + return target_seq_group_metadata + + def _create_target_seq_group_metadata( + self, + input_seq_group_metadata: SequenceGroupMetadata, + proposal_token_ids: List[TokenId], # shape: [batch_size, k] + batch_index: int, + target_seq_ids_iter: Iterator[TargetSeqId], + ) -> List[SequenceGroupMetadata]: + """Given an input sequence group metadata and a list of draft tokens, + create a list of target SequenceGroupMetadata, one for each + token id that needs to be scored. + + Naive speculative decoding requires K target model scores, one for each + draft model token. However one can add a bonus token such that if each + token is accepted, then a final token may be sampled from the model. 
+ This function creates K+1 target SequenceGroupMetadata to take + advantage of the bonus token. + """ + assert not input_seq_group_metadata.is_prompt, ( + "Speculating on " + "prompts not yet supported") + assert len(input_seq_group_metadata.seq_data) == 1, ( + "Beam search " + "not supported in speculative decoding") + input_seq_id = next(iter(input_seq_group_metadata.seq_data.keys())) + + token_ids_to_score = self._get_token_ids_to_score( + proposal_token_ids[batch_index]) + + target_seq_group_metadata_list: List[SequenceGroupMetadata] = [] + for token_ids in token_ids_to_score: + target_seq_group_metadata_list.append( + self._create_single_target_seq_group_metadata( + input_seq_group_metadata, + input_seq_id, + next(target_seq_ids_iter), + token_ids, + )) + + return target_seq_group_metadata_list + + def _create_single_target_seq_group_metadata( + self, + seq_group_metadata: SequenceGroupMetadata, + seq_id: SeqId, + target_seq_id: TargetSeqId, + token_ids: List[TokenId], + ) -> SequenceGroupMetadata: + """Create a single target SequenceGroupMetadata. + + Args: + seq_group_metadata: The metadata for the input sequence. + seq_id: The input sequence ID. + target_seq_id: The corresponding target sequence ID. + token_ids: The list of token ids that are to be appended to the + input sequence. + """ + seq_data = seq_group_metadata.seq_data[seq_id] + prompt_token_ids = seq_data.get_prompt_token_ids() + new_output_token_ids = [*seq_data.get_output_token_ids(), *token_ids] + + return SequenceGroupMetadata( + request_id=seq_group_metadata.request_id, + is_prompt=seq_group_metadata.is_prompt, + seq_data={ + target_seq_id: + SequenceData( + prompt_token_ids=prompt_token_ids, + output_token_ids=new_output_token_ids, + ), + }, + sampling_params=seq_group_metadata.sampling_params, + block_tables={ + target_seq_id: seq_group_metadata.block_tables[seq_id], + }, + lora_request=None, + ) + + def _split_scoring_output( + self, sampler_output: SamplerOutput, num_scoring_tokens: int + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Split the target model output into speculative and non-speculative + output. + """ + + # vLLM currently only supports proposal lens equal to zero or the batch + # proposal len. This adds some complexity (splitting the batch into spec + # and non spec sequences) and should be removed in the future. It can be + # done by supporting per-sequence proposal lens. + # + # First samples are from speculative scoring, latter samples are non- + # speculative samples. + split_sizes = [ + num_scoring_tokens, + sampler_output.sampled_token_ids.numel() - num_scoring_tokens + ] + (spec_probs, non_spec_probs + ) = sampler_output.sampled_token_probs.split(split_sizes) + (spec_sampled_tokens, non_spec_sampled_tokens + ) = sampler_output.sampled_token_ids.flatten().split(split_sizes) + + # Convert scores to tensors. + sampler_output.sampled_token_probs = spec_probs + sampler_output.sampled_token_ids = spec_sampled_tokens + target_token_ids, target_probs = sampler_output_to_torch( + [sampler_output]) + + # Convert non-speculative output tokens to tensors. 
+ sampler_output.sampled_token_probs = non_spec_probs + sampler_output.sampled_token_ids = non_spec_sampled_tokens + non_spec_target_token_ids, non_spec_target_probs = ( + sampler_output_to_torch([sampler_output])) + + return (target_token_ids, target_probs, non_spec_target_token_ids, + non_spec_target_probs) + + def _create_target_seq_id_iterator( + self, seq_ids: List[SeqId]) -> Iterator[TargetSeqId]: + """Create an iterator for creating target sequence ids. + Target sequence ids are distinct from sequence ids because we create a + distinct target sequence id for each proposal token to be scored. + + This implementation increments a counter starting at 1 + max of all + provided input sequence ids. + """ + return count(start=max(seq_ids) + 1) + + def _get_token_ids_to_score( + self, + full_spec_token_ids: List[TokenId] # shape: [k] + ) -> List[List[TokenId]]: + """Given an int tensor of proposal token ids, return a list of + token ids that should be scored. + + Returns k+1 output lists. The additional one is used for generating the + bonus token. + + Example: + Input: [0, 1, 2, 3] (k=4) + Output: (k+1 lists) + [] + [0] + [0, 1] + [0, 1, 2] + [0, 1, 2, 3] + """ + empty_token_ids = [] + + token_ids_to_score = [empty_token_ids] + token_ids_to_score.extend([ + full_spec_token_ids[:i + 1] + for i in range(len(full_spec_token_ids)) + ]) + return token_ids_to_score diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py new file mode 100644 index 000000000000..9e53ffb60ac3 --- /dev/null +++ b/vllm/spec_decode/interfaces.py @@ -0,0 +1,77 @@ +from typing import List, Tuple, Optional, Dict +from dataclasses import dataclass +from abc import ABC, abstractmethod + +import torch + +from vllm.sequence import SequenceGroupMetadata + + +@dataclass +class SpeculativeProposals: + """Datastructure used to represent proposal tokens from some proposer. It + also tracks how many speculative tokens each sequence has. + """ + + # Speculative proposal tokens. + proposal_token_ids: torch.Tensor + + # Probabilities of the proposal tokens according to the proposer. + proposal_probs: torch.Tensor + + # The valid length of each proposal; can be zero. + proposal_lens: torch.Tensor + + def __repr__(self): + return (f"SpeculativeProposals(" + f"proposal_token_ids={self.proposal_token_ids.shape}, " + f"proposal_probs={self.proposal_probs.shape}, " + f"proposal_lens={self.proposal_lens.shape})") + + +@dataclass +class SpeculativeScores: + """Datastructure used to represent the scores of speculative tokens + according to the scoring model. + """ + + # Probabilities of the speculative tokens according to the scoring model. + probs: torch.Tensor + + # Token ids sampled from the scoring model. Used for speculative bonus + # tokens and also non-speculative normal decoding. 
+ token_ids: torch.Tensor + + def __repr__(self): + return (f"SpeculativeScores(" + f"probs={self.probs.shape}, " + f"token_ids={self.token_ids.shape})") + + +class SpeculativeProposer(ABC): + + @abstractmethod + def get_proposals( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + max_proposal_len: int, + ) -> SpeculativeProposals: + raise NotImplementedError + + +class SpeculativeScorer(ABC): + + @abstractmethod + def score_proposals( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Optional[Dict[int, int]], + blocks_to_swap_out: Optional[Dict[int, int]], + blocks_to_copy: Optional[Dict[int, List[int]]], + k: int, + proposals: SpeculativeProposals, + ) -> Tuple[torch.Tensor, torch.Tensor]: + raise NotImplementedError diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py new file mode 100644 index 000000000000..65a2a4a63a98 --- /dev/null +++ b/vllm/spec_decode/metrics.py @@ -0,0 +1,174 @@ +import torch +from dataclasses import dataclass +from vllm.model_executor.layers.rejection_sampler import RejectionSampler +from typing import Optional +from vllm.utils import in_wsl +import time +from typing import Callable + + +@dataclass +class SpecDecodeWorkerMetrics: + """Dataclass holding metrics emitted from the spec decode worker. + """ + + # The empirical acceptance rate of the proposal method on a per-token basis. + # This is useful for evaluating how well the proposal method aligns with the + # scoring method. + draft_acceptance_rate: float + + # The empirical efficiency, measured as the number of tokens emitted by the + # system divided by the number of tokens that could be emitted by the system + # if the proposal method were perfect. + system_efficiency: float + + # The number of speculative tokens produced by the proposal method. + draft_tokens: int + + # The number of tokens emitted by the entire system. + emitted_tokens: int + + # The number of tokens accepted by the scoring model and verification + # routine, e.g. Llama2-70B and lossless rejection sampling. + # + # NOTE: Any token accepted by the verification routine is considered + # accepted (regardless of if the speculative prefix is also accepted). The + # user will usually see less accepted tokens. This metric is helpful when + # evaluating alignment of the proposal method with the scoring model. + accepted_tokens: int + + # The number of speculative tokens per sequence. + num_spec_tokens: int + + +Timer = Callable[[], float] + + +class AsyncMetricsCollector: + """Class which copies rejection sampler metrics from the device to CPU on a + non-default Torch stream. + """ + + def __init__(self, + rejection_sampler: RejectionSampler, + timer: Optional[Timer] = None, + collect_interval_s: float = 5.0): + self._rejection_sampler = rejection_sampler + self._timer = time.time if timer is None else timer + + self._rank: Optional[int] = None + + # We don't have a device set yet. 
+ self._copy_stream: Optional[torch.cuda.Stream] = None + + self._in_flight_copy: Optional[torch.cuda.Event] = None + + pin_memory = not in_wsl() + self._aggregate_num_accepted_tokens = torch.tensor( + 0, dtype=torch.long, device="cpu", pin_memory=pin_memory) + self._aggregate_num_emitted_tokens = torch.tensor( + 0, dtype=torch.long, device="cpu", pin_memory=pin_memory) + self._aggregate_num_draft_tokens = 0 + + self._rejsample_metrics_collect_interval_s = collect_interval_s + self._last_metrics_collect_time = self._timer() + + def init_gpu_tensors(self, rank: int) -> None: + self._rank = rank + self._copy_stream = torch.cuda.Stream() + + def maybe_collect_rejsample_metrics( + self, k: int) -> Optional[SpecDecodeWorkerMetrics]: + + # If a copy was initiated in the previous call, collect and return. + if self._in_flight_copy is not None: + ready_event = self._in_flight_copy + self._in_flight_copy = None + return self._collect_rejsample_metrics(k, ready_event) + + # Otherwise, check if we should start a new copy. + if self._should_collect_rejsample_metrics(self._timer()): + assert self._in_flight_copy is None + self._in_flight_copy = self._copy_rejsample_metrics_async() + + return None + + def _should_collect_rejsample_metrics(self, now: float) -> bool: + """Return whether or not this iteration should print rejection sampling + metrics. + """ + if self._rank != 0: + return False + + if (now - self._last_metrics_collect_time < + self._rejsample_metrics_collect_interval_s): + return False + return True + + def _copy_rejsample_metrics_async(self) -> torch.cuda.Event: + """Copy rejection sampling metrics (number of accepted tokens, etc) to + CPU asynchronously. + + Returns a CUDA event recording when the copy is complete. + """ + self._copy_stream.wait_stream(torch.cuda.current_stream()) + + with torch.cuda.stream(self._copy_stream): + self._aggregate_num_accepted_tokens.copy_( + self._rejection_sampler.num_accepted_tokens, non_blocking=True) + self._aggregate_num_emitted_tokens.copy_( + self._rejection_sampler.num_emitted_tokens, non_blocking=True) + # Number of draft tokens is calculated on CPU, so no copy is + # required. + self._aggregate_num_draft_tokens = ( + self._rejection_sampler.num_draft_tokens) + + aggregate_metrics_ready = torch.cuda.Event() + aggregate_metrics_ready.record(self._copy_stream) + + return aggregate_metrics_ready + + def _collect_rejsample_metrics( + self, k: int, + ready_event: torch.cuda.Event) -> SpecDecodeWorkerMetrics: + """Create metrics object from statistics copied asynchronously. + + Args: + k: int. The number of speculative tokens; used to determine system + efficiency. + ready_event: torch.cuda.Event. The CUDA event recording when the + async GPU->CPU copy is complete. 
+ """ + + ready_event.synchronize() + accepted_tokens = self._aggregate_num_accepted_tokens.item() + emitted_tokens = self._aggregate_num_emitted_tokens.item() + draft_tokens = self._aggregate_num_draft_tokens + + num_possible_tokens = self.get_max_num_accepted_tokens(draft_tokens, k) + + if draft_tokens > 0: + draft_acceptance_rate = accepted_tokens / draft_tokens + else: + draft_acceptance_rate = float("nan") + + if num_possible_tokens > 0: + system_efficiency = emitted_tokens / num_possible_tokens + else: + system_efficiency = float("nan") + + return SpecDecodeWorkerMetrics( + num_spec_tokens=k, + draft_acceptance_rate=draft_acceptance_rate, + system_efficiency=system_efficiency, + accepted_tokens=accepted_tokens, + draft_tokens=draft_tokens, + emitted_tokens=emitted_tokens, + ) + + @staticmethod + def get_max_num_accepted_tokens(draft_tokens: int, k: int) -> int: + # Divide by k since batch size can be variable. + total_num_spec_seqs = draft_tokens / k + num_accepted_per_seq_if_all_accepted = k + 1 + return int(total_num_spec_seqs / num_accepted_per_seq_if_all_accepted) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py new file mode 100644 index 000000000000..0915c275b040 --- /dev/null +++ b/vllm/spec_decode/multi_step_worker.py @@ -0,0 +1,370 @@ +from typing import List, Dict, Optional, Tuple +import copy + +import torch + +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.worker.worker import Worker +from vllm.spec_decode.interfaces import (SpeculativeProposals, + SpeculativeProposer) +from vllm.spec_decode.util import sampler_output_to_torch + + +class MultiStepWorker(Worker): + """The MultiStepWorker is equivalent to a Worker except that it allows + multiple forward passes in a single call, assuming the scheduler has + allocated enough space to store the additional KV. This reduces overhead + by invoking the scheduler less. + + The MultiStepWorker does not support cache swap operations, or beam search. + Cache swap operations do not require large modifications. On the other hand, + beam search requires memory allocations during sequence forks and thus + requires more thought for MultiStepWorker support. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self._proposer: Optional[DraftModelTop1Proposer] = None + + def init_model(self): + super().init_model() + + self._proposer = DraftModelTop1Proposer( + self, + self.device, + self.max_model_len, + self.vocab_size, + ) + + @torch.inference_mode() + def execute_model_multi_step( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + num_steps: int, + ) -> List[SamplerOutput]: + """Run the model forward pass num_steps times. Returns the list of + sampler output, one per model forward pass. + """ + self._raise_if_unsupported(seq_group_metadata_list, blocks_to_swap_in, + blocks_to_swap_out, blocks_to_copy) + + # Shallow copy input data so modifications (such as appending tokens) + # do not cause side-effects. + copied_seq_group_metadata_list = self._shallow_copy_inputs( + seq_group_metadata_list) + + # Assert enough KV space for num_steps tokens per sequence. + self._assert_enough_kv_space(seq_group_metadata_list, num_steps) + + # Run model num_steps times. 
+ model_outputs = [] + for _ in range(num_steps): + model_output = super().execute_model( + seq_group_metadata_list=copied_seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ) + + self._append_new_tokens(model_output, + copied_seq_group_metadata_list) + model_outputs.append(model_output) + + return model_outputs + + def get_spec_proposals( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + max_proposal_len: int, + ) -> SpeculativeProposals: + """Produce speculations given an input batch of sequences. The number of + speculative tokens per sequence is determined by max_proposal_len. + """ + + return self._proposer.get_proposals( + seq_group_metadata_list, + blocks_to_swap_in, + blocks_to_swap_out, + blocks_to_copy, + max_proposal_len, + ) + + def _append_new_tokens( + self, model_output: SamplerOutput, + seq_group_metadata_list: SequenceGroupMetadata) -> None: + """Given model output from a single run, append the tokens to the + sequences. This is normally done outside of the worker, but it is + required if the worker is to perform multiple forward passes. + """ + for seq_group_metadata, sequence_group_outputs in zip( + seq_group_metadata_list, model_output): + seq_group_metadata.is_prompt = False + + for seq_output in sequence_group_outputs.samples: + # NOTE: Beam search is not supported, so we can assume that + # parent_seq_id == seq_id. + seq = seq_group_metadata.seq_data[seq_output.parent_seq_id] + + token_id = seq_output.output_token + token_logprob = seq_output.logprobs[token_id] + + seq.append_token_id(token_id, token_logprob.logprob) + + def _shallow_copy_inputs( + self, seq_group_metadata_list: List[SequenceGroupMetadata] + ) -> List[SequenceGroupMetadata]: + """Copy input data structures to remove side-effects when input data + structures are shared with other modules. + + Helpful when the vLLM scheduler runs in the same process as the worker. + The alternative is deep-copying (or other form of deep copy); this has + performance downsides. + """ + + # Shallow-copy the list of SequenceGroupMetadata. This allows us to + # append tokens and change is_prompt without external side-effects. + new_seq_group_metadata_list = [] + + for old_seq_group_metadata in seq_group_metadata_list: + # We must shallow-copy seq_group_metadata as is_prompt could change. + seq_group_metadata = copy.copy(old_seq_group_metadata) + new_seq_group_metadata_list.append(seq_group_metadata) + + # We must shallow-copy seq_data as we will append token ids + new_seq_data = {} + for seq_id, old_seq_data in seq_group_metadata.seq_data.items(): + new_seq_data[seq_id] = copy.copy(old_seq_data) + new_seq_data[ + seq_id].output_token_ids = old_seq_data.output_token_ids[:] + + seq_group_metadata.seq_data = new_seq_data + + return new_seq_group_metadata_list + + def _assert_enough_kv_space( + self, seq_group_metadata_list: List[SequenceGroupMetadata], + num_steps: int) -> None: + """Assert there are enough physical blocks per sequence to store the + current KV plus additional KV from num_steps tokens. + """ + assert self.model_runner.block_size is not None + for seq_group_metadata in seq_group_metadata_list: + # Only one seq_id is guaranteed because there is no beam search. 
+ seq_id = list(seq_group_metadata.seq_data.keys())[0] + seq = seq_group_metadata.seq_data[seq_id] + + # After num_steps, the seq len will be the current seq len + # plus one token per step. + final_seq_len = seq.get_len() + num_steps + + # We will have final_seq_len - 1 KV because vLLM saves KV for a + # token in the iteration after the token was generated. + required_num_kv_slots = final_seq_len - 1 + + # The allocated number of kv slots is the number of allocated blocks + # times the number of slots of block. + number_physical_blocks = len( + seq_group_metadata.block_tables[seq_id]) + allocated_kv_slots = (number_physical_blocks * + self.model_runner.block_size) + + if required_num_kv_slots > allocated_kv_slots: + request_id = seq_group_metadata.request_id + raise ValueError( + "The worker attempted to run " + f"{num_steps} times but found insufficient KV space for " + f"{request_id=} {seq_id=}. ({allocated_kv_slots=} " + f"{required_num_kv_slots=}).") + + def _raise_if_unsupported( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> None: + """MultiStepWorker does not yet implement support for cache swap + operations or beam search. + """ + if any([blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy]): + raise NotImplementedError( + "MultiStepWorker does not support cache operations") + + if any( + len(seq_group_metadata.seq_data.keys()) != 1 + for seq_group_metadata in seq_group_metadata_list): + raise NotImplementedError( + "MultiStepWorker does not support beam search.") + + +class DraftModelTop1Proposer(SpeculativeProposer): + """Helper class which separates out sequences which would exceed the max + model length when speculated upon. + + This allows combinations of models such as JackFram/llama-68m draft with + meta-llama/Llama2-13b-chat-hf, as llama-68m has max_position_embeddings of + 2048 while Llama2-13b has max_position_embeddings of 4096. + + We treat the sequences which exceed the proposal draft model length as + "non-spec sequences". Essentially they skip the draft model and go through + normal decoding in the target model. + + Currently, only proposal_lens of 0 and k are supported, where k is a global + batch proposal length. In the future vLLM should support per-sequence + proposal lengths. + """ + + def __init__( + self, + draft_worker: MultiStepWorker, + device: str, + max_model_len: int, + vocab_size: int, + ): + self._draft_worker = draft_worker + self._device = device + self._max_model_len = max_model_len + self._vocab_size = vocab_size + + def get_proposals( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + max_proposal_len: int, + ) -> SpeculativeProposals: + """Get speculative proposals given the input batch. + + Sequences which would exceed the max model length are skipped during + speculation. + """ + + # Split speculative- and non-speculative- sequences. + (proposal_lens, nonzero_proposal_len_seqs, + nonzero_proposal_len_indices) = self._split_by_max_model_len( + seq_group_metadata_list, max_proposal_len) + + if nonzero_proposal_len_seqs: + # Speculate tokens using the draft worker for the speculative + # sequences. 
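To illustrate the length guard described in the `DraftModelTop1Proposer` docstring above (and applied in `_split_by_max_model_len` below): a sequence only gets a nonzero proposal length if it can still fit the k proposed tokens within the draft model's context. The limits and sequence lengths here are made up.

```python
# Draft model context limit (e.g. a small draft model with 2048 positions)
# and a global speculation length k. Values are illustrative.
draft_max_model_len = 2048
k = 5

# Current lengths (prompt + generated so far) of three sequences.
seq_lens = [100, 2045, 2047]

proposal_lens = []
for seq_len in seq_lens:
    # Mirrors the check in _split_by_max_model_len: only speculate if the
    # sequence plus k proposed tokens still fits in the draft model.
    if seq_len + k < draft_max_model_len:
        proposal_lens.append(k)
    else:
        proposal_lens.append(0)  # skip the draft model; normal decoding

print(proposal_lens)  # [5, 0, 0]
```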
+ maybe_sampler_output = self._draft_worker.execute_model_multi_step( + seq_group_metadata_list=nonzero_proposal_len_seqs, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + num_steps=max_proposal_len, + ) + else: + # If no sequences can be speculated, set sampler output to None. + maybe_sampler_output = None + + # Combine speculative- and non-speculative sequences into the same + # representation. + proposal_tokens, proposal_probs, proposal_lens = self._merge_outputs( + batch_size=len(seq_group_metadata_list), + max_proposal_len=max_proposal_len, + maybe_sampler_output=maybe_sampler_output, + proposal_lens=proposal_lens, + nonzero_proposal_len_indices=nonzero_proposal_len_indices, + ) + + proposals = SpeculativeProposals( + proposal_token_ids=proposal_tokens, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens, + ) + + return proposals + + def _split_by_max_model_len( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + max_proposal_len: int, + ) -> Tuple[List[int], List[SequenceGroupMetadata], List[int]]: + """Determine which sequences would exceed the max model length. + """ + + proposal_lens: List[int] = [] + nonzero_proposal_len_seqs: List[SequenceGroupMetadata] = [] + nonzero_proposal_len_indices: List[int] = [] + for i, seq_group_metadata in enumerate(seq_group_metadata_list): + seq_data = next(iter(seq_group_metadata.seq_data.values())) + seq_len = seq_data.get_len() + + # Currently only proposal lens of 0 or the global batch proposal len + # are supported. + if seq_len + max_proposal_len < self._max_model_len: + proposal_lens.append(max_proposal_len) + nonzero_proposal_len_seqs.append(seq_group_metadata) + nonzero_proposal_len_indices.append(i) + else: + proposal_lens.append(0) + + return (proposal_lens, nonzero_proposal_len_seqs, + nonzero_proposal_len_indices) + + def _merge_outputs( + self, + batch_size: int, + max_proposal_len: int, + maybe_sampler_output: Optional[SamplerOutput], + proposal_lens: List[int], + nonzero_proposal_len_indices: List[int], + ) -> Tuple[torch.Tensor, torch.tensor, torch.Tensor]: + """After speculations are produced, merge the speculation results with + the skipped sequences. + """ + if maybe_sampler_output is None: + # If no speculative tokens, the sampler output will be None. + # In this case we return empty tensors. + proposal_tokens = torch.zeros(0, + max_proposal_len, + dtype=torch.long, + device=self._device) + proposal_probs = torch.zeros(0, + max_proposal_len, + self._vocab_size, + dtype=torch.float32, + device=self._device) + proposal_lens = torch.zeros(len(proposal_lens), + dtype=torch.long, + device=self._device) + return proposal_tokens, proposal_probs, proposal_lens + + sampler_output = maybe_sampler_output + + proposal_tokens, proposal_probs = sampler_output_to_torch( + sampler_output) + + # Now, reformat the output GPU tensors such that each sequence has + # a proposal. the proposal can be empty, e.g. 
[-1, -1, -1] + + entire_proposal_tokens = torch.full(size=(batch_size, + *proposal_tokens.shape[1:]), + fill_value=-1, + dtype=torch.long, + device=self._device) + entire_proposal_tokens[nonzero_proposal_len_indices] = proposal_tokens + entire_proposal_probs = torch.zeros(batch_size, + *proposal_probs.shape[1:], + dtype=torch.float32, + device=self._device) + entire_proposal_probs[nonzero_proposal_len_indices] = proposal_probs + + proposal_tokens, proposal_probs = (entire_proposal_tokens, + entire_proposal_probs) + + proposal_lens = torch.zeros(batch_size, + dtype=torch.long, + device=self._device) + proposal_lens[nonzero_proposal_len_indices] = max_proposal_len + + return proposal_tokens, proposal_probs, proposal_lens diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py new file mode 100644 index 000000000000..1e5674134700 --- /dev/null +++ b/vllm/spec_decode/spec_decode_worker.py @@ -0,0 +1,375 @@ +from typing import List, Tuple, Optional, Dict +from functools import cached_property + +import torch + +from vllm.spec_decode.metrics import AsyncMetricsCollector +from vllm.sequence import (SamplerOutput, SequenceGroupMetadata, + SequenceGroupOutput, SequenceOutput) +from vllm.worker.worker import Worker +from vllm.spec_decode.multi_step_worker import MultiStepWorker +from vllm.model_executor.layers.rejection_sampler import RejectionSampler +from vllm.config import CacheConfig +from vllm.spec_decode.util import (nvtx_range, get_all_seq_ids, + split_batch_by_proposal_len) +from vllm.spec_decode.interfaces import SpeculativeProposals, SpeculativeScores +from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer +from vllm.spec_decode.interfaces import SpeculativeScorer + + +class SpecDecodeWorker: + """Worker which implements speculative decoding. + + Speculative decoding reduces decoding per-token latency by using a proposal + method, such as a small draft model, to speculate ahead of a larger LLM. The + probabilities of the speculative tokens are then determined by the larger + LLM, after which some verification routine determines which (if any) of the + speculative tokens are accepted by the larger LLM. + + See https://github.com/vllm-project/vllm/pull/2188 and + https://github.com/vllm-project/vllm/pull/3103 for more info. + + The current implementation has the following limitations: + * Only draft-model proposal is implemented (contributions for more forms are + welcome!). + * Only top-1 proposal and scoring are implemented. Tree-attention is left as + future work. + * Only lossless rejection sampling is supported. Contributions adding lossy + verification routines are welcome (e.g. Medusa's typical acceptance). + * All sequences in a batch must have the same proposal length, or zero. This + can be improved by having per-sequence speculation in the future. + * The scoring forward pass is done without an MQA kernel, which is + suboptimal especially as the batch size, proposal length, and sequence + lengths grow. Contributions to add a MQA scoring are welcome once + correctness tests pass. + More info here https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit. + """ + + def __init__( + self, + proposer_worker: MultiStepWorker, + scorer_worker: Worker, + rejection_sampler: RejectionSampler, + metrics_collector: Optional[AsyncMetricsCollector] = None, + ): + """ + Create a SpecDecodeWorker. + + Args: + proposer_worker: A worker that can produce speculative tokens for + sequences. 
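Backing up to `_merge_outputs` just above: sequences that skipped speculation keep a row of -1 proposal tokens and a proposal length of 0, so the merged tensors cover the whole batch. A tiny sketch with invented shapes:

```python
import torch

# Hypothetical merged-proposal shapes: batch of 3 sequences, k = 2,
# but only sequences 0 and 2 were actually speculated on.
batch_size, k = 3, 2
nonzero_proposal_len_indices = [0, 2]
proposal_tokens = torch.tensor([[11, 12],
                                [21, 22]])  # draft tokens for seqs 0 and 2

# Skipped sequences keep -1 tokens and a zero proposal length,
# mirroring _merge_outputs above.
entire_proposal_tokens = torch.full((batch_size, k), -1, dtype=torch.long)
entire_proposal_tokens[nonzero_proposal_len_indices] = proposal_tokens

proposal_lens = torch.zeros(batch_size, dtype=torch.long)
proposal_lens[nonzero_proposal_len_indices] = k

print(entire_proposal_tokens)
# tensor([[11, 12],
#         [-1, -1],
#         [21, 22]])
print(proposal_lens)  # tensor([2, 0, 2])
```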
+ scorer_worker: A worker that produces probabilities of speculative + tokens according to some base model. Typically a vanilla vLLM + Worker. + rejection_sampler: A Torch module used to perform modified rejection + sampling for speculative decoding. + metrics_collector: Helper class for collecting metrics; can be set + for testing purposes. + """ + self.proposer_worker = proposer_worker + self.scorer_worker = scorer_worker + self.rejection_sampler = rejection_sampler + + self._metrics = AsyncMetricsCollector( + rejection_sampler + ) if metrics_collector is None else metrics_collector + + self.probs_dtype = self.rejection_sampler.probs_dtype + self.token_id_dtype = self.rejection_sampler.token_id_dtype + + self.scorer: SpeculativeScorer = None + + def init_model(self) -> None: + """Initialize both scorer and proposer models. + """ + # The scorer worker model is initialized first in case the proposer + # model has a smaller TP degree than the target worker. + self.scorer_worker.init_model() + self.proposer_worker.init_model() + + self._metrics.init_gpu_tensors(self.rank) + self.rejection_sampler.init_gpu_tensors(self.rank) + self.scorer = BatchExpansionTop1Scorer( + scorer_worker=self.scorer_worker, + device=self.device, + vocab_size=self._vocab_size) + + def profile_num_available_blocks(self, block_size: int, + gpu_memory_utilization: float, + cpu_swap_space: int, + cache_dtype: str) -> Tuple[int, int]: + """Determine the number of cache blocks to use. + + This is done by profiling the scorer model (which is typically the + larger of the two). Then the total memory which would be used by the + scorer cache is divided evenly between the proposer and scorer model KV, + such that the number of blocks is equal in both KV caches. + """ + num_gpu_blocks, num_cpu_blocks = ( + self.scorer_worker.profile_num_available_blocks( + block_size, gpu_memory_utilization, cpu_swap_space, + cache_dtype)) + + scorer_cache_block_size_bytes = ( + self.scorer_worker.get_cache_block_size_bytes( + block_size, cache_dtype)) + proposer_cache_block_size_bytes = ( + self.proposer_worker.get_cache_block_size_bytes( + block_size, cache_dtype)) + + new_num_gpu_blocks = split_num_cache_blocks_evenly( + scorer_cache_block_size_bytes, proposer_cache_block_size_bytes, + num_gpu_blocks) + return new_num_gpu_blocks, num_cpu_blocks + + def init_cache_engine(self, cache_config: CacheConfig): + """Initialize the cache engine of the scorer and proposer workers. + """ + self.scorer_worker.init_cache_engine(cache_config) + self.proposer_worker.init_cache_engine(cache_config) + + @torch.inference_mode() + def execute_model( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Optional[Dict[int, int]], + blocks_to_swap_out: Optional[Dict[int, int]], + blocks_to_copy: Optional[Dict[int, List[int]]], + num_spec_tokens: int, + ) -> List[SamplerOutput]: + """Perform speculative decoding on the input batch. + """ + + assert seq_group_metadata_list is not None, ( + "speculative decoding " + "requires non-None seq_group_metadata_list") + + # If no spec tokens, call the proposer and scorer workers normally. + # Used for prefill. 
+ if num_spec_tokens == 0 or len(seq_group_metadata_list) == 0: + return self._run_no_spec( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ) + + return self._run_speculative_decoding_step( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + k=num_spec_tokens, + ) + + @nvtx_range("spec_decode_worker._run_no_spec") + def _run_no_spec( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Optional[Dict[int, int]], + blocks_to_swap_out: Optional[Dict[int, int]], + blocks_to_copy: Optional[Dict[int, List[int]]], + ) -> List[SamplerOutput]: + """Run a prefill step, without any speculation. The input is sent to the + proposer and scorer model so that the KV cache is consistent between the + two. + """ + + self.proposer_worker.execute_model( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + return_python_output=False) + + sampler_output = self.scorer_worker.execute_model( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ) + + # Clear device tensors from sampler output. This reduces communication + # overhead when the engine runs in a different process than the workers. + sampler_output.probs = None + sampler_output.sampled_tokens = None + return [sampler_output] + + @nvtx_range("spec_decode_worker._run_speculative_decoding_step") + def _run_speculative_decoding_step( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Optional[Dict[int, int]], + blocks_to_swap_out: Optional[Dict[int, int]], + blocks_to_copy: Optional[Dict[int, List[int]]], + k: int, + ) -> List[SamplerOutput]: + """Execute a single step of speculative decoding. + + This invokes the proposer worker to get k speculative tokens for each + sequence, then scores each speculative token using the scoring worker. + + Returns a list of SamplerOutput, each containing a single token per + sequence. + """ + + # Generate proposals using draft worker. + proposals = self.proposer_worker.get_spec_proposals( + seq_group_metadata_list, blocks_to_swap_in, blocks_to_swap_out, + blocks_to_copy, k) + + proposal_scores = self.scorer.score_proposals( + seq_group_metadata_list, + blocks_to_swap_in, + blocks_to_swap_out, + blocks_to_copy, + k, + proposals, + ) + + accepted_token_ids = self._verify_tokens(seq_group_metadata_list, + proposal_scores, proposals, k) + + return self._create_output_sampler_list(seq_group_metadata_list, + accepted_token_ids, k) + + @nvtx_range("spec_decode_worker._verify_tokens") + def _verify_tokens( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + proposal_scores: SpeculativeScores, + proposals: SpeculativeProposals, + max_proposal_len: int, + ) -> torch.Tensor: + """Determine which speculative tokens are accepted using the + probabilities of each token according to the proposer and scorer models. + """ + proposal_lens_list = proposals.proposal_lens.tolist() + + # vLLM currently only supports proposal lens equal to zero or the batch + # proposal len. This adds some complexity (splitting the batch into spec + # and non spec sequences) and should be removed in the future. 
It can be + # done by supporting per-sequence proposal lens. + _, spec_indices = split_batch_by_proposal_len( + seq_group_metadata_list, + proposal_lens_list, + select_proposal_len_zero=False) + _, non_spec_indices = split_batch_by_proposal_len( + seq_group_metadata_list, + proposal_lens_list, + select_proposal_len_zero=True) + original_indices = spec_indices + non_spec_indices + + proposal_probs = proposal_scores.probs[spec_indices, :-1] + bonus_token_ids = proposal_scores.token_ids[spec_indices, -1:] + non_spec_token_ids = proposal_scores.token_ids[non_spec_indices] + + accepted_token_ids = self.rejection_sampler( + proposal_probs, + bonus_token_ids, + proposals.proposal_probs, + proposals.proposal_token_ids, + ) + + # Append output tokens from non-speculative sequences to + # the accepted token ids tensor. + non_spec_token_ids = non_spec_token_ids.expand(-1, max_proposal_len + + 1).clone() + non_spec_token_ids[:, 1:] = -1 + accepted_token_ids = torch.cat( + [accepted_token_ids, non_spec_token_ids]) + + # Rearrange so that results are in the order of the original seq group + # metadata. + accepted_token_ids[original_indices] = accepted_token_ids.clone() + + return accepted_token_ids + + def _create_output_sampler_list( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + accepted_token_ids: torch.Tensor, # shape: [batch_size, k+1] + k: int, + ) -> List[SamplerOutput]: + """Given the accepted token ids, create a list of SamplerOutput. + + The output is padded with -1 tokens such that each sequence has + the same number of outputs. + """ + seq_ids = get_all_seq_ids(seq_group_metadata_list) + + # shape: [k+1, batch_size] + accepted_token_ids_by_step = accepted_token_ids.transpose(0, + 1).tolist() + sampler_output_list = [] + for token_ids_by_step in accepted_token_ids_by_step: + if all(token_id == -1 for token_id in token_ids_by_step): + break + + step_output_token_ids = [] + for token_id, seq_id in zip(token_ids_by_step, seq_ids): + step_output_token_ids.append( + SequenceGroupOutput( + samples=[ + SequenceOutput( + parent_seq_id=seq_id, + output_token=token_id, + # TODO Add verifier logprobs. + logprobs={token_id: 0.0}, + ) + ], + prompt_logprobs=None, + )) + sampler_output_list.append( + SamplerOutput(outputs=step_output_token_ids)) + + maybe_rejsample_metrics = ( + self._metrics.maybe_collect_rejsample_metrics(k)) + if maybe_rejsample_metrics is not None: + sampler_output_list[ + 0].spec_decode_worker_metrics = maybe_rejsample_metrics + + return sampler_output_list + + @cached_property + def _vocab_size(self) -> int: + """Get the vocab size of the model and make sure it's consistent between + draft and target workers. + """ + vocab_sizes = [ + worker.vocab_size + for worker in [self.proposer_worker, self.scorer_worker] + ] + assert all(vocab_sizes[0] == vocab_size for vocab_size in vocab_sizes) + return vocab_sizes[0] + + @property + def rank(self): + return self.scorer_worker.rank + + @property + def device(self): + return self.scorer_worker.device + + +def split_num_cache_blocks_evenly(scorer_cache_block_size_bytes: int, + proposer_cache_block_size_bytes: int, + total_num_gpu_blocks: int) -> int: + """Given total_num_gpu_blocks, the number of GPU blocks that could be + allocate to the target model, this function calculates how many blocks + should be given to the draft and target model. + + Note that usually the block size, in bytes, of each model is different, + as it's a function of number of KV/layer, number of heads, and hidden + dimension size. 
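The index handling in `_verify_tokens` above can be hard to follow: results are computed with the speculative sequences first and the non-speculative sequences appended, then scattered back into the original batch order with a single index assignment. A toy, self-contained illustration with hypothetical row values:

```python
# Toy illustration of the reordering trick used in _verify_tokens.
import torch

spec_indices = [0, 2]
non_spec_indices = [1, 3]
original_indices = spec_indices + non_spec_indices   # [0, 2, 1, 3]

# Rows are currently ordered as [spec results, then non-spec results].
out = torch.tensor([[10, 11],    # result belonging to original row 0
                    [20, 21],    # result belonging to original row 2
                    [30, -1],    # result belonging to original row 1
                    [40, -1]])   # result belonging to original row 3

out[original_indices] = out.clone()   # scatter rows back to original order
print(out.tolist())  # [[10, 11], [30, -1], [20, 21], [40, -1]]
```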
+ + Since the target and draft models allocate the same number of blocks, we + simply calculate the number of blocks where if allocated by both models, + the total memory usage from KV cache is no larger than the number of + blocks allocatable by the target model alone. + """ + new_num_gpu_blocks = int( + total_num_gpu_blocks * scorer_cache_block_size_bytes / + (proposer_cache_block_size_bytes + scorer_cache_block_size_bytes)) + + return new_num_gpu_blocks diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py new file mode 100644 index 000000000000..2c5f95455190 --- /dev/null +++ b/vllm/spec_decode/util.py @@ -0,0 +1,99 @@ +import torch +from typing import List, Tuple +from vllm.sequence import SequenceGroupMetadata, SamplerOutput +from contextlib import contextmanager +from itertools import chain + +SeqId = int + + +def get_all_seq_ids( + seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[SeqId]: + """Given a list of SequenceGroupMetadata, create a list of all + sequence ids. + """ + return list( + chain.from_iterable([ + seq_group_metadata.seq_data.keys() + for seq_group_metadata in seq_group_metadata_list + ])) + + +def split_batch_by_proposal_len( + seq_group_metadata_list: List[SequenceGroupMetadata], + proposal_lens: List[int], select_proposal_len_zero: bool +) -> Tuple[List[SequenceGroupMetadata], List[int]]: + """Utility function that splits a batch based on whether the proposal len is + zero or not. We should remove this once vLLM supports per-sequence proposal + lens in a batch. + """ + + if select_proposal_len_zero: + predicate = lambda proposal_len: proposal_len == 0 + else: + predicate = lambda proposal_len: proposal_len != 0 + + indices = [ + i for i, (_, proposal_len + ) in enumerate(zip(seq_group_metadata_list, proposal_lens)) + if predicate(proposal_len) + ] + seq_groups = [ + seq_group for seq_group, proposal_len in zip( + seq_group_metadata_list, proposal_lens) if predicate(proposal_len) + ] + + return seq_groups, indices + + +def sampler_output_to_torch( + sampler_output_list: List[SamplerOutput], +) -> Tuple[torch.Tensor, torch.Tensor]: + """Utility function which converts a list of SamplerOutput to tensors. + + Returns: + sampled_token_ids: torch.Tensor + shape: [batch_size, len(sampler_output_list)] + + sampled_token_probs: torch.Tensor + shape: [batch_size, len(sampler_output_list), vocab_size] + """ + + # shape: [batch_size, num_sampler_output, vocab_size] + sampled_token_probs = torch.stack( + [ + sampler_output.sampled_token_probs + for sampler_output in sampler_output_list + ], + dim=0, + ).transpose(0, 1) + + # shape: [batch_size, num_sampler_output] + sampled_token_ids = torch.stack( + [ + sampler_output.sampled_token_ids.flatten() + for sampler_output in sampler_output_list + ], + dim=0, + ).transpose(0, 1) + + return sampled_token_ids, sampled_token_probs + + +@contextmanager +def nvtx_range(msg, *args, **kwargs): + """ + Context manager / decorator that pushes an NVTX range at the beginning + of its scope, and pops it at the end. If extra arguments are given, + they are passed as arguments to msg.format(). + + If running with cuda graphs, you must enable nsys cuda graph profiling. 
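A quick numeric check of `split_num_cache_blocks_evenly` above, using hypothetical block sizes: because both models receive the same number of blocks, the combined KV memory stays within what the scorer could have used alone.

```python
# Hypothetical sizes; mirrors the formula in split_num_cache_blocks_evenly.
def split_num_cache_blocks_evenly(scorer_block_bytes: int,
                                  proposer_block_bytes: int,
                                  total_num_gpu_blocks: int) -> int:
    return int(total_num_gpu_blocks * scorer_block_bytes /
               (proposer_block_bytes + scorer_block_bytes))

scorer_block_bytes = 2 * 1024 * 1024    # 2 MiB per scorer block (assumed)
proposer_block_bytes = 512 * 1024       # 0.5 MiB per proposer block (assumed)
total_num_gpu_blocks = 1000             # blocks the scorer could use alone

new_blocks = split_num_cache_blocks_evenly(scorer_block_bytes,
                                           proposer_block_bytes,
                                           total_num_gpu_blocks)
print(new_blocks)  # 800
# Both caches get 800 blocks; combined memory fits the scorer-only budget.
assert new_blocks * (scorer_block_bytes + proposer_block_bytes) \
    <= total_num_gpu_blocks * scorer_block_bytes
```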
+ + Arguments: + msg (string): message to associate with the range + """ + torch.cuda.nvtx.range_push(msg.format(*args, **kwargs)) + try: + yield + finally: + torch.cuda.nvtx.range_pop() diff --git a/vllm/transformers_utils/configs/mpt.py b/vllm/transformers_utils/configs/mpt.py index 5ea0d9122ef1..2c0e45623aa2 100644 --- a/vllm/transformers_utils/configs/mpt.py +++ b/vllm/transformers_utils/configs/mpt.py @@ -62,62 +62,6 @@ def __init__(self, fc_type: str = 'torch', verbose: Optional[int] = None, **kwargs: Any): - """The MPT configuration class. - Args: - d_model (int): The size of the embedding dimension of the model. - n_heads (int): The number of attention heads. - n_layers (int): The number of layers in the model. - expansion_ratio (int): The ratio of the up/down scale in the ffn. - max_seq_len (int): The maximum sequence length of the model. - vocab_size (int): The size of the vocabulary. - resid_pdrop (float): The dropout probability applied to the attention output before combining with residual. - emb_pdrop (float): The dropout probability for the embedding layer. - learned_pos_emb (bool): Whether to use learned positional embeddings - attn_config (Dict): A dictionary used to configure the model's attention module: - attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention, grouped_query_attention - attn_pdrop (float): The dropout probability for the attention layers. - attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'. - qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer. - clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to - this value. - softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None, - use the default scale of ``1/sqrt(d_keys)``. - prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an - extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix - can attend to one another bi-directionally. Tokens outside the prefix use causal attention. - attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id. - When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates - which sub-sequence each token belongs to. - Defaults to ``False`` meaning any provided `sequence_id` will be ignored. - alibi (bool): Whether to use the alibi bias instead of position embeddings. - alibi_bias_max (int): The maximum value of the alibi bias. - kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads. - ffn_config (Dict): A dictionary used to configure the model's ffn module: - ffn_type (str): type of ffn to use. Options: mptmlp, te_ln_mlp - init_device (str): The device to use for parameter initialization. - logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value. - no_bias (bool): Whether to use bias in all layers. - verbose (int): The verbosity level. 0 is silent. - embedding_fraction (float): The fraction to scale the gradients of the embedding layer by. 
- norm_type (str): choose type of norm to use - use_cache (bool): Whether or not the model should return the last key/values attentions - init_config (Dict): A dictionary used to configure the model initialization: - init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_', - 'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or - 'xavier_normal_'. These mimic the parameter initialization methods in PyTorch. - init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True. - emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer. - emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution - used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``. - init_std (float): The standard deviation of the normal distribution used to initialize the model, - if using the baseline_ parameter initialization scheme. - init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes. - fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes. - init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes. - --- - See llmfoundry.models.utils.param_init_fns.py for info on other param init config options - fc_type (str): choose fc layer implementation. Options: torch and te. te layers support fp8 when using H100 GPUs. - """ self.d_model = d_model self.n_heads = n_heads self.n_layers = n_layers @@ -139,8 +83,8 @@ def __init__(self, self.fc_type = fc_type if verbose is not None: warnings.warn(DeprecationWarning( - 'verbose argument for MPTConfig is now ignored and will be removed. Use python_log_level instead.' - ), + 'verbose argument for MPTConfig is now ignored and ' + 'will be removed. Use python_log_level instead.'), stacklevel=2) if 'name' in kwargs: del kwargs['name'] @@ -149,7 +93,8 @@ def __init__(self, if self.attn_config.get('alibi', False): self.learned_pos_emb = False warnings.warn( - f'alibi is turned on, setting `learned_pos_emb` to {self.learned_pos_emb}`', + f'alibi is turned on, setting `learned_pos_emb` ' + f'to {self.learned_pos_emb}`', stacklevel=2) super().__init__(**kwargs) self._validate_config() @@ -176,8 +121,8 @@ def _validate_config(self) -> None: [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop] )): raise ValueError( - "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1" # pylint: disable=line-too-long - ) + "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are " + "probabilities and must be between 0 and 1") if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']: raise ValueError( f"Unknown attn_impl={self.attn_config['attn_impl']}") @@ -193,17 +138,17 @@ def _validate_config(self) -> None: if self.attn_config['attn_uses_sequence_id'] and self.attn_config[ 'attn_impl'] not in ['torch', 'triton']: raise NotImplementedError( - 'attn_uses_sequence_id only implemented with torch and triton attention.' # pylint: disable=line-too-long - ) + 'attn_uses_sequence_id only implemented with torch ' + 'and triton attention.') if self.embedding_fraction > 1 or self.embedding_fraction <= 0: raise ValueError( - 'model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!' 
# pylint: disable=line-too-long - ) + 'model.embedding_fraction must be between 0 (exclusive) ' + 'and 1 (inclusive)!') if isinstance(self.logit_scale, str) and self.logit_scale != 'inv_sqrt_d_model': raise ValueError( - f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'." # pylint: disable=line-too-long - ) + f"self.logit_scale={self.logit_scale!r} is not recognized as " + "an option; use numeric value or 'inv_sqrt_d_model'.") if self.init_config.get('name', None) is None: raise ValueError( f"self.init_config={self.init_config!r} 'name' needs to be set." @@ -219,11 +164,11 @@ def _validate_config(self) -> None: del te except Exception as exc: raise ImportError( - # pylint: disable=line-too-long - 'TransformerEngine import fail. `fc_type: te` requires TransformerEngine be installed. ' - + - 'The required version of transformer_engine also requires FlashAttention v1.0.6 is installed:\n' - + 'pip install flash-attn==1.0.6 --no-build-isolation \n' + + 'TransformerEngine import fail. `fc_type: te` requires ' + 'TransformerEngine be installed. ' + 'The required version of transformer_engine also requires ' + 'FlashAttention v1.0.6 is installed:\n' + 'pip install flash-attn==1.0.6 --no-build-isolation \n' 'pip install git+https://github.com/NVIDIA/TransformerEngine.git@144e4888b2cdd60bd52e706d5b7a79cb9c1a7156' ) from exc if self.ffn_config['ffn_type'] == 'mptmlp': diff --git a/vllm/transformers_utils/configs/starcoder2.py b/vllm/transformers_utils/configs/starcoder2.py index 4c3b6b8def07..2879cd044527 100644 --- a/vllm/transformers_utils/configs/starcoder2.py +++ b/vllm/transformers_utils/configs/starcoder2.py @@ -2,78 +2,6 @@ class Starcoder2Config(PretrainedConfig): - r""" - This is the configuration class to store the configuration of a [`Starcoder2Model`]. It is used to instantiate a - Starcoder2 model according to the specified arguments, defining the model architecture. Instantiating a configuration - with the defaults will yield a similar configuration to that of the [bigcode/starcoder2-7b_16k](https://huggingface.co/bigcode/starcoder2-7b_16k) model. - - - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 49152): - Vocabulary size of the Starcoder2 model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`Starcoder2Model`] - hidden_size (`int`, *optional*, defaults to 3072): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 12288): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 30): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 24): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*, defaults to 2): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. 
For more details checkout [this - paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. - hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 4096): - The maximum sequence length that this model might ever be used with. Starcoder2's sliding window attention - allows sequence of up to 4096*32 tokens. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - norm_epsilon (`float`, *optional*, defaults to 1e-05): - Epsilon value for the layer norm - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - bos_token_id (`int`, *optional*, defaults to 50256): - The id of the "beginning-of-sequence" token. - eos_token_id (`int`, *optional*, defaults to 50256): - The id of the "end-of-sequence" token. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - sliding_window (`int`, *optional*): - Sliding window attention window size. If not specified, will default to `None` (no sliding window). - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - residual_dropout (`float`, *optional*, defaults to 0.0): - Residual connection dropout value. - embedding_dropout (`float`, *optional*, defaults to 0.0): - Embedding dropout. - use_bias (`bool`, *optional*, defaults to `True`): - Whether to use bias term on linear layers of the model. - - - ```python - >>> from transformers import Starcoder2Model, Starcoder2Config - - >>> # Initializing a Starcoder2 7B style configuration - >>> configuration = Starcoder2Config() - - >>> # Initializing a model from the Starcoder2 7B style configuration - >>> model = Starcoder2Model(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - model_type = "starcoder2" keys_to_ignore_at_inference = ["past_key_values"] diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 6edc225cdfc8..f7a1a19a89bc 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -5,12 +5,48 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.utils import make_async, LRUCache +from vllm.utils import make_async from vllm.transformers_utils.tokenizers import * logger = init_logger(__name__) +def get_cached_tokenizer( + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast] +) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: + """Get tokenizer with cached properties. + + This will patch the tokenizer object in place. + + By default, transformers will recompute multiple tokenizer properties + each time they are called, leading to a significant slowdown. 
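The caching trick used by `get_cached_tokenizer` is general: compute the expensive attributes once, then swap the instance's `__class__` for a subclass whose properties return the precomputed values, so every existing reference to the object benefits. A minimal, transformers-free sketch of the same pattern:

```python
# Generic sketch (not transformers-specific) of caching by __class__ patching.
class Slow:
    @property
    def expensive(self):
        return sum(range(1_000_000))   # recomputed on every access

def cache_expensive(obj: Slow) -> Slow:
    cached_value = obj.expensive       # compute once up front

    class Cached(obj.__class__):
        @property
        def expensive(self):
            return cached_value        # served from the cached value

    Cached.__name__ = f"Cached{obj.__class__.__name__}"
    obj.__class__ = Cached             # patch the instance in place
    return obj

s = cache_expensive(Slow())
assert s.expensive == s.expensive      # no recomputation on repeated access
```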
This + function caches these properties for faster access.""" + + tokenizer_all_special_ids = set(tokenizer.all_special_ids) + tokenizer_all_special_tokens_extended = ( + tokenizer.all_special_tokens_extended) + tokenizer_all_special_tokens = set(tokenizer.all_special_tokens) + + class CachedTokenizer(tokenizer.__class__): + + @property + def all_special_ids(self): + return tokenizer_all_special_ids + + @property + def all_special_tokens(self): + return tokenizer_all_special_tokens + + @property + def all_special_tokens_extended(self): + return tokenizer_all_special_tokens_extended + + CachedTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}" + + tokenizer.__class__ = CachedTokenizer + return tokenizer + + def get_tokenizer( tokenizer_name: str, *args, @@ -64,7 +100,7 @@ def get_tokenizer( logger.warning( "Using a slow tokenizer. This might cause a significant " "slowdown. Consider using a fast tokenizer instead.") - return tokenizer + return get_cached_tokenizer(tokenizer) def get_lora_tokenizer(lora_request: LoRARequest, *args, @@ -88,63 +124,6 @@ def get_lora_tokenizer(lora_request: LoRARequest, *args, get_lora_tokenizer_async = make_async(get_lora_tokenizer) -class TokenizerGroup: - """A group of tokenizers that can be used for LoRA adapters.""" - - def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int, - max_input_length: Optional[int], **tokenizer_config): - self.tokenizer_id = tokenizer_id - self.tokenizer_config = tokenizer_config - self.enable_lora = enable_lora - self.max_input_length = max_input_length - self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config) - if enable_lora: - self.lora_tokenizers = LRUCache(capacity=max_num_seqs) - else: - self.lora_tokenizers = None - - def encode(self, - prompt: str, - request_id: Optional[str] = None, - lora_request: Optional[LoRARequest] = None) -> List[int]: - tokenizer = self.get_lora_tokenizer(lora_request) - return tokenizer.encode(prompt) - - async def encode_async( - self, - prompt: str, - request_id: Optional[str] = None, - lora_request: Optional[LoRARequest] = None) -> List[int]: - tokenizer = await self.get_lora_tokenizer_async(lora_request) - return tokenizer.encode(prompt) - - def get_lora_tokenizer( - self, - lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer": - if not lora_request or not self.enable_lora: - return self.tokenizer - if lora_request.lora_int_id not in self.lora_tokenizers: - tokenizer = (get_lora_tokenizer( - lora_request, **self.tokenizer_config) or self.tokenizer) - self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer) - return tokenizer - else: - return self.lora_tokenizers.get(lora_request.lora_int_id) - - async def get_lora_tokenizer_async( - self, - lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer": - if not lora_request or not self.enable_lora: - return self.tokenizer - if lora_request.lora_int_id not in self.lora_tokenizers: - tokenizer = (await get_lora_tokenizer_async( - lora_request, **self.tokenizer_config) or self.tokenizer) - self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer) - return tokenizer - else: - return self.lora_tokenizers.get(lora_request.lora_int_id) - - def _convert_tokens_to_string_with_added_encoders( tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], output_tokens: List[str], diff --git a/vllm/transformers_utils/tokenizer_group/__init__.py b/vllm/transformers_utils/tokenizer_group/__init__.py new file mode 100644 index 000000000000..adc8d9b90ddb --- /dev/null +++ 
b/vllm/transformers_utils/tokenizer_group/__init__.py @@ -0,0 +1,32 @@ +from typing import Optional +from vllm.config import TokenizerPoolConfig +from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import ( + BaseTokenizerGroup) +from vllm.transformers_utils.tokenizer_group.tokenizer_group import ( + TokenizerGroup) +from vllm.engine.ray_utils import ray + +if ray: + from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import ( + RayTokenizerGroupPool) +else: + RayTokenizerGroupPool = None + + +def get_tokenizer_group(tokenizer_pool_config: Optional[TokenizerPoolConfig], + **init_kwargs) -> BaseTokenizerGroup: + if tokenizer_pool_config is None: + return TokenizerGroup(**init_kwargs) + if tokenizer_pool_config.pool_type == "ray": + if RayTokenizerGroupPool is None: + raise ImportError( + "RayTokenizerGroupPool is not available. Please install " + "the ray package to use the Ray tokenizer group pool.") + return RayTokenizerGroupPool.from_config(tokenizer_pool_config, + **init_kwargs) + else: + raise ValueError( + f"Unknown pool type: {tokenizer_pool_config.pool_type}") + + +__all__ = ["get_tokenizer_group", "BaseTokenizerGroup"] diff --git a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py new file mode 100644 index 000000000000..99518a606fab --- /dev/null +++ b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py @@ -0,0 +1,48 @@ +from abc import ABC, abstractmethod +from typing import List, Optional + +from transformers import PreTrainedTokenizer + +from vllm.lora.request import LoRARequest + + +class BaseTokenizerGroup(ABC): + """A group of tokenizers that can be used for LoRA adapters.""" + + @abstractmethod + def ping(self) -> bool: + """Check if the tokenizer group is alive.""" + pass + + @abstractmethod + def get_max_input_len(self, + lora_request: Optional[LoRARequest] = None + ) -> Optional[int]: + """Get the maximum input length for the LoRA request.""" + pass + + @abstractmethod + def encode(self, prompt: str, request_id: Optional[str], + lora_request: Optional[LoRARequest]) -> List[int]: + """Encode a prompt using the tokenizer group.""" + pass + + @abstractmethod + async def encode_async(self, prompt: str, request_id: Optional[str], + lora_request: Optional[LoRARequest]) -> List[int]: + """Encode a prompt using the tokenizer group.""" + pass + + @abstractmethod + def get_lora_tokenizer( + self, + lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer": + """Get a tokenizer for a LoRA request.""" + pass + + @abstractmethod + async def get_lora_tokenizer_async( + self, + lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer": + """Get a tokenizer for a LoRA request.""" + pass diff --git a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py new file mode 100644 index 000000000000..e048ec05bece --- /dev/null +++ b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py @@ -0,0 +1,166 @@ +import asyncio +import os +from typing import List, Optional + +from transformers import PreTrainedTokenizer + +from vllm.config import TokenizerPoolConfig +from vllm.lora.request import LoRARequest +from vllm.engine.ray_utils import ray +from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import ( + BaseTokenizerGroup) +from vllm.transformers_utils.tokenizer_group.tokenizer_group import ( + TokenizerGroup) +from ray.util.scheduling_strategies import 
NodeAffinitySchedulingStrategy + + +class RayTokenizerGroupPool(BaseTokenizerGroup): + """A Ray-based pool of TokenizerGroups for async tokenization.""" + + # Class to use for workers making up the pool. + _worker_cls = TokenizerGroup + + @classmethod + def from_config(cls, tokenizer_pool_config: TokenizerPoolConfig, + **init_kwargs) -> "RayTokenizerGroupPool": + ray_actor_options = (tokenizer_pool_config.extra_config or { + "num_cpus": 0 + }) + ray_actor_options.setdefault( + "scheduling_strategy", + NodeAffinitySchedulingStrategy( + node_id=ray.get_runtime_context().get_node_id(), soft=True)) + + # Carry over the env vars to the actors. + # This is necessary for API keys and such. + ray_actor_options.setdefault("runtime_env", {}) + _carry_over_env_vars_to_runtime_env(ray_actor_options["runtime_env"]) + + init_kwargs["num_actors"] = tokenizer_pool_config.pool_size + init_kwargs["ray_actor_options"] = ray_actor_options + + return cls(**init_kwargs) + + def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int, + max_input_length: Optional[int], num_actors: int, + ray_actor_options: dict, **tokenizer_config): + # Store a local copy of the TokenizerGroup for quick access + # to underlying HF tokenizers. + self._local_tokenizer_group = self._worker_cls( + tokenizer_id=tokenizer_id, + enable_lora=enable_lora, + max_num_seqs=max_num_seqs, + max_input_length=max_input_length, + ) + + ray_tokenizer_group_cls = ray.remote( + self._worker_cls).options(**ray_actor_options) + self.tokenizer_actors = [ + ray_tokenizer_group_cls.remote(tokenizer_id, enable_lora, + max_num_seqs, max_input_length, + **tokenizer_config) + for _ in range(num_actors) + ] + self._idle_actors: Optional[asyncio.Queue] = None + + @property + def pool_size(self) -> int: + return len(self.tokenizer_actors) + + def ping(self): + return ray.get( + [actor.ping.remote() for actor in self.tokenizer_actors]) + + def _ensure_queue_initialized(self): + if self._idle_actors is None: + self._idle_actors = asyncio.Queue() + for actor in self.tokenizer_actors: + self._idle_actors.put_nowait(actor) + + def encode(self, + prompt: str, + request_id: Optional[str] = None, + lora_request: Optional[LoRARequest] = None) -> List[int]: + """Encode a prompt using the tokenizer group. + + We pick an idle actor and use it to encode the prompt. + The actor is then put back in the queue for future use. + This is blocking. + """ + self._ensure_queue_initialized() + + if self._idle_actors.empty(): + raise RuntimeError("No idle actors available.") + actor = self._idle_actors.get_nowait() + try: + ret = ray.get( + actor.encode.remote(request_id=request_id, + prompt=prompt, + lora_request=lora_request)) + finally: + # Put the actor back in the queue. + # This is done in a finally block to ensure that the actor is + # always put back in the queue, even if an exception/cancellation + # is raised. + self._idle_actors.put_nowait(actor) + return ret + + async def encode_async( + self, + prompt: str, + request_id: Optional[str] = None, + lora_request: Optional[LoRARequest] = None) -> List[int]: + """Encode a prompt using the tokenizer group. + + We pick an idle actor and use it to encode the prompt. + If there are no idle actors, we wait until one becomes + available. + The actor is then put back in the queue for future use. + This is non-blocking. 
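The `RayTokenizerGroupPool` above centres on one pattern: borrow an actor from an idle queue, issue the call, and always return the actor in a `finally` block so the pool cannot leak capacity on errors or cancellation. A Ray-free sketch of that pattern, with purely illustrative names rather than vLLM's API:

```python
# Minimal idle-worker-queue pattern: check out, call, always check back in.
import asyncio

class WorkerPool:
    def __init__(self, workers):
        self._idle = asyncio.Queue()
        for worker in workers:
            self._idle.put_nowait(worker)

    async def run(self, coro_fn, *args):
        worker = await self._idle.get()      # wait for an idle worker
        try:
            return await coro_fn(worker, *args)
        finally:
            # Return the worker even on error/cancellation so the pool
            # never loses capacity.
            self._idle.put_nowait(worker)

async def main():
    pool = WorkerPool(workers=["w0", "w1"])

    async def encode(worker, prompt):
        await asyncio.sleep(0)               # stand-in for a remote call
        return f"{worker}:{len(prompt)}"

    print(await pool.run(encode, "hello"))   # e.g. "w0:5"

asyncio.run(main())
```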
+ """ + self._ensure_queue_initialized() + + actor = await self._idle_actors.get() + try: + ret = await actor.encode.remote(request_id=request_id, + prompt=prompt, + lora_request=lora_request) + finally: + # Put the actor back in the queue. + # This is done in a finally block to ensure that the actor is + # always put back in the queue, even if an exception/cancellation + # is raised. + self._idle_actors.put_nowait(actor) + return ret + + def get_max_input_len(self, + lora_request: Optional[LoRARequest] = None + ) -> Optional[int]: + """Get the maximum input length for the LoRA request.""" + return self._local_tokenizer_group.get_max_input_len(lora_request) + + def get_lora_tokenizer( + self, + lora_request: Optional[LoRARequest] = None + ) -> "PreTrainedTokenizer": + return self._local_tokenizer_group.get_lora_tokenizer(lora_request) + + async def get_lora_tokenizer_async( + self, + lora_request: Optional[LoRARequest] = None + ) -> "PreTrainedTokenizer": + return await self._local_tokenizer_group.get_lora_tokenizer_async( + lora_request) + + +def _carry_over_env_vars_to_runtime_env(runtime_env: dict) -> None: + """Copy over all current process environment variables to the runtime_env. + + The variables in runtime_env will take precedence over the current process + environment variables. + + runtime_env will be modified in place.""" + env_vars = os.environ.copy() + runtime_env.setdefault("env_vars", {}) + env_vars.update(runtime_env["env_vars"]) + runtime_env["env_vars"] = env_vars diff --git a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py new file mode 100644 index 000000000000..3af1334cb5ed --- /dev/null +++ b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py @@ -0,0 +1,80 @@ +from typing import List, Optional + +from transformers import PreTrainedTokenizer + +from vllm.lora.request import LoRARequest +from vllm.transformers_utils.tokenizer import (get_lora_tokenizer, + get_lora_tokenizer_async) +from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import ( + BaseTokenizerGroup) +from vllm.utils import LRUCache +from vllm.transformers_utils.tokenizer import get_tokenizer + + +class TokenizerGroup(BaseTokenizerGroup): + """A group of tokenizers that can be used for LoRA adapters.""" + + def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int, + max_input_length: Optional[int], **tokenizer_config): + self.tokenizer_id = tokenizer_id + self.tokenizer_config = tokenizer_config + self.enable_lora = enable_lora + self.max_input_length = max_input_length + self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config) + if enable_lora: + self.lora_tokenizers = LRUCache(capacity=max_num_seqs) + else: + self.lora_tokenizers = None + + def ping(self) -> bool: + """Check if the tokenizer group is alive.""" + return True + + def get_max_input_len(self, + lora_request: Optional[LoRARequest] = None + ) -> Optional[int]: + """Get the maximum input length for the LoRA request.""" + return self.max_input_length + + def encode(self, + prompt: str, + request_id: Optional[str] = None, + lora_request: Optional[LoRARequest] = None) -> List[int]: + tokenizer = self.get_lora_tokenizer(lora_request) + return tokenizer.encode(prompt) + + async def encode_async( + self, + prompt: str, + request_id: Optional[str] = None, + lora_request: Optional[LoRARequest] = None) -> List[int]: + tokenizer = await self.get_lora_tokenizer_async(lora_request) + return tokenizer.encode(prompt) + + def 
get_lora_tokenizer( + self, + lora_request: Optional[LoRARequest] = None + ) -> "PreTrainedTokenizer": + if not lora_request or not self.enable_lora: + return self.tokenizer + if lora_request.lora_int_id not in self.lora_tokenizers: + tokenizer = (get_lora_tokenizer( + lora_request, **self.tokenizer_config) or self.tokenizer) + self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer) + return tokenizer + else: + return self.lora_tokenizers.get(lora_request.lora_int_id) + + async def get_lora_tokenizer_async( + self, + lora_request: Optional[LoRARequest] = None + ) -> "PreTrainedTokenizer": + if not lora_request or not self.enable_lora: + return self.tokenizer + if lora_request.lora_int_id not in self.lora_tokenizers: + tokenizer = (await get_lora_tokenizer_async( + lora_request, **self.tokenizer_config) or self.tokenizer) + self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer) + return tokenizer + else: + return self.lora_tokenizers.get(lora_request.lora_int_id) diff --git a/vllm/transformers_utils/tokenizers/baichuan.py b/vllm/transformers_utils/tokenizers/baichuan.py index 1dd241e4a5c4..02045bdcb2cc 100644 --- a/vllm/transformers_utils/tokenizers/baichuan.py +++ b/vllm/transformers_utils/tokenizers/baichuan.py @@ -1,4 +1,3 @@ -# yapf: disable # Adapted from # https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/8f6e343d545c503b91429582231d1d354dac2740/tokenization_baichuan.py # This includes a fix suggested in @@ -13,7 +12,6 @@ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer from transformers.utils import logging - logger = logging.get_logger(__name__) VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} @@ -52,27 +50,16 @@ def __init__( clean_up_tokenization_spaces=False, **kwargs, ): - self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - bos_token = ( - AddedToken(bos_token, lstrip=False, rstrip=False) - if isinstance(bos_token, str) - else bos_token - ) - eos_token = ( - AddedToken(eos_token, lstrip=False, rstrip=False) - if isinstance(eos_token, str) - else eos_token - ) - unk_token = ( - AddedToken(unk_token, lstrip=False, rstrip=False) - if isinstance(unk_token, str) - else unk_token - ) - pad_token = ( - AddedToken(pad_token, lstrip=False, rstrip=False) - if isinstance(pad_token, str) - else pad_token - ) + self.sp_model_kwargs = ({} if sp_model_kwargs is None else + sp_model_kwargs) + bos_token = (AddedToken(bos_token, lstrip=False, rstrip=False) + if isinstance(bos_token, str) else bos_token) + eos_token = (AddedToken(eos_token, lstrip=False, rstrip=False) + if isinstance(eos_token, str) else eos_token) + unk_token = (AddedToken(unk_token, lstrip=False, rstrip=False) + if isinstance(unk_token, str) else unk_token) + pad_token = (AddedToken(pad_token, lstrip=False, rstrip=False) + if isinstance(pad_token, str) else pad_token) self.vocab_file = vocab_file self.add_bos_token = add_bos_token self.add_eos_token = add_eos_token @@ -107,7 +94,10 @@ def vocab_size(self): def get_vocab(self): """Returns vocab as a dict""" - vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab = { + self.convert_ids_to_tokens(i): i + for i in range(self.vocab_size) + } vocab.update(self.added_tokens_encoder) return vocab @@ -130,7 +120,8 @@ def convert_tokens_to_string(self, tokens): out_string = "" prev_is_special = False for i, token in enumerate(tokens): - # make sure that special tokens are not decoded using sentencepiece model + # make sure that special tokens are not decoded using + # sentencepiece 
model if token in self.all_special_tokens: if not prev_is_special and i != 0: out_string += " " @@ -143,9 +134,9 @@ def convert_tokens_to_string(self, tokens): out_string += self.sp_model.decode(current_sub_tokens) return out_string - def save_vocabulary( - self, save_directory, filename_prefix: Optional[str] = None - ) -> Tuple[str]: + def save_vocabulary(self, + save_directory, + filename_prefix: Optional[str] = None) -> Tuple[str]: """ Save the vocabulary and special tokens file to a directory. @@ -157,24 +148,24 @@ def save_vocabulary( `Tuple(str)`: Paths to the files saved. """ if not os.path.isdir(save_directory): - logger.error(f"Vocabulary path ({save_directory}) should be a directory") + logger.error(f"Vocabulary path ({save_directory}) " + "should be a directory") return out_vocab_file = os.path.join( save_directory, - (filename_prefix + "-" if filename_prefix else "") - + VOCAB_FILES_NAMES["vocab_file"], + (filename_prefix + "-" if filename_prefix else "") + + VOCAB_FILES_NAMES["vocab_file"], ) if os.path.abspath(self.vocab_file) != os.path.abspath( - out_vocab_file - ) and os.path.isfile(self.vocab_file): + out_vocab_file) and os.path.isfile(self.vocab_file): copyfile(self.vocab_file, out_vocab_file) elif not os.path.isfile(self.vocab_file): with open(out_vocab_file, "wb") as fi: content_spiece_model = self.sp_model.serialized_model_proto() fi.write(content_spiece_model) - return (out_vocab_file,) + return (out_vocab_file, ) def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): bos_token_id = [self.bos_token_id] if self.add_bos_token else [] @@ -194,7 +185,8 @@ def get_special_tokens_mask( already_has_special_tokens: bool = False, ) -> List[int]: """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + Retrieve sequence ids from a token list that has no special tokens + added. This method is called when adding special tokens using the tokenizer `prepare_for_model` method. Args: @@ -202,11 +194,14 @@ def get_special_tokens_mask( List of IDs. token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the token list is already formatted with special tokens for the model. + already_has_special_tokens (`bool`, *optional*, defaults to + `False`): + Whether or not the token list is already formatted with + special tokens for the model. Returns: - `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + `List[int]`: A list of integers in the range [0, 1]: + 1 for a special token, 0 for a sequence token. """ if already_has_special_tokens: return super().get_special_tokens_mask( @@ -220,20 +215,16 @@ def get_special_tokens_mask( if token_ids_1 is None: return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id - return ( - bos_token_id - + ([0] * len(token_ids_0)) - + eos_token_id - + bos_token_id - + ([0] * len(token_ids_1)) - + eos_token_id - ) + return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id + + bos_token_id + ([0] * len(token_ids_1)) + eos_token_id) def create_token_type_ids_from_sequences( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: """ - Creates a mask from the two sequences passed to be used in a sequence-pair classification task. 
An ALBERT + Creates a mask from the two sequences passed to be used in a + sequence-pair classification task. An ALBERT sequence pair mask has the following format: ``` @@ -250,7 +241,8 @@ def create_token_type_ids_from_sequences( Optional second list of IDs for sequence pairs. Returns: - `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + `List[int]`: List of [token type IDs](../glossary#token-type-ids) + according to the given sequence(s). """ bos_token_id = [self.bos_token_id] if self.add_bos_token else [] eos_token_id = [self.eos_token_id] if self.add_eos_token else [] diff --git a/vllm/utils.py b/vllm/utils.py index 5b94067cec77..d4a8c962c3bf 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -21,6 +21,7 @@ from typing import Any, Hashable, Optional from vllm.logger import init_logger +import warnings T = TypeVar("T") logger = init_logger(__name__) @@ -133,9 +134,10 @@ def get_max_shared_memory_bytes(gpu: int = 0) -> int: # the Neuron-X backend does not have the `cuda_utils` module. from vllm._C import cuda_utils - max_shared_mem = cuda_utils.get_max_shared_memory_per_block_device_attribute( - gpu) - # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py will fail + max_shared_mem = ( + cuda_utils.get_max_shared_memory_per_block_device_attribute(gpu)) + # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py + # will fail assert max_shared_mem > 0, "max_shared_mem can not be zero" return int(max_shared_mem) @@ -171,16 +173,35 @@ def _async_wrapper(*args, **kwargs) -> asyncio.Future: def get_ip() -> str: + host_ip = os.environ.get("HOST_IP") + if host_ip: + return host_ip + + # IP is not set, try to get it from the network interface + # try ipv4 s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) try: s.connect(("8.8.8.8", 80)) # Doesn't need to be reachable return s.getsockname()[0] - except OSError: - # try ipv6 + except Exception: + pass + + # try ipv6 + try: s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM) - s.connect(("dns.google", 80)) + # Google's public DNS server, see + # https://developers.google.com/speed/public-dns/docs/using#addresses + s.connect(("2001:4860:4860::8888", 80)) # Doesn't need to be reachable return s.getsockname()[0] + except Exception: + pass + + warnings.warn( + "Failed to get the IP address, using 0.0.0.0 by default." + "The value can be set by the environment variable HOST_IP.", + stacklevel=2) + return "0.0.0.0" def get_distributed_init_method(ip: str, port: int) -> str: @@ -209,9 +230,8 @@ def get_nvcc_cuda_version() -> Optional[Version]: if not cuda_home: cuda_home = '/usr/local/cuda' if os.path.isfile(cuda_home + '/bin/nvcc'): - logger.info( - f'CUDA_HOME is not found in the environment. Using {cuda_home} as CUDA_HOME.' - ) + logger.info(f'CUDA_HOME is not found in the environment. ' + f'Using {cuda_home} as CUDA_HOME.') else: logger.warning( f'Not found nvcc in {cuda_home}. 
Skip cuda version check!') diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 9023b0c59b3f..7eac576e3f0f 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -93,16 +93,13 @@ def load_model(self) -> None: scheduler_config=self.scheduler_config) self.model_memory_usage = m.consumed_memory - logger.info( - f"Loading model weights took {self.model_memory_usage / float(2**30):.4f} GB" - ) - - vocab_size = self.model.config.vocab_size + logger.info(f"Loading model weights took " + f"{self.model_memory_usage / float(2**30):.4f} GB") if self.lora_config: - assert hasattr( - self.model, "supported_lora_modules" - ) and self.model.supported_lora_modules, "Model does not support LoRA" + assert hasattr(self.model, "supported_lora_modules" + ) and self.model.supported_lora_modules, ( + "Model does not support LoRA") assert hasattr( self.model, "embedding_modules"), "Model does not have embedding_modules" @@ -111,7 +108,7 @@ def load_model(self) -> None: self.lora_manager = LRUCacheWorkerLoRAManager( self.scheduler_config.max_num_seqs, self.scheduler_config.max_num_batched_tokens + - self.scheduler_config.max_paddings, vocab_size, + self.scheduler_config.max_paddings, self.vocab_size, self.lora_config, self.device, self.model.embedding_modules, self.model.embedding_padding_modules) self.model = self.lora_manager.create_lora_manager(self.model) @@ -607,8 +604,7 @@ def execute_model( @torch.inference_mode() def profile_run(self) -> None: # Enable top-k sampling to reflect the accurate memory usage. - vocab_size = self.model_config.get_vocab_size() - sampling_params = SamplingParams(top_p=0.99, top_k=vocab_size - 1) + sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens max_num_seqs = self.scheduler_config.max_num_seqs @@ -774,6 +770,10 @@ def __del__(self) -> None: self.graph_runners.clear() self.cupy_nccl_backend = None + @property + def vocab_size(self) -> int: + return self.model_config.get_vocab_size() + class CUDAGraphRunner: diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index 3229a21c11a3..340c079600c7 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -79,7 +79,8 @@ def profile_num_available_blocks( cpu_swap_space: int = 0, cache_dtype: str = "float16", ) -> Tuple[int, int]: - """Simply returns max_num_seqs as num_gpu_blocks, 0 as num_cpu_blocks.""" + """Simply returns max_num_seqs as num_gpu_blocks, 0 as + num_cpu_blocks.""" num_gpu_blocks = self.scheduler_config.max_num_seqs num_cpu_blocks = 0 return num_gpu_blocks, num_cpu_blocks @@ -177,7 +178,8 @@ def _init_distributed_environment( "distributed_init_method must be set if torch.distributed " "is not already initialized") else: - distributed_backend = distributed_backend if distributed_backend else "nccl" + distributed_backend = (distributed_backend + if distributed_backend else "nccl") torch.distributed.init_process_group( backend=distributed_backend, world_size=parallel_config.world_size, diff --git a/vllm/worker/spec_decode/multi_step_worker.py b/vllm/worker/spec_decode/multi_step_worker.py deleted file mode 100644 index ab3e28389a04..000000000000 --- a/vllm/worker/spec_decode/multi_step_worker.py +++ /dev/null @@ -1,178 +0,0 @@ -from typing import List, Dict -import copy - -import torch - -from vllm.sequence import SamplerOutput, SequenceGroupMetadata -from vllm.worker.worker import Worker - - -class MultiStepWorker(Worker): - """The 
MultiStepWorker is equivalent to a Worker except that it allows - multiple forward passes in a single call, assuming the scheduler has - allocated enough space to store the additional KV. This reduces overhead - by invoking the scheduler less. - - The MultiStepWorker does not support cache swap operations, or beam search. - Cache swap operations do not require large modifications. On the other hand, - beam search requires memory allocations during sequence forks and thus - requires more thought for MultiStepWorker support. - """ - - @torch.inference_mode() - def execute_model_multi_step( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]], - num_steps: int, - ) -> List[SamplerOutput]: - """Run the model forward pass num_steps times. Returns the list of - sampler output, one per model forward pass. - """ - self._raise_if_unsupported(seq_group_metadata_list, blocks_to_swap_in, - blocks_to_swap_out, blocks_to_copy) - - # Shallow copy input data so modifications (such as appending tokens) - # do not cause side-effects. - copied_seq_group_metadata_list = self._shallow_copy_inputs( - seq_group_metadata_list) - - # Assert enough KV space for num_steps tokens per sequence. - self._assert_enough_kv_space(seq_group_metadata_list, num_steps) - - # Run model num_steps times. - model_outputs = [] - for _ in range(num_steps): - model_output = super().execute_model( - seq_group_metadata_list=copied_seq_group_metadata_list, - blocks_to_swap_in=blocks_to_swap_in, - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=blocks_to_copy, - ) - - self._append_new_tokens(model_output, - copied_seq_group_metadata_list) - model_outputs.append(model_output) - - return model_outputs - - def _append_new_tokens( - self, model_output: SamplerOutput, - seq_group_metadata_list: SequenceGroupMetadata) -> None: - """Given model output from a single run, append the tokens to the - sequences. This is normally done outside of the worker, but it is - required if the worker is to perform multiple forward passes. - """ - for seq_group_metadata, sequence_group_outputs in zip( - seq_group_metadata_list, model_output): - seq_group_metadata.is_prompt = False - - for seq_output in sequence_group_outputs.samples: - # NOTE: Beam search is not supported, so we can assume that - # parent_seq_id == seq_id. - seq = seq_group_metadata.seq_data[seq_output.parent_seq_id] - - token_id = seq_output.output_token - token_logprob = seq_output.logprobs[token_id] - - seq.append_token_id(token_id, token_logprob.logprob) - - def _shallow_copy_inputs( - self, seq_group_metadata_list: List[SequenceGroupMetadata] - ) -> List[SequenceGroupMetadata]: - """Copy input data structures to remove side-effects when input data - structures are shared with other modules. - - The multi-step worker must be able to append tokens to sequences after - a forward pass. This necessitates modification of the data structures - used by the worker. Since these data structures are shared with other - parts of vLLM, like the scheduler, we must take care not to introduce - unexpected side-effects. - - When Ray is used to orchestrate worker processes (such as when the - tensor-parallel degree is >1), this is not a problem because the input - datastructures will be serialized and created anew in the worker - process. - - However, when Ray is not used to orchestrate the worker processes (such - as when the tensor-parallel degree is 1), this is a problem. 
We avoid - the problem by shallow-copying the input datastructures (specifically, - the parts that will change in multiple steps). - """ - - # Shallow-copy the list of SequenceGroupMetadata. This allows us to - # append tokens and change is_prompt without external side-effects. - new_seq_group_metadata_list = [] - - for old_seq_group_metadata in seq_group_metadata_list: - # We must shallow-copy seq_group_metadata as is_prompt could change. - seq_group_metadata = copy.copy(old_seq_group_metadata) - new_seq_group_metadata_list.append(seq_group_metadata) - - # We must shallow-copy seq_data as we will append token ids - new_seq_data = {} - for seq_id, old_seq_data in seq_group_metadata.seq_data.items(): - new_seq_data[seq_id] = copy.copy(old_seq_data) - new_seq_data[ - seq_id].output_token_ids = old_seq_data.output_token_ids[:] - - seq_group_metadata.seq_data = new_seq_data - - return new_seq_group_metadata_list - - def _assert_enough_kv_space( - self, seq_group_metadata_list: List[SequenceGroupMetadata], - num_steps: int) -> None: - """Assert there are enough physical blocks per sequence to store the - current KV plus additional KV from num_steps tokens. - """ - assert self.model_runner.block_size is not None - for seq_group_metadata in seq_group_metadata_list: - # Only one seq_id is guaranteed because there is no beam search. - seq_id = list(seq_group_metadata.seq_data.keys())[0] - seq = seq_group_metadata.seq_data[seq_id] - - # After num_steps, the seq len will be the current seq len - # plus one token per step. - final_seq_len = seq.get_len() + num_steps - - # We will have final_seq_len - 1 KV because vLLM saves KV for a - # token in the iteration after the token was generated. - required_num_kv_slots = final_seq_len - 1 - - # The allocated number of kv slots is the number of allocated blocks - # times the number of slots of block. - number_physical_blocks = len( - seq_group_metadata.block_tables[seq_id]) - allocated_kv_slots = (number_physical_blocks * - self.model_runner.block_size) - - if required_num_kv_slots > allocated_kv_slots: - request_id = seq_group_metadata.request_id - raise ValueError( - "The worker attempted to run " - f"{num_steps} times but found insufficient KV space for " - f"{request_id=} {seq_id=}. ({allocated_kv_slots=} " - f"{required_num_kv_slots=}).") - - def _raise_if_unsupported( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]], - ) -> None: - """MultiStepWorker does not yet implement support for cache swap - operations or beam search. - """ - if any([blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy]): - raise NotImplementedError( - "MultiStepWorker does not support cache operations") - - if any( - len(seq_group_metadata.seq_data.keys()) != 1 - for seq_group_metadata in seq_group_metadata_list): - raise NotImplementedError( - "MultiStepWorker does not support beam search.") diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 157e8c45836b..0dcd4018afa5 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -130,8 +130,8 @@ def profile_num_available_blocks( # GPU did not change their memory usage during the profiling. 
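The KV-space check in `_assert_enough_kv_space` above boils down to simple arithmetic; a worked example with hypothetical numbers:

```python
# Worked example (hypothetical numbers) of the multi-step KV-space check.
block_size = 16            # slots per physical block
num_steps = 4              # speculative steps to run
current_seq_len = 29       # tokens already in the sequence
num_allocated_blocks = 2   # physical blocks allocated to this sequence

final_seq_len = current_seq_len + num_steps             # 33 tokens afterwards
required_num_kv_slots = final_seq_len - 1               # 32: KV lands one step late
allocated_kv_slots = num_allocated_blocks * block_size  # 32

assert required_num_kv_slots <= allocated_kv_slots      # just enough space
```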
peak_memory = self.init_gpu_memory - free_gpu_memory - cache_block_size = CacheEngine.get_cache_block_size( - block_size, cache_dtype, self.model_config, self.parallel_config) + cache_block_size = self.get_cache_block_size_bytes( + block_size, cache_dtype) num_gpu_blocks = int( (total_gpu_memory * gpu_memory_utilization - peak_memory) // cache_block_size) @@ -232,6 +232,22 @@ def remove_lora(self, lora_id: int) -> bool: def list_loras(self) -> Set[int]: return self.model_runner.list_loras() + @property + def max_model_len(self) -> int: + return self.model_config.max_model_len + + @property + def vocab_size(self) -> int: + return self.model_runner.vocab_size + + def get_cache_block_size_bytes(self, block_size: int, + cache_dtype: str) -> int: + """Get the size of the KV cache block size in bytes. + """ + return CacheEngine.get_cache_block_size(block_size, cache_dtype, + self.model_config, + self.parallel_config) + def init_distributed_environment( parallel_config: ParallelConfig,