From 7e47c2d25399d476ef667fc21645cee9b3e1f8be Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Fri, 2 Aug 2024 14:23:06 +0800 Subject: [PATCH 01/22] Use flashinfer kernel to do sampling if available --- tests/samplers/test_sampler.py | 4 +- vllm/model_executor/layers/sampler.py | 106 ++++++++++++++++++----- vllm/model_executor/sampling_metadata.py | 8 +- vllm/utils.py | 16 ++-- 4 files changed, 101 insertions(+), 33 deletions(-) diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index bf062e4a5c09d..4a85075bc231d 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -625,7 +625,9 @@ def mock_sample(probs, *args, **kwargs): return ([[prob.topk(1, dim=-1).indices.tolist(), [0]] for prob in probs], None) - with patch("vllm.model_executor.layers.sampler._sample", mock_sample): + # top-k and top-p is only calculated when flashinfer kernel is not available + with patch("vllm.model_executor.layers.sampler._sample", mock_sample), \ + patch("vllm.model_executor.layers.sampler.HAS_FLASHINFER", False): sampler(logits=fake_logits, sampling_metadata=sampling_metadata) assert sample_probs is not None diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 6632b1c434582..97afdf1bea93c 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -1,8 +1,10 @@ """A layer that samples the next tokens from the model's outputs.""" import itertools +import warnings from math import inf from typing import Dict, List, Optional, Tuple +import numpy as np import torch import torch.nn as nn @@ -18,6 +20,13 @@ from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, PromptLogprobs, SampleLogprobs, SamplerOutput, SequenceOutput) +from vllm.utils import async_numpy_to_tensor + +HAS_FLASHINFER = True +try: + from flashinfer.sampling import top_k_top_p_sampling_from_probs +except ImportError: + HAS_FLASHINFER = False # (num_token_ids, num_parent_ids) per sequence group. SampleResultType = List[Tuple[List[int], List[int]]] @@ -121,7 +130,7 @@ def forward( # Use in-place division to avoid creating a new tensor. logits.div_(sampling_tensors.temperatures.unsqueeze(dim=1)) - if do_top_p_top_k: + if do_top_p_top_k and not HAS_FLASHINFER: logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps, sampling_tensors.top_ks) @@ -475,32 +484,69 @@ def _multinomial( seq_groups: Optional[List[SequenceGroupToSample]] = None, ) -> torch.Tensor: if num_samples > 1: - # This is equivalent to torch.repeat_interleaved (which also - # forces a GPU<->CPU sync). - # This allows us to do sampling with replacement by creating - # num_samples copies of each row in the tensor, and then - # batch sampling the resulting tensor. 
- probs = probs[:, None, :].expand(probs.shape[0], num_samples, - probs.shape[1]).contiguous().view( - -1, probs.shape[1]) - q = torch.empty_like(probs) + probs = probs.repeat_interleave(num_samples, dim=0) if seq_groups is None: + q = torch.empty_like(probs) q.exponential_() else: + q_ = np.empty(probs.shape) sample_idx = 0 for seq_group in seq_groups: seq_ids = seq_group.seq_ids - next_sample_idx = sample_idx + len(seq_ids) * num_samples - q[sample_idx:next_sample_idx].exponential_( - generator=seq_group.generator) - sample_idx = next_sample_idx + stride = len(seq_ids) * num_samples + assert seq_group.generator is not None + q_[sample_idx:sample_idx + stride] = \ + seq_group.generator.exponential(size=(stride, q_.shape[1])) + sample_idx += stride + q = async_numpy_to_tensor(q_, probs.device) return probs.div_(q).argmax(dim=1).view(-1, num_samples) +def _top_k_top_p_multinomial( + probs: torch.Tensor, top_ks: torch.Tensor, top_ps: torch.Tensor, + num_samples: int, seq_groups: Optional[List[SequenceGroupToSample]]): + max_top_k_round = 32 + if num_samples > 1: + probs = probs.repeat_interleave(num_samples, dim=0) + top_ks = top_ks.repeat_interleave(num_samples) + top_ps = top_ps.repeat_interleave(num_samples) + batch_size = probs.shape[0] + if seq_groups is None: + uniform_samples = torch.rand((max_top_k_round, batch_size), + device=probs.device) + else: + uniform_samples_cpu = np.empty((max_top_k_round, batch_size)) + sample_idx = 0 + for seq_group in seq_groups: + seq_ids = seq_group.seq_ids + stride = len(seq_ids) * num_samples + assert seq_group.generator is not None + uniform_samples_cpu[:, sample_idx:sample_idx + stride] = \ + seq_group.generator.random((max_top_k_round, stride)) + sample_idx += stride + uniform_samples = async_numpy_to_tensor(uniform_samples_cpu, + probs.device) + batch_next_token_ids, success = top_k_top_p_sampling_from_probs( + probs, + uniform_samples, + top_ks, + top_ps, + ) + if not success.all(): + warnings.warn("Sampling failed, fallback to greedy sampling.", + stacklevel=2) + probs = probs.masked_fill(torch.isnan(probs), 0.0) + argmax_ids = torch.argmax(probs, dim=-1) + batch_next_token_ids = torch.where(success, batch_next_token_ids, + argmax_ids) + return batch_next_token_ids.view(-1, num_samples) + + def _sample_with_torch( probs: torch.Tensor, logprobs: torch.Tensor, sampling_metadata: SamplingMetadata, + sampling_tensors: SamplingTensors, include_gpu_probs_tensor: bool, modify_greedy_probs: bool, ) -> Tuple[SampleResultType, Optional[torch.Tensor]]: @@ -563,13 +609,23 @@ def _sample_with_torch( sampling_params = seq_group.sampling_params max_best_of_in_batch = max(max_best_of_in_batch, sampling_params.best_of) - seeded_args = {} if sampling_type == SamplingType.RANDOM else { - "seq_groups": seq_groups, - } + if HAS_FLASHINFER: + multinomial_samples[sampling_type] = _top_k_top_p_multinomial( + probs[long_sample_indices], + sampling_tensors.top_ks[long_sample_indices], + sampling_tensors.top_ps[long_sample_indices], + max_best_of_in_batch, + seq_groups + if sampling_type == SamplingType.RANDOM_SEED else None, + ) + else: + seeded_args = {} if sampling_type == SamplingType.RANDOM else { + "seq_groups": seq_groups, + } - multinomial_samples[sampling_type] = _multinomial( - probs[long_sample_indices], max_best_of_in_batch, - **seeded_args) + multinomial_samples[sampling_type] = _multinomial( + probs[long_sample_indices], max_best_of_in_batch, + **seeded_args) if sampled_token_ids_tensor is not None: # Store sampled tokens in output tensor. 
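A note on the `_multinomial` helper rewritten in the hunk above: it draws one token per row by dividing the probabilities by i.i.d. Exponential(1) noise and taking the argmax, which gives the same distribution as `torch.multinomial`. A minimal torch-only sketch of that equivalence, standalone and illustrative rather than part of the patch:

import torch

torch.manual_seed(0)
probs = torch.tensor([0.1, 0.2, 0.7])

def sample_once(p: torch.Tensor) -> int:
    # argmax_i p_i / q_i with q_i ~ Exp(1) equals argmin_i q_i / p_i,
    # and q_i / p_i ~ Exp(rate=p_i), so index i wins with probability
    # proportional to p_i.
    q = torch.empty_like(p).exponential_()
    return int(torch.argmax(p / q))

counts = torch.zeros_like(probs)
for _ in range(20_000):
    counts[sample_once(probs)] += 1
print(counts / counts.sum())  # empirically close to [0.1, 0.2, 0.7]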
@@ -692,9 +748,12 @@ def _sample_with_triton_kernel( def _sample( - probs: torch.Tensor, logprobs: torch.Tensor, - sampling_metadata: SamplingMetadata, sampling_tensors: SamplingTensors, - include_gpu_probs_tensor: bool, modify_greedy_probs: bool + probs: torch.Tensor, + logprobs: torch.Tensor, + sampling_metadata: SamplingMetadata, + sampling_tensors: SamplingTensors, + include_gpu_probs_tensor: bool, + modify_greedy_probs: bool, ) -> Tuple[SampleResultType, Optional[torch.Tensor]]: """ Args: @@ -712,6 +771,7 @@ def _sample( probs, logprobs, sampling_metadata, + sampling_tensors, include_gpu_probs_tensor=include_gpu_probs_tensor, modify_greedy_probs=modify_greedy_probs, ) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 59cfec9ec8934..745bb971298cc 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -3,6 +3,7 @@ from dataclasses import dataclass from typing import Dict, List, Optional, Tuple +import numpy as np import torch from vllm.sampling_params import SamplingParams, SamplingType @@ -38,7 +39,7 @@ class SequenceGroupToSample: # prefill is enabled. query_len: Optional[int] # A random number generator for sampling. - generator: Optional[torch.Generator] + generator: Optional[np.random.Generator] # True if the sequence group is in prefill stage. False if it is in a # decode stage. is_prompt: bool @@ -211,7 +212,7 @@ def _prepare_seq_groups( seq_ids = list(seq_group_metadata.seq_data.keys()) sampling_params = seq_group_metadata.sampling_params is_prompt = seq_group_metadata.is_prompt - generator: Optional[torch.Generator] = None + generator: Optional[np.random.Generator] = None # If the current seq group is in decode stage, it is None. seq_len: Optional[int] = None query_len: Optional[int] = None @@ -221,8 +222,7 @@ def _prepare_seq_groups( if seq_group_metadata.is_prompt: if sampling_params.seed is not None: - generator = torch.Generator(device=device).manual_seed( - sampling_params.seed) + generator = np.random.default_rng(seed=sampling_params.seed) if generators is not None: generators[seq_group_metadata.request_id] = generator diff --git a/vllm/utils.py b/vllm/utils.py index 1448316e66edb..afe81f5884a49 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -647,8 +647,8 @@ def make_tensor_with_pad( pad: T, dtype: torch.dtype, *, + device: Union[str, torch.device], max_len: Optional[int] = None, - device: Optional[Union[str, torch.device]] = None, pin_memory: bool = False, ) -> torch.Tensor: """ @@ -660,11 +660,17 @@ def make_tensor_with_pad( np_dtype = TORCH_DTYPE_TO_NUMPY_DTYPE[dtype] padded_x = make_ndarray_with_pad(x, pad, np_dtype, max_len=max_len) - tensor = torch.from_numpy(padded_x).to(device) - if pin_memory: - tensor = tensor.pin_memory() + return async_numpy_to_tensor(padded_x, device) - return tensor + +def async_numpy_to_tensor(x: npt.NDArray, device: Union[str, torch.device]): + """ + Make a tensor from a numpy array. Use pinned memory if possible. 
+ """ + t = torch.from_numpy(x) + if is_pin_memory_available(): + t = t.pin_memory() + return t.to(device, non_blocking=True) def async_tensor_h2d( From 56beab00dde99c63accd9961e569b42c06099b68 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Mon, 5 Aug 2024 15:03:56 +0800 Subject: [PATCH 02/22] Fix type mismatch --- vllm/model_executor/layers/sampler.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 97afdf1bea93c..0325448ed8ab8 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -515,14 +515,16 @@ def _top_k_top_p_multinomial( uniform_samples = torch.rand((max_top_k_round, batch_size), device=probs.device) else: - uniform_samples_cpu = np.empty((max_top_k_round, batch_size)) + uniform_samples_cpu = np.empty((max_top_k_round, batch_size), + dtype=np.float32) sample_idx = 0 for seq_group in seq_groups: seq_ids = seq_group.seq_ids stride = len(seq_ids) * num_samples assert seq_group.generator is not None uniform_samples_cpu[:, sample_idx:sample_idx + stride] = \ - seq_group.generator.random((max_top_k_round, stride)) + seq_group.generator.random((max_top_k_round, stride), + dtype=np.float32) sample_idx += stride uniform_samples = async_numpy_to_tensor(uniform_samples_cpu, probs.device) @@ -629,8 +631,8 @@ def _sample_with_torch( if sampled_token_ids_tensor is not None: # Store sampled tokens in output tensor. - sampled_token_ids_tensor[ - long_sample_indices] = multinomial_samples[sampling_type] + sampled_token_ids_tensor[long_sample_indices] = \ + multinomial_samples[sampling_type].to(torch.long) elif sampling_type == SamplingType.BEAM: beam_search_logprobs = logprobs[sample_indices] From 5396c9ddc26e2898d0f83ee72b085af70f9b98e6 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Mon, 5 Aug 2024 15:46:43 +0800 Subject: [PATCH 03/22] Some renaming --- vllm/model_executor/layers/sampler.py | 31 ++++++++++++++------------- vllm/utils.py | 2 +- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 0325448ed8ab8..6a4eac7116cc2 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -22,11 +22,11 @@ SequenceOutput) from vllm.utils import async_numpy_to_tensor -HAS_FLASHINFER = True try: - from flashinfer.sampling import top_k_top_p_sampling_from_probs + from flashinfer.sampling import (top_k_top_p_sampling_from_probs as + flashinfer_top_k_top_p_sampling) except ImportError: - HAS_FLASHINFER = False + flashinfer_top_k_top_p_sampling = None # (num_token_ids, num_parent_ids) per sequence group. SampleResultType = List[Tuple[List[int], List[int]]] @@ -130,7 +130,7 @@ def forward( # Use in-place division to avoid creating a new tensor. 
logits.div_(sampling_tensors.temperatures.unsqueeze(dim=1)) - if do_top_p_top_k and not HAS_FLASHINFER: + if do_top_p_top_k and flashinfer_top_k_top_p_sampling is None: logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps, sampling_tensors.top_ks) @@ -502,7 +502,7 @@ def _multinomial( return probs.div_(q).argmax(dim=1).view(-1, num_samples) -def _top_k_top_p_multinomial( +def _top_k_top_p_multinomial_with_flashinfer( probs: torch.Tensor, top_ks: torch.Tensor, top_ps: torch.Tensor, num_samples: int, seq_groups: Optional[List[SequenceGroupToSample]]): max_top_k_round = 32 @@ -528,7 +528,7 @@ def _top_k_top_p_multinomial( sample_idx += stride uniform_samples = async_numpy_to_tensor(uniform_samples_cpu, probs.device) - batch_next_token_ids, success = top_k_top_p_sampling_from_probs( + batch_next_token_ids, success = flashinfer_top_k_top_p_sampling( probs, uniform_samples, top_ks, @@ -611,15 +611,16 @@ def _sample_with_torch( sampling_params = seq_group.sampling_params max_best_of_in_batch = max(max_best_of_in_batch, sampling_params.best_of) - if HAS_FLASHINFER: - multinomial_samples[sampling_type] = _top_k_top_p_multinomial( - probs[long_sample_indices], - sampling_tensors.top_ks[long_sample_indices], - sampling_tensors.top_ps[long_sample_indices], - max_best_of_in_batch, - seq_groups - if sampling_type == SamplingType.RANDOM_SEED else None, - ) + if flashinfer_top_k_top_p_sampling is not None: + multinomial_samples[ + sampling_type] = _top_k_top_p_multinomial_with_flashinfer( + probs[long_sample_indices], + sampling_tensors.top_ks[long_sample_indices], + sampling_tensors.top_ps[long_sample_indices], + max_best_of_in_batch, + seq_groups + if sampling_type == SamplingType.RANDOM_SEED else None, + ) else: seeded_args = {} if sampling_type == SamplingType.RANDOM else { "seq_groups": seq_groups, diff --git a/vllm/utils.py b/vllm/utils.py index ff41da012abc8..eb46936c1db78 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -668,7 +668,7 @@ def make_tensor_with_pad( def async_numpy_to_tensor(x: npt.NDArray, device: Union[str, torch.device]): """ - Make a tensor from a numpy array. Use pinned memory if possible. + Make a tensor from a numpy array asynchronously. Use pinned memory if possible. 
""" t = torch.from_numpy(x) if is_pin_memory_available(): From 5999bd36cea4e364bac6a84732c8f968827b131f Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Mon, 5 Aug 2024 17:21:05 +0800 Subject: [PATCH 04/22] Fallback for flashinfer sampler --- vllm/model_executor/layers/sampler.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 6a4eac7116cc2..daf5905edc951 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -482,8 +482,9 @@ def _multinomial( probs: torch.Tensor, num_samples: int, seq_groups: Optional[List[SequenceGroupToSample]] = None, + is_fallback: bool = False, ) -> torch.Tensor: - if num_samples > 1: + if num_samples > 1 and not is_fallback: probs = probs.repeat_interleave(num_samples, dim=0) if seq_groups is None: q = torch.empty_like(probs) @@ -535,12 +536,9 @@ def _top_k_top_p_multinomial_with_flashinfer( top_ps, ) if not success.all(): - warnings.warn("Sampling failed, fallback to greedy sampling.", + warnings.warn("Sampling with FlashInfer failed, fallback.", stacklevel=2) - probs = probs.masked_fill(torch.isnan(probs), 0.0) - argmax_ids = torch.argmax(probs, dim=-1) - batch_next_token_ids = torch.where(success, batch_next_token_ids, - argmax_ids) + return _multinomial(probs, num_samples, seq_groups, is_fallback=True) return batch_next_token_ids.view(-1, num_samples) From 420b004e3138b2e68e9f8b7cc47e4808fc3ffd28 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Mon, 5 Aug 2024 17:33:22 +0800 Subject: [PATCH 05/22] Formatting fix --- vllm/model_executor/layers/sampler.py | 4 ++-- vllm/utils.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index daf5905edc951..65506b76a3fb4 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -23,8 +23,8 @@ from vllm.utils import async_numpy_to_tensor try: - from flashinfer.sampling import (top_k_top_p_sampling_from_probs as - flashinfer_top_k_top_p_sampling) + from flashinfer.sampling import ( + top_k_top_p_sampling_from_probs as flashinfer_top_k_top_p_sampling) except ImportError: flashinfer_top_k_top_p_sampling = None diff --git a/vllm/utils.py b/vllm/utils.py index eb46936c1db78..f009ade696475 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -668,7 +668,8 @@ def make_tensor_with_pad( def async_numpy_to_tensor(x: npt.NDArray, device: Union[str, torch.device]): """ - Make a tensor from a numpy array asynchronously. Use pinned memory if possible. + Make a tensor from a numpy array asynchronously. Use pinned memory + if possible. """ t = torch.from_numpy(x) if is_pin_memory_available(): From 98d372e1538596ca3ced16a6b9eab1b51748c86e Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Mon, 5 Aug 2024 17:33:54 +0800 Subject: [PATCH 06/22] Tests fix --- tests/samplers/test_sampler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 4a85075bc231d..38e59fbe04ebf 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -627,7 +627,8 @@ def mock_sample(probs, *args, **kwargs): # top-k and top-p is only calculated when flashinfer kernel is not available with patch("vllm.model_executor.layers.sampler._sample", mock_sample), \ - patch("vllm.model_executor.layers.sampler.HAS_FLASHINFER", False): + patch("vllm.model_executor.layers.sampler." 
+ "flashinfer_top_k_top_p_sampling", None): sampler(logits=fake_logits, sampling_metadata=sampling_metadata) assert sample_probs is not None From 0a8be18e52e0443477b66e5b34739ae5c1262220 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Mon, 5 Aug 2024 17:41:11 +0800 Subject: [PATCH 07/22] Fix mypy --- vllm/model_executor/layers/sampler.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 65506b76a3fb4..7411689403689 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -609,6 +609,9 @@ def _sample_with_torch( sampling_params = seq_group.sampling_params max_best_of_in_batch = max(max_best_of_in_batch, sampling_params.best_of) + seq_groups_arg = (None if sampling_type == SamplingType.RANDOM else + seq_groups) + if flashinfer_top_k_top_p_sampling is not None: multinomial_samples[ sampling_type] = _top_k_top_p_multinomial_with_flashinfer( @@ -616,17 +619,13 @@ def _sample_with_torch( sampling_tensors.top_ks[long_sample_indices], sampling_tensors.top_ps[long_sample_indices], max_best_of_in_batch, - seq_groups - if sampling_type == SamplingType.RANDOM_SEED else None, + seq_groups_arg, ) else: - seeded_args = {} if sampling_type == SamplingType.RANDOM else { - "seq_groups": seq_groups, - } - multinomial_samples[sampling_type] = _multinomial( - probs[long_sample_indices], max_best_of_in_batch, - **seeded_args) + probs[long_sample_indices], + max_best_of_in_batch, + seq_groups=seq_groups_arg) if sampled_token_ids_tensor is not None: # Store sampled tokens in output tensor. From f1706466a787bb41480211c972cf3d2a76d819e8 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Mon, 5 Aug 2024 17:49:24 +0800 Subject: [PATCH 08/22] Add test for flashinfer sampler --- .buildkite/test-pipeline.yaml | 6 +++++- vllm/model_executor/layers/sampler.py | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 573c3740f0bbb..c40677ccc5dd8 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -179,7 +179,11 @@ steps: - label: Samplers Test #mirror_hardwares: [amd] - command: pytest -v -s samplers + command: + - pytest -v -s samplers + # Test for flashinfer samplers + - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl + - pytest -v -s samplers - label: LogitsProcessor Test mirror_hardwares: [amd] diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 7411689403689..1bc9c8552328f 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -23,8 +23,8 @@ from vllm.utils import async_numpy_to_tensor try: - from flashinfer.sampling import ( - top_k_top_p_sampling_from_probs as flashinfer_top_k_top_p_sampling) + from flashinfer.sampling import (top_k_top_p_sampling_from_probs as + flashinfer_top_k_top_p_sampling) except ImportError: flashinfer_top_k_top_p_sampling = None From 88c8a985efa0fcdca5c8a0dd560c8ee2df4394b2 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Mon, 5 Aug 2024 17:59:35 +0800 Subject: [PATCH 09/22] Suppress yapf on import --- vllm/model_executor/layers/sampler.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 1bc9c8552328f..d88e24bf7c279 100644 --- 
a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -23,8 +23,11 @@ from vllm.utils import async_numpy_to_tensor try: - from flashinfer.sampling import (top_k_top_p_sampling_from_probs as - flashinfer_top_k_top_p_sampling) + # yapf: disable + from flashinfer.sampling import ( + top_k_top_p_sampling_from_probs as flashinfer_top_k_top_p_sampling) + + # yapf: enable except ImportError: flashinfer_top_k_top_p_sampling = None From c404cd56399ce9ced2af951f3b62891c8f455492 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Mon, 5 Aug 2024 18:02:36 +0800 Subject: [PATCH 10/22] Fix pipeline --- .buildkite/test-pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c40677ccc5dd8..f8ff1426666a5 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -179,7 +179,7 @@ steps: - label: Samplers Test #mirror_hardwares: [amd] - command: + commands: - pytest -v -s samplers # Test for flashinfer samplers - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl From c361a952a3818fdf625b9a593a5bd7157032547a Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Tue, 6 Aug 2024 17:56:54 +0800 Subject: [PATCH 11/22] Change back to torch generator, add env flags --- .buildkite/test-pipeline.yaml | 4 +--- vllm/envs.py | 5 ++++ vllm/model_executor/layers/sampler.py | 30 ++++++++++-------------- vllm/model_executor/sampling_metadata.py | 15 ++++++------ 4 files changed, 25 insertions(+), 29 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index f8ff1426666a5..55dc5dcd3b872 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -181,9 +181,7 @@ steps: #mirror_hardwares: [amd] commands: - pytest -v -s samplers - # Test for flashinfer samplers - - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl - - pytest -v -s samplers + - VLLM_NO_FLASHINFER_SAMPLER=1 pytest -v -s samplers - label: LogitsProcessor Test mirror_hardwares: [amd] diff --git a/vllm/envs.py b/vllm/envs.py index 9bcb26f8e5a64..dfa108510f833 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -28,6 +28,7 @@ VLLM_LOGGING_CONFIG_PATH: Optional[str] = None VLLM_TRACE_FUNCTION: int = 0 VLLM_ATTENTION_BACKEND: Optional[str] = None + VLLM_NO_FLASHINFER_SAMPLER: bool = False VLLM_PP_LAYER_PARTITION: Optional[str] = None VLLM_CPU_KVCACHE_SPACE: int = 0 VLLM_CPU_OMP_THREADS_BIND: str = "" @@ -238,6 +239,10 @@ def get_default_config_root(): "VLLM_ATTENTION_BACKEND": lambda: os.getenv("VLLM_ATTENTION_BACKEND", None), + # If set, vllm will not use flashinfer sampler + "VLLM_NO_FLASHINFER_SAMPLER": + lambda: bool(os.getenv("VLLM_NO_FLASHINFER_SAMPLER", 0)), + # Pipeline stage partition strategy "VLLM_PP_LAYER_PARTITION": lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None), diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index d88e24bf7c279..03ca2f8bd5c59 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -1,10 +1,10 @@ """A layer that samples the next tokens from the model's outputs.""" import itertools import warnings +from importlib.util import find_spec from math import inf from typing import Dict, List, Optional, Tuple -import numpy as np import torch import torch.nn as nn @@ -13,6 +13,7 @@ if 
HAS_TRITON: from vllm.model_executor.layers.ops.sample import sample as sample_triton +import vllm.envs as envs from vllm.model_executor.sampling_metadata import (SamplingMetadata, SamplingTensors, SequenceGroupToSample) @@ -20,15 +21,14 @@ from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, PromptLogprobs, SampleLogprobs, SamplerOutput, SequenceOutput) -from vllm.utils import async_numpy_to_tensor -try: +if not envs.VLLM_NO_FLASHINFER_SAMPLER and find_spec("flashinfer"): # yapf: disable from flashinfer.sampling import ( top_k_top_p_sampling_from_probs as flashinfer_top_k_top_p_sampling) # yapf: enable -except ImportError: +else: flashinfer_top_k_top_p_sampling = None # (num_token_ids, num_parent_ids) per sequence group. @@ -489,20 +489,18 @@ def _multinomial( ) -> torch.Tensor: if num_samples > 1 and not is_fallback: probs = probs.repeat_interleave(num_samples, dim=0) + q = torch.empty_like(probs) if seq_groups is None: - q = torch.empty_like(probs) q.exponential_() else: - q_ = np.empty(probs.shape) sample_idx = 0 for seq_group in seq_groups: seq_ids = seq_group.seq_ids stride = len(seq_ids) * num_samples assert seq_group.generator is not None - q_[sample_idx:sample_idx + stride] = \ - seq_group.generator.exponential(size=(stride, q_.shape[1])) + q[sample_idx:sample_idx + + stride].exponential_(generator=seq_group.generator) sample_idx += stride - q = async_numpy_to_tensor(q_, probs.device) return probs.div_(q).argmax(dim=1).view(-1, num_samples) @@ -515,23 +513,19 @@ def _top_k_top_p_multinomial_with_flashinfer( top_ks = top_ks.repeat_interleave(num_samples) top_ps = top_ps.repeat_interleave(num_samples) batch_size = probs.shape[0] + uniform_samples = torch.empty((max_top_k_round, batch_size), + device=probs.device) if seq_groups is None: - uniform_samples = torch.rand((max_top_k_round, batch_size), - device=probs.device) + uniform_samples.random_() else: - uniform_samples_cpu = np.empty((max_top_k_round, batch_size), - dtype=np.float32) sample_idx = 0 for seq_group in seq_groups: seq_ids = seq_group.seq_ids stride = len(seq_ids) * num_samples assert seq_group.generator is not None - uniform_samples_cpu[:, sample_idx:sample_idx + stride] = \ - seq_group.generator.random((max_top_k_round, stride), - dtype=np.float32) + uniform_samples[:, sample_idx:sample_idx + + stride].random_(generator=seq_group.generator) sample_idx += stride - uniform_samples = async_numpy_to_tensor(uniform_samples_cpu, - probs.device) batch_next_token_ids, success = flashinfer_top_k_top_p_sampling( probs, uniform_samples, diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index b3fb7e657df1b..0562d26f2318a 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -3,7 +3,6 @@ from dataclasses import dataclass from typing import Dict, List, Optional, Tuple -import numpy as np import torch from vllm.sampling_params import SamplingParams, SamplingType @@ -41,7 +40,7 @@ class SequenceGroupToSample: # prefill is enabled. query_len: Optional[int] # A random number generator for sampling. - generator: Optional[np.random.Generator] + generator: Optional[torch.Generator] # True if the sequence group is in prefill stage. False if it is in a # decode stage. 
is_prompt: bool @@ -214,19 +213,19 @@ def _prepare_seq_groups( seq_ids = list(seq_group_metadata.seq_data.keys()) sampling_params = seq_group_metadata.sampling_params is_prompt = seq_group_metadata.is_prompt - generator: Optional[np.random.Generator] = None + generator: Optional[torch.Generator] = None # If the current seq group is in decode stage, it is None. seq_len: Optional[int] = None query_len: Optional[int] = None prompt_logprob_indices: List[int] = [] sample_indices: List[int] = [] do_sample = seq_group_metadata.do_sample + seed = sampling_params.seed if seq_group_metadata.is_prompt: - if sampling_params.seed is not None: - generator = np.random.default_rng(seed=sampling_params.seed) - if generators is not None: - generators[seq_group_metadata.request_id] = generator + if seed is not None and generators is not None: + generator = torch.Generator(device=device).manual_seed(seed) + generators[seq_group_metadata.request_id] = generator num_prompts += 1 num_prefill_sample = len(seq_ids) @@ -243,7 +242,7 @@ def _prepare_seq_groups( prompt_logprob_len = 0 sample_len = len(seq_ids) if do_sample else 0 - if sampling_params.seed is not None and generators is not None: + if seed is not None and generators is not None: generator = generators.get(seq_group_metadata.request_id) # Update indices to select from the model output. From 99f7eccb163b6f6184ff0960a1ab0b5db136c96f Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Wed, 7 Aug 2024 13:26:21 +0800 Subject: [PATCH 12/22] rename env for flashinfer, rollback changes in utils --- vllm/envs.py | 6 +++--- vllm/model_executor/layers/sampler.py | 2 +- vllm/utils.py | 17 +++++------------ 3 files changed, 9 insertions(+), 16 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 77e1e4002bcd3..b58225cf14caa 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -29,7 +29,7 @@ VLLM_LOGGING_CONFIG_PATH: Optional[str] = None VLLM_TRACE_FUNCTION: int = 0 VLLM_ATTENTION_BACKEND: Optional[str] = None - VLLM_NO_FLASHINFER_SAMPLER: bool = False + VLLM_DISABLE_FLASHINFER_SAMPLER: bool = False VLLM_PP_LAYER_PARTITION: Optional[str] = None VLLM_CPU_KVCACHE_SPACE: int = 0 VLLM_CPU_OMP_THREADS_BIND: str = "" @@ -252,8 +252,8 @@ def get_default_config_root(): lambda: os.getenv("VLLM_ATTENTION_BACKEND", None), # If set, vllm will not use flashinfer sampler - "VLLM_NO_FLASHINFER_SAMPLER": - lambda: bool(os.getenv("VLLM_NO_FLASHINFER_SAMPLER", 0)), + "VLLM_DISABLE_FLASHINFER_SAMPLER": + lambda: bool(os.getenv("VLLM_DISABLE_FLASHINFER_SAMPLER", 0)), # Pipeline stage partition strategy "VLLM_PP_LAYER_PARTITION": diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 03ca2f8bd5c59..6f82862f2acd5 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -22,7 +22,7 @@ PromptLogprobs, SampleLogprobs, SamplerOutput, SequenceOutput) -if not envs.VLLM_NO_FLASHINFER_SAMPLER and find_spec("flashinfer"): +if not envs.VLLM_DISABLE_FLASHINFER_SAMPLER and find_spec("flashinfer"): # yapf: disable from flashinfer.sampling import ( top_k_top_p_sampling_from_probs as flashinfer_top_k_top_p_sampling) diff --git a/vllm/utils.py b/vllm/utils.py index f2a7b228c8077..7e50cc314d554 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -666,7 +666,7 @@ def make_tensor_with_pad( pad: T, dtype: torch.dtype, *, - device: Union[str, torch.device], + device: Optional[Union[str, torch.device]] = None, max_len: Optional[int] = None, pin_memory: bool = False, ) -> torch.Tensor: @@ -679,18 +679,11 @@ def 
make_tensor_with_pad( np_dtype = TORCH_DTYPE_TO_NUMPY_DTYPE[dtype] padded_x = make_ndarray_with_pad(x, pad, np_dtype, max_len=max_len) - return async_numpy_to_tensor(padded_x, device) + tensor = torch.from_numpy(padded_x).to(device) + if pin_memory: + tensor = tensor.pin_memory() - -def async_numpy_to_tensor(x: npt.NDArray, device: Union[str, torch.device]): - """ - Make a tensor from a numpy array asynchronously. Use pinned memory - if possible. - """ - t = torch.from_numpy(x) - if is_pin_memory_available(): - t = t.pin_memory() - return t.to(device, non_blocking=True) + return tensor def async_tensor_h2d( From 7e03711b14f78e292d83103591cf869791e83384 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Wed, 7 Aug 2024 13:28:03 +0800 Subject: [PATCH 13/22] rollback changes to utils --- vllm/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/utils.py b/vllm/utils.py index 7e50cc314d554..51bd72977a226 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -666,8 +666,8 @@ def make_tensor_with_pad( pad: T, dtype: torch.dtype, *, - device: Optional[Union[str, torch.device]] = None, max_len: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, pin_memory: bool = False, ) -> torch.Tensor: """ From 64160465b50f3f059607c873a434074123798f73 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Thu, 8 Aug 2024 10:01:59 +0800 Subject: [PATCH 14/22] rename env --- .buildkite/test-pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 7d9a90f18a1d7..ccafd406d5ae6 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -181,7 +181,7 @@ steps: - tests/samplers commands: - pytest -v -s samplers - - VLLM_NO_FLASHINFER_SAMPLER=1 pytest -v -s samplers + - VLLM_DISABLE_FLASHINFER_SAMPLER=1 pytest -v -s samplers - label: LogitsProcessor Test # 5min mirror_hardwares: [amd] From fdc23a35214fd569b18ba79048415db3fd511a97 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Thu, 8 Aug 2024 11:35:09 +0800 Subject: [PATCH 15/22] add top_k_top_p when fallback --- vllm/model_executor/layers/sampler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 6f82862f2acd5..af9378ead4d7a 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -535,6 +535,7 @@ def _top_k_top_p_multinomial_with_flashinfer( if not success.all(): warnings.warn("Sampling with FlashInfer failed, fallback.", stacklevel=2) + probs = _apply_top_k_top_p(probs, top_ps, top_ks) return _multinomial(probs, num_samples, seq_groups, is_fallback=True) return batch_next_token_ids.view(-1, num_samples) From b97c911e7bf824d4e0714d7ceed8eed5ad207290 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Mon, 12 Aug 2024 15:00:47 +0800 Subject: [PATCH 16/22] Adapt flashinfer 0.1.4 --- tests/samplers/test_sampler.py | 32 +++++++++++++++++++++++++++ vllm/envs.py | 2 +- vllm/model_executor/layers/sampler.py | 18 ++++++++------- 3 files changed, 43 insertions(+), 9 deletions(-) diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 38e59fbe04ebf..e436096484a1a 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -7,6 +7,7 @@ import torch from transformers import GenerationConfig, GenerationMixin +import vllm.envs as envs from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.sampling_metadata import SamplingMetadata 
from vllm.model_executor.utils import set_random_seed @@ -639,6 +640,37 @@ def mock_sample(probs, *args, **kwargs): assert torch.equal(hf_probs.eq(0), sample_probs.eq(0)) +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_flashinfer_fallback(seed: int, device: str): + if envs.VLLM_DISABLE_FLASHINFER_SAMPLER: + pytest.skip("Flashinfer sampler is disabled") + + set_random_seed(seed) + torch.set_default_device(device) + batch_size = random.randint(1, 256) + _, fake_logits, sampler = _prepare_test(batch_size) + + def failing_flashinfer_sampling(*_args, **_kwargs): + return None, torch.zeros(batch_size, device=device, dtype=torch.int32) + + sampling_params = SamplingParams( + temperature=1.0, + n=random.randint(1, 10), + seed=random.randint(0, 10000), + ) + sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + + with patch( + "vllm.model_executor.layers.sampler." + "flashinfer_top_k_top_p_sampling", failing_flashinfer_sampling): + fallback_sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + + assert sampler_output == fallback_sampler_output + + @pytest.mark.parametrize("device", CUDA_DEVICES) def test_sampler_repetition_penalty_mixed(device: str): diff --git a/vllm/envs.py b/vllm/envs.py index b58225cf14caa..523f1600ac4e3 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -253,7 +253,7 @@ def get_default_config_root(): # If set, vllm will not use flashinfer sampler "VLLM_DISABLE_FLASHINFER_SAMPLER": - lambda: bool(os.getenv("VLLM_DISABLE_FLASHINFER_SAMPLER", 0)), + lambda: bool(int(os.getenv("VLLM_DISABLE_FLASHINFER_SAMPLER", "0"))), # Pipeline stage partition strategy "VLLM_PP_LAYER_PARTITION": diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index af9378ead4d7a..a5dd69872542e 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -23,6 +23,7 @@ SequenceOutput) if not envs.VLLM_DISABLE_FLASHINFER_SAMPLER and find_spec("flashinfer"): + import flashinfer.sampling # yapf: disable from flashinfer.sampling import ( top_k_top_p_sampling_from_probs as flashinfer_top_k_top_p_sampling) @@ -485,9 +486,8 @@ def _multinomial( probs: torch.Tensor, num_samples: int, seq_groups: Optional[List[SequenceGroupToSample]] = None, - is_fallback: bool = False, ) -> torch.Tensor: - if num_samples > 1 and not is_fallback: + if num_samples > 1: probs = probs.repeat_interleave(num_samples, dim=0) q = torch.empty_like(probs) if seq_groups is None: @@ -516,7 +516,7 @@ def _top_k_top_p_multinomial_with_flashinfer( uniform_samples = torch.empty((max_top_k_round, batch_size), device=probs.device) if seq_groups is None: - uniform_samples.random_() + uniform_samples.uniform_() else: sample_idx = 0 for seq_group in seq_groups: @@ -524,7 +524,7 @@ def _top_k_top_p_multinomial_with_flashinfer( stride = len(seq_ids) * num_samples assert seq_group.generator is not None uniform_samples[:, sample_idx:sample_idx + - stride].random_(generator=seq_group.generator) + stride].uniform_(generator=seq_group.generator) sample_idx += stride batch_next_token_ids, success = flashinfer_top_k_top_p_sampling( probs, @@ -533,10 +533,12 @@ def _top_k_top_p_multinomial_with_flashinfer( top_ps, ) if not success.all(): - warnings.warn("Sampling with FlashInfer failed, fallback.", - stacklevel=2) - probs = _apply_top_k_top_p(probs, top_ps, top_ks) - return _multinomial(probs, num_samples, seq_groups, is_fallback=True) + 
warnings.warn("FlashInfer rejection sampling failed, fallback.", + stacklevel=1) + probs = flashinfer.sampling.top_k_renorm_prob(probs, top_ks) + probs = flashinfer.sampling.top_p_renorm_prob(probs, top_ps) + batch_next_token_ids = flashinfer.sampling.sampling_from_probs( + probs, uniform_samples[0]) return batch_next_token_ids.view(-1, num_samples) From f8d709344753efb3a9be6ebd745e8dbb1e4d6724 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Mon, 12 Aug 2024 15:01:08 +0800 Subject: [PATCH 17/22] Revert changes to sampling_metadata --- vllm/model_executor/sampling_metadata.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 0562d26f2318a..015e85b4ca81d 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -220,12 +220,13 @@ def _prepare_seq_groups( prompt_logprob_indices: List[int] = [] sample_indices: List[int] = [] do_sample = seq_group_metadata.do_sample - seed = sampling_params.seed if seq_group_metadata.is_prompt: - if seed is not None and generators is not None: - generator = torch.Generator(device=device).manual_seed(seed) - generators[seq_group_metadata.request_id] = generator + if sampling_params.seed is not None: + generator = torch.Generator(device=device).manual_seed( + sampling_params.seed) + if generators is not None: + generators[seq_group_metadata.request_id] = generator num_prompts += 1 num_prefill_sample = len(seq_ids) @@ -242,7 +243,7 @@ def _prepare_seq_groups( prompt_logprob_len = 0 sample_len = len(seq_ids) if do_sample else 0 - if seed is not None and generators is not None: + if sampling_params.seed is not None and generators is not None: generator = generators.get(seq_group_metadata.request_id) # Update indices to select from the model output. From 2d7e5c340a7682fe7332f66df79ad9de1d6b9ce3 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Mon, 12 Aug 2024 17:14:41 +0800 Subject: [PATCH 18/22] Change flashinfer 0.1.2 to 0.1.4 in test --- .buildkite/test-pipeline.yaml | 8 ++++---- Dockerfile | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index ccafd406d5ae6..c20573a7021d6 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -61,7 +61,7 @@ steps: - tests/basic_correctness commands: # This flashinfer installation will fail on AMD ROCm, so it is set as optional. 
- - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl || true + - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.4/flashinfer-0.1.4+cu121torch2.4-cp310-cp310-linux_x86_64.whl || true - pytest -v -s basic_correctness/test_basic_correctness.py - pytest -v -s basic_correctness/test_cpu_offload.py - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py @@ -156,7 +156,7 @@ steps: - vllm/ - tests/models commands: - - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl + - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.4/flashinfer-0.1.4+cu121torch2.4-cp310-cp310-linux_x86_64.whl - pytest -v -s models -m \"not vlm\" - label: Vision Language Models Test # 42min @@ -213,7 +213,7 @@ steps: - vllm/attention - tests/kernels commands: - - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl + - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.4/flashinfer-0.1.4+cu121torch2.4-cp310-cp310-linux_x86_64.whl - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT parallelism: 4 @@ -331,7 +331,7 @@ steps: # NOTE: don't test llama model here, it seems hf implementation is buggy # see https://github.com/vllm-project/vllm/pull/5689 for details - pytest -v -s distributed/test_custom_all_reduce.py - - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl + - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.4/flashinfer-0.1.4+cu121torch2.4-cp310-cp310-linux_x86_64.whl - TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py - pytest -v -s -x lora/test_mixtral.py diff --git a/Dockerfile b/Dockerfile index 49aaea2949ac6..c13cb5c7e7a95 100644 --- a/Dockerfile +++ b/Dockerfile @@ -194,7 +194,7 @@ RUN --mount=type=bind,from=mamba-builder,src=/usr/src/mamba,target=/usr/src/mamb python3 -m pip install /usr/src/mamba/*.whl --no-cache-dir RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl + python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.4/flashinfer-0.1.4+cu121torch2.4-cp310-cp310-linux_x86_64.whl #################### vLLM installation IMAGE #################### From f8931107e176cfef2813109e32f3e7f2ddbe6193 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Thu, 15 Aug 2024 13:49:47 +0800 Subject: [PATCH 19/22] Disable flashinfer in GPTQ reproduce test --- tests/basic_correctness/test_cpu_offload.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/basic_correctness/test_cpu_offload.py b/tests/basic_correctness/test_cpu_offload.py index f0f09ee63c0e6..563e27129c207 100644 --- a/tests/basic_correctness/test_cpu_offload.py +++ b/tests/basic_correctness/test_cpu_offload.py @@ -1,3 +1,5 @@ +from unittest.mock import patch + import pytest from tests.quantization.utils import is_quant_method_supported @@ -29,9 +31,12 @@ def test_cpu_offload_gptq(): compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", [], 
["--cpu-offload-gb", "1"]) # Test GPTQ - compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", - ["--quantization", "gptq"], - ["--quantization", "gptq", "--cpu-offload-gb", "1"]) + # The model output logits has small variance between runs, which do not play + # well with the flashinfer sampler. + with patch.dict("os.environ", {"VLLM_DISABLE_FLASHINFER_SAMPLER": "1"}): + compare_two_settings( + "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", ["--quantization", "gptq"], + ["--quantization", "gptq", "--cpu-offload-gb", "1"]) @pytest.mark.skipif(not is_quant_method_supported("awq_marlin"), From e4cfcfcef03d982216a48bdfe673ab9c56a0a185 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Thu, 15 Aug 2024 18:36:52 +0800 Subject: [PATCH 20/22] Disable flashinfer sampler in distributed test --- .buildkite/test-pipeline.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index de517ddc8c4df..d7abe0728513a 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -274,7 +274,7 @@ steps: commands: - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py + - VLLM_MULTI_NODE=1 VLLM_DISABLE_FLASHINFER_SAMPLER=1 pytest -v -s distributed/test_pipeline_parallel.py - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py @@ -302,7 +302,7 @@ steps: - vllm/ - tests/distributed/test_pipeline_parallel commands: - - pytest -v -s distributed/test_pipeline_parallel.py + - VLLM_DISABLE_FLASHINFER_SAMPLER=1 pytest -v -s distributed/test_pipeline_parallel.py - label: LoRA Long Context (Distributed) # 11min # This test runs llama 13B, so it is required to run on 4 GPUs. 
From 0ec8b614e9940fb0d0291c964c104859a2324162 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Fri, 16 Aug 2024 09:38:37 +0800 Subject: [PATCH 21/22] Disable flashinfer sampler by default --- .buildkite/test-pipeline.yaml | 6 +++--- tests/basic_correctness/test_cpu_offload.py | 11 +++-------- tests/samplers/test_sampler.py | 2 +- vllm/envs.py | 6 +++--- vllm/model_executor/layers/sampler.py | 2 +- 5 files changed, 11 insertions(+), 16 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index dc8b4d03be4af..7d62713735bae 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -184,7 +184,7 @@ steps: - tests/samplers commands: - pytest -v -s samplers - - VLLM_DISABLE_FLASHINFER_SAMPLER=1 pytest -v -s samplers + - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers - label: LogitsProcessor Test # 5min mirror_hardwares: [amd] @@ -279,7 +279,7 @@ steps: commands: - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - - VLLM_MULTI_NODE=1 VLLM_DISABLE_FLASHINFER_SAMPLER=1 pytest -v -s distributed/test_pipeline_parallel.py + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py @@ -309,7 +309,7 @@ steps: - vllm/ - tests/distributed/test_pipeline_parallel commands: - - VLLM_DISABLE_FLASHINFER_SAMPLER=1 pytest -v -s distributed/test_pipeline_parallel.py + - pytest -v -s distributed/test_pipeline_parallel.py - label: LoRA Long Context (Distributed) # 11min # This test runs llama 13B, so it is required to run on 4 GPUs. diff --git a/tests/basic_correctness/test_cpu_offload.py b/tests/basic_correctness/test_cpu_offload.py index 563e27129c207..f0f09ee63c0e6 100644 --- a/tests/basic_correctness/test_cpu_offload.py +++ b/tests/basic_correctness/test_cpu_offload.py @@ -1,5 +1,3 @@ -from unittest.mock import patch - import pytest from tests.quantization.utils import is_quant_method_supported @@ -31,12 +29,9 @@ def test_cpu_offload_gptq(): compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", [], ["--cpu-offload-gb", "1"]) # Test GPTQ - # The model output logits has small variance between runs, which do not play - # well with the flashinfer sampler. 
- with patch.dict("os.environ", {"VLLM_DISABLE_FLASHINFER_SAMPLER": "1"}): - compare_two_settings( - "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", ["--quantization", "gptq"], - ["--quantization", "gptq", "--cpu-offload-gb", "1"]) + compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", + ["--quantization", "gptq"], + ["--quantization", "gptq", "--cpu-offload-gb", "1"]) @pytest.mark.skipif(not is_quant_method_supported("awq_marlin"), diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index fc0048ca955ee..06c3b3b7148b4 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -643,7 +643,7 @@ def mock_sample(probs, *args, **kwargs): @pytest.mark.parametrize("seed", RANDOM_SEEDS) @pytest.mark.parametrize("device", CUDA_DEVICES) def test_flashinfer_fallback(seed: int, device: str): - if envs.VLLM_DISABLE_FLASHINFER_SAMPLER: + if not envs.VLLM_USE_FLASHINFER_SAMPLER: pytest.skip("Flashinfer sampler is disabled") set_random_seed(seed) diff --git a/vllm/envs.py b/vllm/envs.py index 1fd1a33607e57..92ed243899d47 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -30,7 +30,7 @@ VLLM_LOGGING_CONFIG_PATH: Optional[str] = None VLLM_TRACE_FUNCTION: int = 0 VLLM_ATTENTION_BACKEND: Optional[str] = None - VLLM_DISABLE_FLASHINFER_SAMPLER: bool = False + VLLM_USE_FLASHINFER_SAMPLER: bool = False VLLM_PP_LAYER_PARTITION: Optional[str] = None VLLM_CPU_KVCACHE_SPACE: int = 0 VLLM_CPU_OMP_THREADS_BIND: str = "" @@ -258,8 +258,8 @@ def get_default_config_root(): lambda: os.getenv("VLLM_ATTENTION_BACKEND", None), # If set, vllm will not use flashinfer sampler - "VLLM_DISABLE_FLASHINFER_SAMPLER": - lambda: bool(int(os.getenv("VLLM_DISABLE_FLASHINFER_SAMPLER", "0"))), + "VLLM_USE_FLASHINFER_SAMPLER": + lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_SAMPLER", "0"))), # Pipeline stage partition strategy "VLLM_PP_LAYER_PARTITION": diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index b4f2bfde1c4b1..7344d59e988f0 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -22,7 +22,7 @@ PromptLogprobs, SampleLogprobs, SamplerOutput, SequenceOutput) -if not envs.VLLM_DISABLE_FLASHINFER_SAMPLER and find_spec("flashinfer"): +if envs.VLLM_USE_FLASHINFER_SAMPLER and find_spec("flashinfer"): import flashinfer.sampling # yapf: disable from flashinfer.sampling import ( From 9eaea5cf1f990f6b16e878a88044c90e322c045d Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Sat, 17 Aug 2024 09:11:04 +0800 Subject: [PATCH 22/22] Update vllm/envs.py Co-authored-by: Michael Goin --- vllm/envs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/envs.py b/vllm/envs.py index 92ed243899d47..115ead01f537d 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -257,7 +257,7 @@ def get_default_config_root(): "VLLM_ATTENTION_BACKEND": lambda: os.getenv("VLLM_ATTENTION_BACKEND", None), - # If set, vllm will not use flashinfer sampler + # If set, vllm will use flashinfer sampler "VLLM_USE_FLASHINFER_SAMPLER": lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_SAMPLER", "0"))),