From 7e47c2d25399d476ef667fc21645cee9b3e1f8be Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Fri, 2 Aug 2024 14:23:06 +0800 Subject: [PATCH 01/22] Use flashinfer kernel to do sampling if available --- tests/samplers/test_sampler.py | 4 +- vllm/model_executor/layers/sampler.py | 106 ++++++++++++++++++----- vllm/model_executor/sampling_metadata.py | 8 +- vllm/utils.py | 16 ++-- 4 files changed, 101 insertions(+), 33 deletions(-) diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index bf062e4a5c09d..4a85075bc231d 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -625,7 +625,9 @@ def mock_sample(probs, *args, **kwargs): return ([[prob.topk(1, dim=-1).indices.tolist(), [0]] for prob in probs], None) - with patch("vllm.model_executor.layers.sampler._sample", mock_sample): + # top-k and top-p is only calculated when flashinfer kernel is not available + with patch("vllm.model_executor.layers.sampler._sample", mock_sample), \ + patch("vllm.model_executor.layers.sampler.HAS_FLASHINFER", False): sampler(logits=fake_logits, sampling_metadata=sampling_metadata) assert sample_probs is not None diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 6632b1c434582..97afdf1bea93c 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -1,8 +1,10 @@ """A layer that samples the next tokens from the model's outputs.""" import itertools +import warnings from math import inf from typing import Dict, List, Optional, Tuple +import numpy as np import torch import torch.nn as nn @@ -18,6 +20,13 @@ from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, PromptLogprobs, SampleLogprobs, SamplerOutput, SequenceOutput) +from vllm.utils import async_numpy_to_tensor + +HAS_FLASHINFER = True +try: + from flashinfer.sampling import top_k_top_p_sampling_from_probs +except ImportError: + HAS_FLASHINFER = False # (num_token_ids, num_parent_ids) per sequence group. SampleResultType = List[Tuple[List[int], List[int]]] @@ -121,7 +130,7 @@ def forward( # Use in-place division to avoid creating a new tensor. logits.div_(sampling_tensors.temperatures.unsqueeze(dim=1)) - if do_top_p_top_k: + if do_top_p_top_k and not HAS_FLASHINFER: logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps, sampling_tensors.top_ks) @@ -475,32 +484,69 @@ def _multinomial( seq_groups: Optional[List[SequenceGroupToSample]] = None, ) -> torch.Tensor: if num_samples > 1: - # This is equivalent to torch.repeat_interleaved (which also - # forces a GPU<->CPU sync). - # This allows us to do sampling with replacement by creating - # num_samples copies of each row in the tensor, and then - # batch sampling the resulting tensor. 
- probs = probs[:, None, :].expand(probs.shape[0], num_samples, - probs.shape[1]).contiguous().view( - -1, probs.shape[1]) - q = torch.empty_like(probs) + probs = probs.repeat_interleave(num_samples, dim=0) if seq_groups is None: + q = torch.empty_like(probs) q.exponential_() else: + q_ = np.empty(probs.shape) sample_idx = 0 for seq_group in seq_groups: seq_ids = seq_group.seq_ids - next_sample_idx = sample_idx + len(seq_ids) * num_samples - q[sample_idx:next_sample_idx].exponential_( - generator=seq_group.generator) - sample_idx = next_sample_idx + stride = len(seq_ids) * num_samples + assert seq_group.generator is not None + q_[sample_idx:sample_idx + stride] = \ + seq_group.generator.exponential(size=(stride, q_.shape[1])) + sample_idx += stride + q = async_numpy_to_tensor(q_, probs.device) return probs.div_(q).argmax(dim=1).view(-1, num_samples) +def _top_k_top_p_multinomial( + probs: torch.Tensor, top_ks: torch.Tensor, top_ps: torch.Tensor, + num_samples: int, seq_groups: Optional[List[SequenceGroupToSample]]): + max_top_k_round = 32 + if num_samples > 1: + probs = probs.repeat_interleave(num_samples, dim=0) + top_ks = top_ks.repeat_interleave(num_samples) + top_ps = top_ps.repeat_interleave(num_samples) + batch_size = probs.shape[0] + if seq_groups is None: + uniform_samples = torch.rand((max_top_k_round, batch_size), + device=probs.device) + else: + uniform_samples_cpu = np.empty((max_top_k_round, batch_size)) + sample_idx = 0 + for seq_group in seq_groups: + seq_ids = seq_group.seq_ids + stride = len(seq_ids) * num_samples + assert seq_group.generator is not None + uniform_samples_cpu[:, sample_idx:sample_idx + stride] = \ + seq_group.generator.random((max_top_k_round, stride)) + sample_idx += stride + uniform_samples = async_numpy_to_tensor(uniform_samples_cpu, + probs.device) + batch_next_token_ids, success = top_k_top_p_sampling_from_probs( + probs, + uniform_samples, + top_ks, + top_ps, + ) + if not success.all(): + warnings.warn("Sampling failed, fallback to greedy sampling.", + stacklevel=2) + probs = probs.masked_fill(torch.isnan(probs), 0.0) + argmax_ids = torch.argmax(probs, dim=-1) + batch_next_token_ids = torch.where(success, batch_next_token_ids, + argmax_ids) + return batch_next_token_ids.view(-1, num_samples) + + def _sample_with_torch( probs: torch.Tensor, logprobs: torch.Tensor, sampling_metadata: SamplingMetadata, + sampling_tensors: SamplingTensors, include_gpu_probs_tensor: bool, modify_greedy_probs: bool, ) -> Tuple[SampleResultType, Optional[torch.Tensor]]: @@ -563,13 +609,23 @@ def _sample_with_torch( sampling_params = seq_group.sampling_params max_best_of_in_batch = max(max_best_of_in_batch, sampling_params.best_of) - seeded_args = {} if sampling_type == SamplingType.RANDOM else { - "seq_groups": seq_groups, - } + if HAS_FLASHINFER: + multinomial_samples[sampling_type] = _top_k_top_p_multinomial( + probs[long_sample_indices], + sampling_tensors.top_ks[long_sample_indices], + sampling_tensors.top_ps[long_sample_indices], + max_best_of_in_batch, + seq_groups + if sampling_type == SamplingType.RANDOM_SEED else None, + ) + else: + seeded_args = {} if sampling_type == SamplingType.RANDOM else { + "seq_groups": seq_groups, + } - multinomial_samples[sampling_type] = _multinomial( - probs[long_sample_indices], max_best_of_in_batch, - **seeded_args) + multinomial_samples[sampling_type] = _multinomial( + probs[long_sample_indices], max_best_of_in_batch, + **seeded_args) if sampled_token_ids_tensor is not None: # Store sampled tokens in output tensor. 
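A note on the `_multinomial` helper rewritten in the hunk above: it draws one token per row by dividing the probabilities by i.i.d. Exponential(1) noise and taking the argmax, which gives the same distribution as `torch.multinomial`. A minimal torch-only sketch of that equivalence, standalone and illustrative rather than part of the patch:

import torch

torch.manual_seed(0)
probs = torch.tensor([0.1, 0.2, 0.7])

def sample_once(p: torch.Tensor) -> int:
    # argmax_i p_i / q_i with q_i ~ Exp(1) equals argmin_i q_i / p_i,
    # and q_i / p_i ~ Exp(rate=p_i), so index i wins with probability
    # proportional to p_i.
    q = torch.empty_like(p).exponential_()
    return int(torch.argmax(p / q))

counts = torch.zeros_like(probs)
for _ in range(20_000):
    counts[sample_once(probs)] += 1
print(counts / counts.sum())  # empirically close to [0.1, 0.2, 0.7]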
@@ -692,9 +748,12 @@ def _sample_with_triton_kernel( def _sample( - probs: torch.Tensor, logprobs: torch.Tensor, - sampling_metadata: SamplingMetadata, sampling_tensors: SamplingTensors, - include_gpu_probs_tensor: bool, modify_greedy_probs: bool + probs: torch.Tensor, + logprobs: torch.Tensor, + sampling_metadata: SamplingMetadata, + sampling_tensors: SamplingTensors, + include_gpu_probs_tensor: bool, + modify_greedy_probs: bool, ) -> Tuple[SampleResultType, Optional[torch.Tensor]]: """ Args: @@ -712,6 +771,7 @@ def _sample( probs, logprobs, sampling_metadata, + sampling_tensors, include_gpu_probs_tensor=include_gpu_probs_tensor, modify_greedy_probs=modify_greedy_probs, ) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 59cfec9ec8934..745bb971298cc 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -3,6 +3,7 @@ from dataclasses import dataclass from typing import Dict, List, Optional, Tuple +import numpy as np import torch from vllm.sampling_params import SamplingParams, SamplingType @@ -38,7 +39,7 @@ class SequenceGroupToSample: # prefill is enabled. query_len: Optional[int] # A random number generator for sampling. - generator: Optional[torch.Generator] + generator: Optional[np.random.Generator] # True if the sequence group is in prefill stage. False if it is in a # decode stage. is_prompt: bool @@ -211,7 +212,7 @@ def _prepare_seq_groups( seq_ids = list(seq_group_metadata.seq_data.keys()) sampling_params = seq_group_metadata.sampling_params is_prompt = seq_group_metadata.is_prompt - generator: Optional[torch.Generator] = None + generator: Optional[np.random.Generator] = None # If the current seq group is in decode stage, it is None. seq_len: Optional[int] = None query_len: Optional[int] = None @@ -221,8 +222,7 @@ def _prepare_seq_groups( if seq_group_metadata.is_prompt: if sampling_params.seed is not None: - generator = torch.Generator(device=device).manual_seed( - sampling_params.seed) + generator = np.random.default_rng(seed=sampling_params.seed) if generators is not None: generators[seq_group_metadata.request_id] = generator diff --git a/vllm/utils.py b/vllm/utils.py index 1448316e66edb..afe81f5884a49 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -647,8 +647,8 @@ def make_tensor_with_pad( pad: T, dtype: torch.dtype, *, + device: Union[str, torch.device], max_len: Optional[int] = None, - device: Optional[Union[str, torch.device]] = None, pin_memory: bool = False, ) -> torch.Tensor: """ @@ -660,11 +660,17 @@ def make_tensor_with_pad( np_dtype = TORCH_DTYPE_TO_NUMPY_DTYPE[dtype] padded_x = make_ndarray_with_pad(x, pad, np_dtype, max_len=max_len) - tensor = torch.from_numpy(padded_x).to(device) - if pin_memory: - tensor = tensor.pin_memory() + return async_numpy_to_tensor(padded_x, device) - return tensor + +def async_numpy_to_tensor(x: npt.NDArray, device: Union[str, torch.device]): + """ + Make a tensor from a numpy array. Use pinned memory if possible. 
+ """ + t = torch.from_numpy(x) + if is_pin_memory_available(): + t = t.pin_memory() + return t.to(device, non_blocking=True) def async_tensor_h2d( From 56beab00dde99c63accd9961e569b42c06099b68 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Mon, 5 Aug 2024 15:03:56 +0800 Subject: [PATCH 02/22] Fix type mismatch --- vllm/model_executor/layers/sampler.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 97afdf1bea93c..0325448ed8ab8 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -515,14 +515,16 @@ def _top_k_top_p_multinomial( uniform_samples = torch.rand((max_top_k_round, batch_size), device=probs.device) else: - uniform_samples_cpu = np.empty((max_top_k_round, batch_size)) + uniform_samples_cpu = np.empty((max_top_k_round, batch_size), + dtype=np.float32) sample_idx = 0 for seq_group in seq_groups: seq_ids = seq_group.seq_ids stride = len(seq_ids) * num_samples assert seq_group.generator is not None uniform_samples_cpu[:, sample_idx:sample_idx + stride] = \ - seq_group.generator.random((max_top_k_round, stride)) + seq_group.generator.random((max_top_k_round, stride), + dtype=np.float32) sample_idx += stride uniform_samples = async_numpy_to_tensor(uniform_samples_cpu, probs.device) @@ -629,8 +631,8 @@ def _sample_with_torch( if sampled_token_ids_tensor is not None: # Store sampled tokens in output tensor. - sampled_token_ids_tensor[ - long_sample_indices] = multinomial_samples[sampling_type] + sampled_token_ids_tensor[long_sample_indices] = \ + multinomial_samples[sampling_type].to(torch.long) elif sampling_type == SamplingType.BEAM: beam_search_logprobs = logprobs[sample_indices] From 5396c9ddc26e2898d0f83ee72b085af70f9b98e6 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Mon, 5 Aug 2024 15:46:43 +0800 Subject: [PATCH 03/22] Some renaming --- vllm/model_executor/layers/sampler.py | 31 ++++++++++++++------------- vllm/utils.py | 2 +- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 0325448ed8ab8..6a4eac7116cc2 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -22,11 +22,11 @@ SequenceOutput) from vllm.utils import async_numpy_to_tensor -HAS_FLASHINFER = True try: - from flashinfer.sampling import top_k_top_p_sampling_from_probs + from flashinfer.sampling import (top_k_top_p_sampling_from_probs as + flashinfer_top_k_top_p_sampling) except ImportError: - HAS_FLASHINFER = False + flashinfer_top_k_top_p_sampling = None # (num_token_ids, num_parent_ids) per sequence group. SampleResultType = List[Tuple[List[int], List[int]]] @@ -130,7 +130,7 @@ def forward( # Use in-place division to avoid creating a new tensor. 
logits.div_(sampling_tensors.temperatures.unsqueeze(dim=1)) - if do_top_p_top_k and not HAS_FLASHINFER: + if do_top_p_top_k and flashinfer_top_k_top_p_sampling is None: logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps, sampling_tensors.top_ks) @@ -502,7 +502,7 @@ def _multinomial( return probs.div_(q).argmax(dim=1).view(-1, num_samples) -def _top_k_top_p_multinomial( +def _top_k_top_p_multinomial_with_flashinfer( probs: torch.Tensor, top_ks: torch.Tensor, top_ps: torch.Tensor, num_samples: int, seq_groups: Optional[List[SequenceGroupToSample]]): max_top_k_round = 32 @@ -528,7 +528,7 @@ def _top_k_top_p_multinomial( sample_idx += stride uniform_samples = async_numpy_to_tensor(uniform_samples_cpu, probs.device) - batch_next_token_ids, success = top_k_top_p_sampling_from_probs( + batch_next_token_ids, success = flashinfer_top_k_top_p_sampling( probs, uniform_samples, top_ks, @@ -611,15 +611,16 @@ def _sample_with_torch( sampling_params = seq_group.sampling_params max_best_of_in_batch = max(max_best_of_in_batch, sampling_params.best_of) - if HAS_FLASHINFER: - multinomial_samples[sampling_type] = _top_k_top_p_multinomial( - probs[long_sample_indices], - sampling_tensors.top_ks[long_sample_indices], - sampling_tensors.top_ps[long_sample_indices], - max_best_of_in_batch, - seq_groups - if sampling_type == SamplingType.RANDOM_SEED else None, - ) + if flashinfer_top_k_top_p_sampling is not None: + multinomial_samples[ + sampling_type] = _top_k_top_p_multinomial_with_flashinfer( + probs[long_sample_indices], + sampling_tensors.top_ks[long_sample_indices], + sampling_tensors.top_ps[long_sample_indices], + max_best_of_in_batch, + seq_groups + if sampling_type == SamplingType.RANDOM_SEED else None, + ) else: seeded_args = {} if sampling_type == SamplingType.RANDOM else { "seq_groups": seq_groups, diff --git a/vllm/utils.py b/vllm/utils.py index ff41da012abc8..eb46936c1db78 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -668,7 +668,7 @@ def make_tensor_with_pad( def async_numpy_to_tensor(x: npt.NDArray, device: Union[str, torch.device]): """ - Make a tensor from a numpy array. Use pinned memory if possible. + Make a tensor from a numpy array asynchronously. Use pinned memory if possible. 
""" t = torch.from_numpy(x) if is_pin_memory_available(): From 5999bd36cea4e364bac6a84732c8f968827b131f Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Mon, 5 Aug 2024 17:21:05 +0800 Subject: [PATCH 04/22] Fallback for flashinfer sampler --- vllm/model_executor/layers/sampler.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 6a4eac7116cc2..daf5905edc951 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -482,8 +482,9 @@ def _multinomial( probs: torch.Tensor, num_samples: int, seq_groups: Optional[List[SequenceGroupToSample]] = None, + is_fallback: bool = False, ) -> torch.Tensor: - if num_samples > 1: + if num_samples > 1 and not is_fallback: probs = probs.repeat_interleave(num_samples, dim=0) if seq_groups is None: q = torch.empty_like(probs) @@ -535,12 +536,9 @@ def _top_k_top_p_multinomial_with_flashinfer( top_ps, ) if not success.all(): - warnings.warn("Sampling failed, fallback to greedy sampling.", + warnings.warn("Sampling with FlashInfer failed, fallback.", stacklevel=2) - probs = probs.masked_fill(torch.isnan(probs), 0.0) - argmax_ids = torch.argmax(probs, dim=-1) - batch_next_token_ids = torch.where(success, batch_next_token_ids, - argmax_ids) + return _multinomial(probs, num_samples, seq_groups, is_fallback=True) return batch_next_token_ids.view(-1, num_samples) From 420b004e3138b2e68e9f8b7cc47e4808fc3ffd28 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Mon, 5 Aug 2024 17:33:22 +0800 Subject: [PATCH 05/22] Formatting fix --- vllm/model_executor/layers/sampler.py | 4 ++-- vllm/utils.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index daf5905edc951..65506b76a3fb4 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -23,8 +23,8 @@ from vllm.utils import async_numpy_to_tensor try: - from flashinfer.sampling import (top_k_top_p_sampling_from_probs as - flashinfer_top_k_top_p_sampling) + from flashinfer.sampling import ( + top_k_top_p_sampling_from_probs as flashinfer_top_k_top_p_sampling) except ImportError: flashinfer_top_k_top_p_sampling = None diff --git a/vllm/utils.py b/vllm/utils.py index eb46936c1db78..f009ade696475 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -668,7 +668,8 @@ def make_tensor_with_pad( def async_numpy_to_tensor(x: npt.NDArray, device: Union[str, torch.device]): """ - Make a tensor from a numpy array asynchronously. Use pinned memory if possible. + Make a tensor from a numpy array asynchronously. Use pinned memory + if possible. """ t = torch.from_numpy(x) if is_pin_memory_available(): From 98d372e1538596ca3ced16a6b9eab1b51748c86e Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Mon, 5 Aug 2024 17:33:54 +0800 Subject: [PATCH 06/22] Tests fix --- tests/samplers/test_sampler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 4a85075bc231d..38e59fbe04ebf 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -627,7 +627,8 @@ def mock_sample(probs, *args, **kwargs): # top-k and top-p is only calculated when flashinfer kernel is not available with patch("vllm.model_executor.layers.sampler._sample", mock_sample), \ - patch("vllm.model_executor.layers.sampler.HAS_FLASHINFER", False): + patch("vllm.model_executor.layers.sampler." 
+ "flashinfer_top_k_top_p_sampling", None): sampler(logits=fake_logits, sampling_metadata=sampling_metadata) assert sample_probs is not None From 0a8be18e52e0443477b66e5b34739ae5c1262220 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Mon, 5 Aug 2024 17:41:11 +0800 Subject: [PATCH 07/22] Fix mypy --- vllm/model_executor/layers/sampler.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 65506b76a3fb4..7411689403689 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -609,6 +609,9 @@ def _sample_with_torch( sampling_params = seq_group.sampling_params max_best_of_in_batch = max(max_best_of_in_batch, sampling_params.best_of) + seq_groups_arg = (None if sampling_type == SamplingType.RANDOM else + seq_groups) + if flashinfer_top_k_top_p_sampling is not None: multinomial_samples[ sampling_type] = _top_k_top_p_multinomial_with_flashinfer( @@ -616,17 +619,13 @@ def _sample_with_torch( sampling_tensors.top_ks[long_sample_indices], sampling_tensors.top_ps[long_sample_indices], max_best_of_in_batch, - seq_groups - if sampling_type == SamplingType.RANDOM_SEED else None, + seq_groups_arg, ) else: - seeded_args = {} if sampling_type == SamplingType.RANDOM else { - "seq_groups": seq_groups, - } - multinomial_samples[sampling_type] = _multinomial( - probs[long_sample_indices], max_best_of_in_batch, - **seeded_args) + probs[long_sample_indices], + max_best_of_in_batch, + seq_groups=seq_groups_arg) if sampled_token_ids_tensor is not None: # Store sampled tokens in output tensor. From f1706466a787bb41480211c972cf3d2a76d819e8 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Mon, 5 Aug 2024 17:49:24 +0800 Subject: [PATCH 08/22] Add test for flashinfer sampler --- .buildkite/test-pipeline.yaml | 6 +++++- vllm/model_executor/layers/sampler.py | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 573c3740f0bbb..c40677ccc5dd8 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -179,7 +179,11 @@ steps: - label: Samplers Test #mirror_hardwares: [amd] - command: pytest -v -s samplers + command: + - pytest -v -s samplers + # Test for flashinfer samplers + - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl + - pytest -v -s samplers - label: LogitsProcessor Test mirror_hardwares: [amd] diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 7411689403689..1bc9c8552328f 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -23,8 +23,8 @@ from vllm.utils import async_numpy_to_tensor try: - from flashinfer.sampling import ( - top_k_top_p_sampling_from_probs as flashinfer_top_k_top_p_sampling) + from flashinfer.sampling import (top_k_top_p_sampling_from_probs as + flashinfer_top_k_top_p_sampling) except ImportError: flashinfer_top_k_top_p_sampling = None From 88c8a985efa0fcdca5c8a0dd560c8ee2df4394b2 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Mon, 5 Aug 2024 17:59:35 +0800 Subject: [PATCH 09/22] Suppress yapf on import --- vllm/model_executor/layers/sampler.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 1bc9c8552328f..d88e24bf7c279 100644 --- 
a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -23,8 +23,11 @@ from vllm.utils import async_numpy_to_tensor try: - from flashinfer.sampling import (top_k_top_p_sampling_from_probs as - flashinfer_top_k_top_p_sampling) + # yapf: disable + from flashinfer.sampling import ( + top_k_top_p_sampling_from_probs as flashinfer_top_k_top_p_sampling) + + # yapf: enable except ImportError: flashinfer_top_k_top_p_sampling = None From c404cd56399ce9ced2af951f3b62891c8f455492 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Mon, 5 Aug 2024 18:02:36 +0800 Subject: [PATCH 10/22] Fix pipeline --- .buildkite/test-pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c40677ccc5dd8..f8ff1426666a5 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -179,7 +179,7 @@ steps: - label: Samplers Test #mirror_hardwares: [amd] - command: + commands: - pytest -v -s samplers # Test for flashinfer samplers - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl From c361a952a3818fdf625b9a593a5bd7157032547a Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Tue, 6 Aug 2024 17:56:54 +0800 Subject: [PATCH 11/22] Change back to torch generator, add env flags --- .buildkite/test-pipeline.yaml | 4 +--- vllm/envs.py | 5 ++++ vllm/model_executor/layers/sampler.py | 30 ++++++++++-------------- vllm/model_executor/sampling_metadata.py | 15 ++++++------ 4 files changed, 25 insertions(+), 29 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index f8ff1426666a5..55dc5dcd3b872 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -181,9 +181,7 @@ steps: #mirror_hardwares: [amd] commands: - pytest -v -s samplers - # Test for flashinfer samplers - - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl - - pytest -v -s samplers + - VLLM_NO_FLASHINFER_SAMPLER=1 pytest -v -s samplers - label: LogitsProcessor Test mirror_hardwares: [amd] diff --git a/vllm/envs.py b/vllm/envs.py index 9bcb26f8e5a64..dfa108510f833 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -28,6 +28,7 @@ VLLM_LOGGING_CONFIG_PATH: Optional[str] = None VLLM_TRACE_FUNCTION: int = 0 VLLM_ATTENTION_BACKEND: Optional[str] = None + VLLM_NO_FLASHINFER_SAMPLER: bool = False VLLM_PP_LAYER_PARTITION: Optional[str] = None VLLM_CPU_KVCACHE_SPACE: int = 0 VLLM_CPU_OMP_THREADS_BIND: str = "" @@ -238,6 +239,10 @@ def get_default_config_root(): "VLLM_ATTENTION_BACKEND": lambda: os.getenv("VLLM_ATTENTION_BACKEND", None), + # If set, vllm will not use flashinfer sampler + "VLLM_NO_FLASHINFER_SAMPLER": + lambda: bool(os.getenv("VLLM_NO_FLASHINFER_SAMPLER", 0)), + # Pipeline stage partition strategy "VLLM_PP_LAYER_PARTITION": lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None), diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index d88e24bf7c279..03ca2f8bd5c59 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -1,10 +1,10 @@ """A layer that samples the next tokens from the model's outputs.""" import itertools import warnings +from importlib.util import find_spec from math import inf from typing import Dict, List, Optional, Tuple -import numpy as np import torch import torch.nn as nn @@ -13,6 +13,7 @@ if 
HAS_TRITON: from vllm.model_executor.layers.ops.sample import sample as sample_triton +import vllm.envs as envs from vllm.model_executor.sampling_metadata import (SamplingMetadata, SamplingTensors, SequenceGroupToSample) @@ -20,15 +21,14 @@ from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, PromptLogprobs, SampleLogprobs, SamplerOutput, SequenceOutput) -from vllm.utils import async_numpy_to_tensor -try: +if not envs.VLLM_NO_FLASHINFER_SAMPLER and find_spec("flashinfer"): # yapf: disable from flashinfer.sampling import ( top_k_top_p_sampling_from_probs as flashinfer_top_k_top_p_sampling) # yapf: enable -except ImportError: +else: flashinfer_top_k_top_p_sampling = None # (num_token_ids, num_parent_ids) per sequence group. @@ -489,20 +489,18 @@ def _multinomial( ) -> torch.Tensor: if num_samples > 1 and not is_fallback: probs = probs.repeat_interleave(num_samples, dim=0) + q = torch.empty_like(probs) if seq_groups is None: - q = torch.empty_like(probs) q.exponential_() else: - q_ = np.empty(probs.shape) sample_idx = 0 for seq_group in seq_groups: seq_ids = seq_group.seq_ids stride = len(seq_ids) * num_samples assert seq_group.generator is not None - q_[sample_idx:sample_idx + stride] = \ - seq_group.generator.exponential(size=(stride, q_.shape[1])) + q[sample_idx:sample_idx + + stride].exponential_(generator=seq_group.generator) sample_idx += stride - q = async_numpy_to_tensor(q_, probs.device) return probs.div_(q).argmax(dim=1).view(-1, num_samples) @@ -515,23 +513,19 @@ def _top_k_top_p_multinomial_with_flashinfer( top_ks = top_ks.repeat_interleave(num_samples) top_ps = top_ps.repeat_interleave(num_samples) batch_size = probs.shape[0] + uniform_samples = torch.empty((max_top_k_round, batch_size), + device=probs.device) if seq_groups is None: - uniform_samples = torch.rand((max_top_k_round, batch_size), - device=probs.device) + uniform_samples.random_() else: - uniform_samples_cpu = np.empty((max_top_k_round, batch_size), - dtype=np.float32) sample_idx = 0 for seq_group in seq_groups: seq_ids = seq_group.seq_ids stride = len(seq_ids) * num_samples assert seq_group.generator is not None - uniform_samples_cpu[:, sample_idx:sample_idx + stride] = \ - seq_group.generator.random((max_top_k_round, stride), - dtype=np.float32) + uniform_samples[:, sample_idx:sample_idx + + stride].random_(generator=seq_group.generator) sample_idx += stride - uniform_samples = async_numpy_to_tensor(uniform_samples_cpu, - probs.device) batch_next_token_ids, success = flashinfer_top_k_top_p_sampling( probs, uniform_samples, diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index b3fb7e657df1b..0562d26f2318a 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -3,7 +3,6 @@ from dataclasses import dataclass from typing import Dict, List, Optional, Tuple -import numpy as np import torch from vllm.sampling_params import SamplingParams, SamplingType @@ -41,7 +40,7 @@ class SequenceGroupToSample: # prefill is enabled. query_len: Optional[int] # A random number generator for sampling. - generator: Optional[np.random.Generator] + generator: Optional[torch.Generator] # True if the sequence group is in prefill stage. False if it is in a # decode stage. 
is_prompt: bool @@ -214,19 +213,19 @@ def _prepare_seq_groups( seq_ids = list(seq_group_metadata.seq_data.keys()) sampling_params = seq_group_metadata.sampling_params is_prompt = seq_group_metadata.is_prompt - generator: Optional[np.random.Generator] = None + generator: Optional[torch.Generator] = None # If the current seq group is in decode stage, it is None. seq_len: Optional[int] = None query_len: Optional[int] = None prompt_logprob_indices: List[int] = [] sample_indices: List[int] = [] do_sample = seq_group_metadata.do_sample + seed = sampling_params.seed if seq_group_metadata.is_prompt: - if sampling_params.seed is not None: - generator = np.random.default_rng(seed=sampling_params.seed) - if generators is not None: - generators[seq_group_metadata.request_id] = generator + if seed is not None and generators is not None: + generator = torch.Generator(device=device).manual_seed(seed) + generators[seq_group_metadata.request_id] = generator num_prompts += 1 num_prefill_sample = len(seq_ids) @@ -243,7 +242,7 @@ def _prepare_seq_groups( prompt_logprob_len = 0 sample_len = len(seq_ids) if do_sample else 0 - if sampling_params.seed is not None and generators is not None: + if seed is not None and generators is not None: generator = generators.get(seq_group_metadata.request_id) # Update indices to select from the model output. From 99f7eccb163b6f6184ff0960a1ab0b5db136c96f Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Wed, 7 Aug 2024 13:26:21 +0800 Subject: [PATCH 12/22] rename env for flashinfer, rollback changes in utils --- vllm/envs.py | 6 +++--- vllm/model_executor/layers/sampler.py | 2 +- vllm/utils.py | 17 +++++------------ 3 files changed, 9 insertions(+), 16 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 77e1e4002bcd3..b58225cf14caa 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -29,7 +29,7 @@ VLLM_LOGGING_CONFIG_PATH: Optional[str] = None VLLM_TRACE_FUNCTION: int = 0 VLLM_ATTENTION_BACKEND: Optional[str] = None - VLLM_NO_FLASHINFER_SAMPLER: bool = False + VLLM_DISABLE_FLASHINFER_SAMPLER: bool = False VLLM_PP_LAYER_PARTITION: Optional[str] = None VLLM_CPU_KVCACHE_SPACE: int = 0 VLLM_CPU_OMP_THREADS_BIND: str = "" @@ -252,8 +252,8 @@ def get_default_config_root(): lambda: os.getenv("VLLM_ATTENTION_BACKEND", None), # If set, vllm will not use flashinfer sampler - "VLLM_NO_FLASHINFER_SAMPLER": - lambda: bool(os.getenv("VLLM_NO_FLASHINFER_SAMPLER", 0)), + "VLLM_DISABLE_FLASHINFER_SAMPLER": + lambda: bool(os.getenv("VLLM_DISABLE_FLASHINFER_SAMPLER", 0)), # Pipeline stage partition strategy "VLLM_PP_LAYER_PARTITION": diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 03ca2f8bd5c59..6f82862f2acd5 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -22,7 +22,7 @@ PromptLogprobs, SampleLogprobs, SamplerOutput, SequenceOutput) -if not envs.VLLM_NO_FLASHINFER_SAMPLER and find_spec("flashinfer"): +if not envs.VLLM_DISABLE_FLASHINFER_SAMPLER and find_spec("flashinfer"): # yapf: disable from flashinfer.sampling import ( top_k_top_p_sampling_from_probs as flashinfer_top_k_top_p_sampling) diff --git a/vllm/utils.py b/vllm/utils.py index f2a7b228c8077..7e50cc314d554 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -666,7 +666,7 @@ def make_tensor_with_pad( pad: T, dtype: torch.dtype, *, - device: Union[str, torch.device], + device: Optional[Union[str, torch.device]] = None, max_len: Optional[int] = None, pin_memory: bool = False, ) -> torch.Tensor: @@ -679,18 +679,11 @@ def 
make_tensor_with_pad( np_dtype = TORCH_DTYPE_TO_NUMPY_DTYPE[dtype] padded_x = make_ndarray_with_pad(x, pad, np_dtype, max_len=max_len) - return async_numpy_to_tensor(padded_x, device) + tensor = torch.from_numpy(padded_x).to(device) + if pin_memory: + tensor = tensor.pin_memory() - -def async_numpy_to_tensor(x: npt.NDArray, device: Union[str, torch.device]): - """ - Make a tensor from a numpy array asynchronously. Use pinned memory - if possible. - """ - t = torch.from_numpy(x) - if is_pin_memory_available(): - t = t.pin_memory() - return t.to(device, non_blocking=True) + return tensor def async_tensor_h2d( From 7e03711b14f78e292d83103591cf869791e83384 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Wed, 7 Aug 2024 13:28:03 +0800 Subject: [PATCH 13/22] rollback changes to utils --- vllm/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/utils.py b/vllm/utils.py index 7e50cc314d554..51bd72977a226 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -666,8 +666,8 @@ def make_tensor_with_pad( pad: T, dtype: torch.dtype, *, - device: Optional[Union[str, torch.device]] = None, max_len: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, pin_memory: bool = False, ) -> torch.Tensor: """ From 64160465b50f3f059607c873a434074123798f73 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Thu, 8 Aug 2024 10:01:59 +0800 Subject: [PATCH 14/22] rename env --- .buildkite/test-pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 7d9a90f18a1d7..ccafd406d5ae6 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -181,7 +181,7 @@ steps: - tests/samplers commands: - pytest -v -s samplers - - VLLM_NO_FLASHINFER_SAMPLER=1 pytest -v -s samplers + - VLLM_DISABLE_FLASHINFER_SAMPLER=1 pytest -v -s samplers - label: LogitsProcessor Test # 5min mirror_hardwares: [amd] From fdc23a35214fd569b18ba79048415db3fd511a97 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Thu, 8 Aug 2024 11:35:09 +0800 Subject: [PATCH 15/22] add top_k_top_p when fallback --- vllm/model_executor/layers/sampler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 6f82862f2acd5..af9378ead4d7a 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -535,6 +535,7 @@ def _top_k_top_p_multinomial_with_flashinfer( if not success.all(): warnings.warn("Sampling with FlashInfer failed, fallback.", stacklevel=2) + probs = _apply_top_k_top_p(probs, top_ps, top_ks) return _multinomial(probs, num_samples, seq_groups, is_fallback=True) return batch_next_token_ids.view(-1, num_samples) From b97c911e7bf824d4e0714d7ceed8eed5ad207290 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Mon, 12 Aug 2024 15:00:47 +0800 Subject: [PATCH 16/22] Adapt flashinfer 0.1.4 --- tests/samplers/test_sampler.py | 32 +++++++++++++++++++++++++++ vllm/envs.py | 2 +- vllm/model_executor/layers/sampler.py | 18 ++++++++------- 3 files changed, 43 insertions(+), 9 deletions(-) diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 38e59fbe04ebf..e436096484a1a 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -7,6 +7,7 @@ import torch from transformers import GenerationConfig, GenerationMixin +import vllm.envs as envs from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.sampling_metadata import SamplingMetadata 
from vllm.model_executor.utils import set_random_seed @@ -639,6 +640,37 @@ def mock_sample(probs, *args, **kwargs): assert torch.equal(hf_probs.eq(0), sample_probs.eq(0)) +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_flashinfer_fallback(seed: int, device: str): + if envs.VLLM_DISABLE_FLASHINFER_SAMPLER: + pytest.skip("Flashinfer sampler is disabled") + + set_random_seed(seed) + torch.set_default_device(device) + batch_size = random.randint(1, 256) + _, fake_logits, sampler = _prepare_test(batch_size) + + def failing_flashinfer_sampling(*_args, **_kwargs): + return None, torch.zeros(batch_size, device=device, dtype=torch.int32) + + sampling_params = SamplingParams( + temperature=1.0, + n=random.randint(1, 10), + seed=random.randint(0, 10000), + ) + sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + + with patch( + "vllm.model_executor.layers.sampler." + "flashinfer_top_k_top_p_sampling", failing_flashinfer_sampling): + fallback_sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + + assert sampler_output == fallback_sampler_output + + @pytest.mark.parametrize("device", CUDA_DEVICES) def test_sampler_repetition_penalty_mixed(device: str): diff --git a/vllm/envs.py b/vllm/envs.py index b58225cf14caa..523f1600ac4e3 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -253,7 +253,7 @@ def get_default_config_root(): # If set, vllm will not use flashinfer sampler "VLLM_DISABLE_FLASHINFER_SAMPLER": - lambda: bool(os.getenv("VLLM_DISABLE_FLASHINFER_SAMPLER", 0)), + lambda: bool(int(os.getenv("VLLM_DISABLE_FLASHINFER_SAMPLER", "0"))), # Pipeline stage partition strategy "VLLM_PP_LAYER_PARTITION": diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index af9378ead4d7a..a5dd69872542e 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -23,6 +23,7 @@ SequenceOutput) if not envs.VLLM_DISABLE_FLASHINFER_SAMPLER and find_spec("flashinfer"): + import flashinfer.sampling # yapf: disable from flashinfer.sampling import ( top_k_top_p_sampling_from_probs as flashinfer_top_k_top_p_sampling) @@ -485,9 +486,8 @@ def _multinomial( probs: torch.Tensor, num_samples: int, seq_groups: Optional[List[SequenceGroupToSample]] = None, - is_fallback: bool = False, ) -> torch.Tensor: - if num_samples > 1 and not is_fallback: + if num_samples > 1: probs = probs.repeat_interleave(num_samples, dim=0) q = torch.empty_like(probs) if seq_groups is None: @@ -516,7 +516,7 @@ def _top_k_top_p_multinomial_with_flashinfer( uniform_samples = torch.empty((max_top_k_round, batch_size), device=probs.device) if seq_groups is None: - uniform_samples.random_() + uniform_samples.uniform_() else: sample_idx = 0 for seq_group in seq_groups: @@ -524,7 +524,7 @@ def _top_k_top_p_multinomial_with_flashinfer( stride = len(seq_ids) * num_samples assert seq_group.generator is not None uniform_samples[:, sample_idx:sample_idx + - stride].random_(generator=seq_group.generator) + stride].uniform_(generator=seq_group.generator) sample_idx += stride batch_next_token_ids, success = flashinfer_top_k_top_p_sampling( probs, @@ -533,10 +533,12 @@ def _top_k_top_p_multinomial_with_flashinfer( top_ps, ) if not success.all(): - warnings.warn("Sampling with FlashInfer failed, fallback.", - stacklevel=2) - probs = _apply_top_k_top_p(probs, top_ps, top_ks) - return _multinomial(probs, num_samples, seq_groups, is_fallback=True) + 
warnings.warn("FlashInfer rejection sampling failed, fallback.", + stacklevel=1) + probs = flashinfer.sampling.top_k_renorm_prob(probs, top_ks) + probs = flashinfer.sampling.top_p_renorm_prob(probs, top_ps) + batch_next_token_ids = flashinfer.sampling.sampling_from_probs( + probs, uniform_samples[0]) return batch_next_token_ids.view(-1, num_samples) From f8d709344753efb3a9be6ebd745e8dbb1e4d6724 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Mon, 12 Aug 2024 15:01:08 +0800 Subject: [PATCH 17/22] Revert changes to sampling_metadata --- vllm/model_executor/sampling_metadata.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 0562d26f2318a..015e85b4ca81d 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -220,12 +220,13 @@ def _prepare_seq_groups( prompt_logprob_indices: List[int] = [] sample_indices: List[int] = [] do_sample = seq_group_metadata.do_sample - seed = sampling_params.seed if seq_group_metadata.is_prompt: - if seed is not None and generators is not None: - generator = torch.Generator(device=device).manual_seed(seed) - generators[seq_group_metadata.request_id] = generator + if sampling_params.seed is not None: + generator = torch.Generator(device=device).manual_seed( + sampling_params.seed) + if generators is not None: + generators[seq_group_metadata.request_id] = generator num_prompts += 1 num_prefill_sample = len(seq_ids) @@ -242,7 +243,7 @@ def _prepare_seq_groups( prompt_logprob_len = 0 sample_len = len(seq_ids) if do_sample else 0 - if seed is not None and generators is not None: + if sampling_params.seed is not None and generators is not None: generator = generators.get(seq_group_metadata.request_id) # Update indices to select from the model output. From 2d7e5c340a7682fe7332f66df79ad9de1d6b9ce3 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Mon, 12 Aug 2024 17:14:41 +0800 Subject: [PATCH 18/22] Change flashinfer 0.1.2 to 0.1.4 in test --- .buildkite/test-pipeline.yaml | 8 ++++---- Dockerfile | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index ccafd406d5ae6..c20573a7021d6 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -61,7 +61,7 @@ steps: - tests/basic_correctness commands: # This flashinfer installation will fail on AMD ROCm, so it is set as optional. 
- - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl || true + - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.4/flashinfer-0.1.4+cu121torch2.4-cp310-cp310-linux_x86_64.whl || true - pytest -v -s basic_correctness/test_basic_correctness.py - pytest -v -s basic_correctness/test_cpu_offload.py - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py @@ -156,7 +156,7 @@ steps: - vllm/ - tests/models commands: - - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl + - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.4/flashinfer-0.1.4+cu121torch2.4-cp310-cp310-linux_x86_64.whl - pytest -v -s models -m \"not vlm\" - label: Vision Language Models Test # 42min @@ -213,7 +213,7 @@ steps: - vllm/attention - tests/kernels commands: - - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl + - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.4/flashinfer-0.1.4+cu121torch2.4-cp310-cp310-linux_x86_64.whl - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT parallelism: 4 @@ -331,7 +331,7 @@ steps: # NOTE: don't test llama model here, it seems hf implementation is buggy # see https://github.com/vllm-project/vllm/pull/5689 for details - pytest -v -s distributed/test_custom_all_reduce.py - - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl + - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.4/flashinfer-0.1.4+cu121torch2.4-cp310-cp310-linux_x86_64.whl - TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py - pytest -v -s -x lora/test_mixtral.py diff --git a/Dockerfile b/Dockerfile index 49aaea2949ac6..c13cb5c7e7a95 100644 --- a/Dockerfile +++ b/Dockerfile @@ -194,7 +194,7 @@ RUN --mount=type=bind,from=mamba-builder,src=/usr/src/mamba,target=/usr/src/mamb python3 -m pip install /usr/src/mamba/*.whl --no-cache-dir RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl + python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.4/flashinfer-0.1.4+cu121torch2.4-cp310-cp310-linux_x86_64.whl #################### vLLM installation IMAGE #################### From f8931107e176cfef2813109e32f3e7f2ddbe6193 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Thu, 15 Aug 2024 13:49:47 +0800 Subject: [PATCH 19/22] Disable flashinfer in GPTQ reproduce test --- tests/basic_correctness/test_cpu_offload.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/basic_correctness/test_cpu_offload.py b/tests/basic_correctness/test_cpu_offload.py index f0f09ee63c0e6..563e27129c207 100644 --- a/tests/basic_correctness/test_cpu_offload.py +++ b/tests/basic_correctness/test_cpu_offload.py @@ -1,3 +1,5 @@ +from unittest.mock import patch + import pytest from tests.quantization.utils import is_quant_method_supported @@ -29,9 +31,12 @@ def test_cpu_offload_gptq(): compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", [], 
["--cpu-offload-gb", "1"]) # Test GPTQ - compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", - ["--quantization", "gptq"], - ["--quantization", "gptq", "--cpu-offload-gb", "1"]) + # The model output logits has small variance between runs, which do not play + # well with the flashinfer sampler. + with patch.dict("os.environ", {"VLLM_DISABLE_FLASHINFER_SAMPLER": "1"}): + compare_two_settings( + "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", ["--quantization", "gptq"], + ["--quantization", "gptq", "--cpu-offload-gb", "1"]) @pytest.mark.skipif(not is_quant_method_supported("awq_marlin"), From e4cfcfcef03d982216a48bdfe673ab9c56a0a185 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Thu, 15 Aug 2024 18:36:52 +0800 Subject: [PATCH 20/22] Disable flashinfer sampler in distributed test --- .buildkite/test-pipeline.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index de517ddc8c4df..d7abe0728513a 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -274,7 +274,7 @@ steps: commands: - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py + - VLLM_MULTI_NODE=1 VLLM_DISABLE_FLASHINFER_SAMPLER=1 pytest -v -s distributed/test_pipeline_parallel.py - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py @@ -302,7 +302,7 @@ steps: - vllm/ - tests/distributed/test_pipeline_parallel commands: - - pytest -v -s distributed/test_pipeline_parallel.py + - VLLM_DISABLE_FLASHINFER_SAMPLER=1 pytest -v -s distributed/test_pipeline_parallel.py - label: LoRA Long Context (Distributed) # 11min # This test runs llama 13B, so it is required to run on 4 GPUs. 
From 0ec8b614e9940fb0d0291c964c104859a2324162 Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Fri, 16 Aug 2024 09:38:37 +0800 Subject: [PATCH 21/22] Disable flashinfer sampler by default --- .buildkite/test-pipeline.yaml | 6 +++--- tests/basic_correctness/test_cpu_offload.py | 11 +++-------- tests/samplers/test_sampler.py | 2 +- vllm/envs.py | 6 +++--- vllm/model_executor/layers/sampler.py | 2 +- 5 files changed, 11 insertions(+), 16 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index dc8b4d03be4af..7d62713735bae 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -184,7 +184,7 @@ steps: - tests/samplers commands: - pytest -v -s samplers - - VLLM_DISABLE_FLASHINFER_SAMPLER=1 pytest -v -s samplers + - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers - label: LogitsProcessor Test # 5min mirror_hardwares: [amd] @@ -279,7 +279,7 @@ steps: commands: - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py - - VLLM_MULTI_NODE=1 VLLM_DISABLE_FLASHINFER_SAMPLER=1 pytest -v -s distributed/test_pipeline_parallel.py + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py @@ -309,7 +309,7 @@ steps: - vllm/ - tests/distributed/test_pipeline_parallel commands: - - VLLM_DISABLE_FLASHINFER_SAMPLER=1 pytest -v -s distributed/test_pipeline_parallel.py + - pytest -v -s distributed/test_pipeline_parallel.py - label: LoRA Long Context (Distributed) # 11min # This test runs llama 13B, so it is required to run on 4 GPUs. diff --git a/tests/basic_correctness/test_cpu_offload.py b/tests/basic_correctness/test_cpu_offload.py index 563e27129c207..f0f09ee63c0e6 100644 --- a/tests/basic_correctness/test_cpu_offload.py +++ b/tests/basic_correctness/test_cpu_offload.py @@ -1,5 +1,3 @@ -from unittest.mock import patch - import pytest from tests.quantization.utils import is_quant_method_supported @@ -31,12 +29,9 @@ def test_cpu_offload_gptq(): compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", [], ["--cpu-offload-gb", "1"]) # Test GPTQ - # The model output logits has small variance between runs, which do not play - # well with the flashinfer sampler. 
- with patch.dict("os.environ", {"VLLM_DISABLE_FLASHINFER_SAMPLER": "1"}): - compare_two_settings( - "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", ["--quantization", "gptq"], - ["--quantization", "gptq", "--cpu-offload-gb", "1"]) + compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", + ["--quantization", "gptq"], + ["--quantization", "gptq", "--cpu-offload-gb", "1"]) @pytest.mark.skipif(not is_quant_method_supported("awq_marlin"), diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index fc0048ca955ee..06c3b3b7148b4 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -643,7 +643,7 @@ def mock_sample(probs, *args, **kwargs): @pytest.mark.parametrize("seed", RANDOM_SEEDS) @pytest.mark.parametrize("device", CUDA_DEVICES) def test_flashinfer_fallback(seed: int, device: str): - if envs.VLLM_DISABLE_FLASHINFER_SAMPLER: + if not envs.VLLM_USE_FLASHINFER_SAMPLER: pytest.skip("Flashinfer sampler is disabled") set_random_seed(seed) diff --git a/vllm/envs.py b/vllm/envs.py index 1fd1a33607e57..92ed243899d47 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -30,7 +30,7 @@ VLLM_LOGGING_CONFIG_PATH: Optional[str] = None VLLM_TRACE_FUNCTION: int = 0 VLLM_ATTENTION_BACKEND: Optional[str] = None - VLLM_DISABLE_FLASHINFER_SAMPLER: bool = False + VLLM_USE_FLASHINFER_SAMPLER: bool = False VLLM_PP_LAYER_PARTITION: Optional[str] = None VLLM_CPU_KVCACHE_SPACE: int = 0 VLLM_CPU_OMP_THREADS_BIND: str = "" @@ -258,8 +258,8 @@ def get_default_config_root(): lambda: os.getenv("VLLM_ATTENTION_BACKEND", None), # If set, vllm will not use flashinfer sampler - "VLLM_DISABLE_FLASHINFER_SAMPLER": - lambda: bool(int(os.getenv("VLLM_DISABLE_FLASHINFER_SAMPLER", "0"))), + "VLLM_USE_FLASHINFER_SAMPLER": + lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_SAMPLER", "0"))), # Pipeline stage partition strategy "VLLM_PP_LAYER_PARTITION": diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index b4f2bfde1c4b1..7344d59e988f0 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -22,7 +22,7 @@ PromptLogprobs, SampleLogprobs, SamplerOutput, SequenceOutput) -if not envs.VLLM_DISABLE_FLASHINFER_SAMPLER and find_spec("flashinfer"): +if envs.VLLM_USE_FLASHINFER_SAMPLER and find_spec("flashinfer"): import flashinfer.sampling # yapf: disable from flashinfer.sampling import ( From 9eaea5cf1f990f6b16e878a88044c90e322c045d Mon Sep 17 00:00:00 2001 From: Peng Guanwen Date: Sat, 17 Aug 2024 09:11:04 +0800 Subject: [PATCH 22/22] Update vllm/envs.py Co-authored-by: Michael Goin --- vllm/envs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/envs.py b/vllm/envs.py index 92ed243899d47..115ead01f537d 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -257,7 +257,7 @@ def get_default_config_root(): "VLLM_ATTENTION_BACKEND": lambda: os.getenv("VLLM_ATTENTION_BACKEND", None), - # If set, vllm will not use flashinfer sampler + # If set, vllm will use flashinfer sampler "VLLM_USE_FLASHINFER_SAMPLER": lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_SAMPLER", "0"))),