[Kernel][Model] Varlen prefill + Prefill chunking support for mamba kernels and Jamba model (vllm-project#8533)

mzusman authored and liuyanyi committed Oct 6, 2024
1 parent 8b228f5 commit 56e8cea
Showing 13 changed files with 1,176 additions and 894 deletions.
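The headline change is that the Mamba causal-conv1d and selective-scan kernels now accept a variable-length ("varlen") prefill batch: tokens from several requests are packed into one flat token dimension and described by a query_start_loc tensor rather than a dense [batch, seqlen] layout. A minimal host-side sketch of the cumulative-offset convention assumed here (illustrative C++ only, not code from this commit):

// Toy illustration of a packed varlen batch, assuming query_start_loc holds
// cumulative start offsets (length batch_size + 1). Not taken from the kernels.
#include <cstdio>
#include <vector>

int main() {
  // Three prompts of lengths 4, 1 and 7 packed into a single token dimension.
  std::vector<int> query_start_loc = {0, 4, 5, 12};

  for (size_t i = 0; i + 1 < query_start_loc.size(); ++i) {
    int start = query_start_loc[i];
    int end = query_start_loc[i + 1];
    std::printf("sequence %zu: tokens [%d, %d), length %d\n", i, start, end, end - start);
  }
  return 0;
}

Together with has_initial_state, this is what allows a chunked prefill to resume a sequence from its cached state instead of always starting from scratch.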
527 changes: 211 additions & 316 deletions csrc/mamba/causal_conv1d/causal_conv1d.cu

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions csrc/mamba/causal_conv1d/causal_conv1d.h
@@ -24,6 +24,7 @@ struct ConvParamsBase {
 index_t out_c_stride;
 index_t out_l_stride;

+int conv_state_len;
 index_t conv_state_batch_stride;
 index_t conv_state_c_stride;
 index_t conv_state_l_stride;
@@ -35,6 +36,10 @@ struct ConvParamsBase {
 void *__restrict__ out_ptr;

 void *__restrict__ conv_state_ptr;
+void *__restrict__ query_start_loc_ptr;
+void *__restrict__ has_initial_state_ptr;
+void *__restrict__ cache_indices_ptr;
+int32_t *__restrict__ cache_seqlens;

 // For the continuous batching case. Makes it so that the mamba state for
 // the current batch doesn't need to be a contiguous tensor.
@@ -52,6 +57,11 @@ struct ConvParamsBase {
 index_t final_states_batch_stride;
 index_t final_states_l_stride;
 index_t final_states_c_stride;
+
+void * conv_states_ptr;
+index_t conv_states_batch_stride;
+index_t conv_states_l_stride;
+index_t conv_states_c_stride;
 };


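The fields added to ConvParamsBase above carry the per-request cache bookkeeping: cache_indices maps each batch entry to a slot in the conv-state cache, has_initial_state marks whether that slot already holds a valid state (e.g. a later chunk of the same prompt) or the sequence starts from zeros, and the conv_states_* members describe the cache tensor itself. A simplified, hypothetical sketch of that lookup; the names and shapes here are assumptions, and the real device-side logic lives in causal_conv1d.cu:

// Schematic lookup of a per-sequence initial conv state (1-D floats instead of
// real per-channel state windows). Illustrative only.
#include <cstdio>
#include <vector>

struct ConvStateCache {
  int conv_state_len;                     // length of the cached window per sequence (simplified)
  std::vector<std::vector<float>> slots;  // one cached state per slot
};

std::vector<float> initial_state(const ConvStateCache& cache,
                                 const std::vector<int>& cache_indices,
                                 const std::vector<char>& has_initial_state,
                                 int batch_idx) {
  // No valid cached state: this sequence's convolution starts from zeros.
  if (!has_initial_state[batch_idx])
    return std::vector<float>(cache.conv_state_len, 0.0f);
  // Otherwise gather the cache slot this batch entry points at.
  return cache.slots[cache_indices[batch_idx]];
}

int main() {
  ConvStateCache cache{3, {{1, 2, 3}, {4, 5, 6}}};
  std::vector<int> cache_indices = {1, 0};       // batch entry -> cache slot
  std::vector<char> has_initial_state = {1, 0};  // second entry starts fresh
  for (int b = 0; b < 2; ++b) {
    auto s = initial_state(cache, cache_indices, has_initial_state, b);
    std::printf("batch %d initial state: %g %g %g\n", b, s[0], s[1], s[2]);
  }
  return 0;
}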
29 changes: 9 additions & 20 deletions csrc/mamba/mamba_ssm/selective_scan.h
@@ -54,10 +54,14 @@ struct SSMParamsBase {
 void *__restrict__ delta_ptr;
 void *__restrict__ delta_bias_ptr;
 void *__restrict__ out_ptr;
-void *__restrict__ x_ptr;
+void *__restrict__ ssm_states_ptr;
 void *__restrict__ z_ptr;
 void *__restrict__ out_z_ptr;
-void *__restrict__ index_ptr;
+
+void *__restrict__ query_start_loc_ptr;
+void *__restrict__ cache_indices_ptr;
+void *__restrict__ has_initial_state_ptr;
+
 };


@@ -201,7 +205,7 @@ inline __device__ void load_input(typename Ktraits::input_t *u,
 typename Ktraits::input_t (&u_vals)[Ktraits::kNItems],
 typename Ktraits::BlockLoadT::TempStorage &smem_load,
 int seqlen) {
-if constexpr (Ktraits::kIsEvenLen) {
+if constexpr (Ktraits::kIsEvenLen && !Ktraits::kVarlen) {
 auto& smem_load_vec = reinterpret_cast<typename Ktraits::BlockLoadVecT::TempStorage&>(smem_load);
 using vec_t = typename Ktraits::vec_t;
 typename Ktraits::BlockLoadVecT(smem_load_vec).Load(
@@ -217,21 +221,6 @@ inline __device__ void load_input(typename Ktraits::input_t *u,
 }
 }

-template<typename Ktraits>
-inline __device__ void load_index(int *u,
-int (&u_vals)[Ktraits::kNItems],
-typename Ktraits::BlockLoadIndexT::TempStorage &smem_load_index,
-int seqlen) {
-if constexpr (Ktraits::kIsEvenLen) {
-auto& smem_load_index_vec = reinterpret_cast<typename Ktraits::BlockLoadIndexVecT::TempStorage&>(smem_load_index);
-Ktraits::BlockLoadIndexVecT(smem_load_index_vec).Load(
-reinterpret_cast<uint4*>(u),
-reinterpret_cast<uint4(&)[Ktraits::kNLoadsIndex]>(u_vals)
-);
-} else {
-Ktraits::BlockLoadIndexT(smem_load_index).Load(u, u_vals, seqlen, 0);
-}
-}

 template<typename Ktraits>
 inline __device__ void load_weight(typename Ktraits::input_t *Bvar,
@@ -240,7 +229,7 @@ inline __device__ void load_weight(typename Ktraits::input_t *Bvar,
 int seqlen) {
 constexpr int kNItems = Ktraits::kNItems;
 typename Ktraits::input_t B_vals_load[kNItems];
-if constexpr (Ktraits::kIsEvenLen) {
+if constexpr (Ktraits::kIsEvenLen && !Ktraits::kVarlen) {
 auto& smem_load_weight_vec = reinterpret_cast<typename Ktraits::BlockLoadWeightVecT::TempStorage&>(smem_load_weight);
 using vec_t = typename Ktraits::vec_t;
 typename Ktraits::BlockLoadWeightVecT(smem_load_weight_vec).Load(
@@ -263,7 +252,7 @@ inline __device__ void store_output(typename Ktraits::input_t *out,
 typename Ktraits::input_t write_vals[Ktraits::kNItems];
 #pragma unroll
 for (int i = 0; i < Ktraits::kNItems; ++i) { write_vals[i] = out_vals[i]; }
-if constexpr (Ktraits::kIsEvenLen) {
+if constexpr (Ktraits::kIsEvenLen && !Ktraits::kVarlen) {
 auto& smem_store_vec = reinterpret_cast<typename Ktraits::BlockStoreVecT::TempStorage&>(smem_store);
 using vec_t = typename Ktraits::vec_t;
 typename Ktraits::BlockStoreVecT(smem_store_vec).Store(
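The recurring edit in these helpers adds a !Ktraits::kVarlen guard around the vectorized CUB load/store path: that path assumes every block covers a full, evenly sized chunk, which no longer holds once sequences of different lengths share one packed batch, so varlen inputs fall back to the bounds-checked path that honors seqlen. The standalone load_index helper disappears along with the old index_ argument, which the ops.h changes below replace with query_start_loc and cache_indices. A stripped-down sketch of the compile-time dispatch, in plain C++ rather than the actual CUDA/CUB code:

// Plain C++ stand-in for the kIsEvenLen / kVarlen dispatch; the real code uses
// CUB BlockLoad/BlockStore. Illustrative only.
#include <cstdio>

template <bool kIsEvenLen, bool kVarlen>
void load_items(const float* src, float (&dst)[4], int valid_items) {
  if constexpr (kIsEvenLen && !kVarlen) {
    // "Vectorized" path: safe only when all 4 items are known to be valid.
    for (int i = 0; i < 4; ++i) dst[i] = src[i];
  } else {
    // Guarded path: read only valid_items entries, pad the rest with zeros.
    for (int i = 0; i < 4; ++i) dst[i] = (i < valid_items) ? src[i] : 0.0f;
  }
}

int main() {
  float src[4] = {1, 2, 3, 4};
  float a[4], b[4];
  load_items<true, false>(src, a, 4);  // fixed-length batch: fast path
  load_items<true, true>(src, b, 2);   // varlen batch: guarded path, 2 valid items
  std::printf("fast: %g %g %g %g | guarded: %g %g %g %g\n",
              a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
  return 0;
}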
297 changes: 179 additions & 118 deletions csrc/mamba/mamba_ssm/selective_scan_fwd.cu

Large diffs are not rendered by default.

31 changes: 18 additions & 13 deletions csrc/ops.h
@@ -215,25 +215,30 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
 torch::Tensor experts_ids,
 torch::Tensor num_tokens_post_pad);

-std::vector<torch::Tensor> selective_scan_fwd(
-const torch::Tensor& u, const torch::Tensor& delta, const torch::Tensor& A,
-const torch::Tensor& B, const torch::Tensor& C,
-const c10::optional<torch::Tensor>& D_,
-const c10::optional<torch::Tensor>& z_,
-const c10::optional<torch::Tensor>& delta_bias_, bool delta_softplus,
-const c10::optional<torch::Tensor>& index_,
-const c10::optional<torch::Tensor>& x);
+void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta,
+const torch::Tensor& A, const torch::Tensor& B,
+const torch::Tensor& C,
+const c10::optional<torch::Tensor>& D_,
+const c10::optional<torch::Tensor>& z_,
+const c10::optional<torch::Tensor>& delta_bias_,
+bool delta_softplus,
+const c10::optional<torch::Tensor>& query_start_loc,
+const c10::optional<torch::Tensor>& cache_indices,
+const c10::optional<torch::Tensor>& has_initial_state,
+const torch::Tensor& ssm_states);

 at::Tensor causal_conv1d_update(
 const at::Tensor& x, const at::Tensor& conv_state, const at::Tensor& weight,
-const c10::optional<at::Tensor>& bias, bool silu_activation,
-const c10::optional<at::Tensor>& conv_state_indices);
+const c10::optional<at::Tensor>& bias_, bool silu_activation,
+const c10::optional<at::Tensor>& cache_seqlens_,
+const c10::optional<at::Tensor>& conv_state_indices_);

 at::Tensor causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight,
 const c10::optional<at::Tensor>& bias_,
-const c10::optional<at::Tensor>& seq_idx_,
-const c10::optional<at::Tensor>& initial_states_,
-const c10::optional<at::Tensor>& final_states_out_,
+const c10::optional<at::Tensor>& conv_states,
+const c10::optional<at::Tensor>& query_start_loc,
+const c10::optional<at::Tensor>& cache_indices,
+const c10::optional<at::Tensor>& has_initial_state,
 bool silu_activation);

 #ifndef USE_ROCM
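Beyond the new varlen arguments, the selective_scan_fwd declaration changes shape: instead of returning a vector of tensors it now returns void and writes the recurrent state into a caller-provided ssm_states tensor, which fits a chunked prefill where each chunk resumes from the state the previous chunk left behind. A toy sketch of that in-place, carry-the-state-forward pattern (hypothetical names and a made-up recurrence, not the real op):

// Toy recurrence updated in place across prefill chunks; stands in for the
// ssm_states buffer that the real op now mutates instead of returning.
#include <cstdio>
#include <vector>

using State = std::vector<float>;

// Conceptual "new style": update the persistent state in place, return nothing.
void scan_chunk_inplace(State& state, const std::vector<float>& chunk) {
  for (float x : chunk)
    for (float& s : state) s = 0.9f * s + x;  // toy recurrence, not the SSM math
}

int main() {
  State state(2, 0.0f);                                    // persistent state buffer
  std::vector<std::vector<float>> chunks = {{1, 2}, {3}};  // one prompt, two prefill chunks
  for (const auto& c : chunks) scan_chunk_inplace(state, c);
  std::printf("state after all chunks: %g %g\n", state[0], state[1]);
  return 0;
}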
17 changes: 11 additions & 6 deletions csrc/torch_bindings.cpp
@@ -273,26 +273,31 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 ops.def(
 "selective_scan_fwd(Tensor! u, Tensor! delta,"
 "Tensor! A, Tensor! B, Tensor! C,"
-"Tensor? D_, Tensor? z_, Tensor? delta_bias_,"
+"Tensor? D_, Tensor!? z_, Tensor? delta_bias_,"
 "bool delta_softplus,"
-"Tensor? index_, Tensor!? x) -> Tensor[]");
+"Tensor? query_start_loc,"
+"Tensor? cache_indices,"
+"Tensor? has_initial_state,"
+"Tensor! ssm_states) -> ()");
 ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);

 ops.def(
 "causal_conv1d_update(Tensor! x,"
 "Tensor! conv_state,"
 "Tensor! weight,"
-"Tensor? bias,"
+"Tensor? bias_,"
 "bool silu_activation,"
+"Tensor? cache_seqlens_,"
 "Tensor? conv_state_indices) -> Tensor");
 ops.impl("causal_conv1d_update", torch::kCUDA, &causal_conv1d_update);

 ops.def(
 "causal_conv1d_fwd(Tensor! x, Tensor! weight,"
 "Tensor? bias_,"
-"Tensor? seq_idx_,"
-"Tensor? initial_states_,"
-"Tensor!? final_states_out_,"
+"Tensor!? conv_states,"
+"Tensor? query_start_loc,"
+"Tensor? cache_indices,"
+"Tensor? has_initial_state,"
 "bool silu_activation) -> Tensor");
 ops.impl("causal_conv1d_fwd", torch::kCUDA, &causal_conv1d_fwd);
 #endif
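For readers less familiar with the schema strings: the "!" aliasing annotation marks a tensor the op may write in place, "?" marks an optional argument (so "Tensor!? z_" is an optional tensor that may be mutated), and "-> ()" records that selective_scan_fwd no longer returns anything now that its results land in ssm_states. A minimal, hypothetical registration using the same annotation kinds; toy_ops and accumulate are made-up names that merely mirror the ops.def/ops.impl pattern above:

// Hypothetical op whose schema uses the same annotations as the bindings above.
#include <torch/library.h>

// Adds delta into acc in place when delta is provided; returns nothing.
void accumulate(const at::Tensor& acc,
                const c10::optional<at::Tensor>& delta) {
  if (delta.has_value()) acc.add_(*delta);
}

TORCH_LIBRARY(toy_ops, m) {
  // Tensor! -> may be written in place; Tensor? -> optional; -> () -> no return value.
  m.def("accumulate(Tensor! acc, Tensor? delta) -> ()");
  m.impl("accumulate", torch::kCPU, &accumulate);
}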
