Support trt_flash_multihead_matmul_fuse_pass on machines with SM less than 80 #56492

Merged · 9 commits · Sep 15, 2023
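For readers skimming the diff: before this PR, `ApplyImpl` bailed out entirely on GPUs with SM < 80; after it, the fusion always runs and the compute-capability check only selects the weight layout — a combined QKV weight on SM >= 80 with persistable (non-LoRA) weights, or separate per-projection weights otherwise. The following is a minimal standalone sketch of that decision in plain C++ with stand-in names and values, not the actual Paddle code:

```cpp
#include <iostream>

// Stand-in for platform::GetGPUComputeCapability(); e.g. 75 on a T4, 80 on an A100.
int GetComputeCapability() { return 75; }

int main() {
  const int sm = GetComputeCapability();
  const bool use_trt_fma = sm >= 80;     // TensorRT fused multihead-attention path
  const bool weight_is_constant = true;  // true when Q/K/V weights are persistable (non-LoRA)

  if (use_trt_fma && weight_is_constant) {
    // Q/K/V weights are merged into one [in, 3, out] tensor and passed as input "W".
    std::cout << "combined weight path (sm " << sm << ")\n";
  } else {
    // Weights stay separate: inputs "weight_query", "weight_key", "weight_value".
    std::cout << "separate weight path (sm " << sm << ")\n";
  }
  return 0;
}
```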
paddle/fluid/framework/ir/trt_flash_multihead_matmul_fuse_pass.cc (139 changes: 79 additions, 60 deletions)
@@ -267,6 +267,14 @@ int TrtFlashMultiHeadMatmulFusePass::BuildFlashFusion(
Graph* graph, const std::string& name_scope, Scope* scope) const {
GraphPatternDetector gpd;
auto* pattern = gpd.mutable_pattern();
bool use_trt_fma = false;

#ifdef PADDLE_WITH_TENSORRT
int sm = platform::GetGPUComputeCapability(platform::GetCurrentDeviceId());
use_trt_fma = sm >= 80 ? true : false;
#endif
// Lora's attention weight cannot be manipulated during pass processing
bool weight_is_constant = false;

// Create pattern.
patterns::TrtFlashMultiHeadMatmulPattern multihead_pattern(pattern,
@@ -299,73 +307,88 @@ int TrtFlashMultiHeadMatmulFusePass::BuildFlashFusion(
int head_number =
PADDLE_GET_CONST(std::vector<int>, reshape_desc->GetAttr("shape"))
.at(2);

int head_size =
PADDLE_GET_CONST(std::vector<int>, reshape_desc->GetAttr("shape"))
.at(3);
multihead_op_desc.SetType("flash_multihead_matmul");
multihead_op_desc.SetInput("Input", {input0->Name()});

auto* wq_tensor =
scope->FindVar(mul0_w->Name())->GetMutable<phi::DenseTensor>();
auto* wk_tensor =
scope->FindVar(mul1_w->Name())->GetMutable<phi::DenseTensor>();
auto* wv_tensor =
scope->FindVar(mul2_w->Name())->GetMutable<phi::DenseTensor>();
if (mul0_w->Var()->Persistable() && mul1_w->Var()->Persistable() &&
mul2_w->Var()->Persistable()) {
weight_is_constant = true;
}
// check the scale
int hidden_out = wq_tensor->dims()[1];
int head_size = hidden_out / head_number;
int hidden_out = head_number * head_size;
if (abs(scale_attr - 1.0f / sqrt(static_cast<float>(head_size))) > 1e-5) {
VLOG(3) << "scale of muilthead matmul do not fit the requirement of "
"flash attention plugin, Stop fusing.";
return;
}

float* wq_data = wq_tensor->data<float>();
float* wk_data = wk_tensor->data<float>();
float* wv_data = wv_tensor->data<float>();
// combined_w_dims = [in,3,out]
auto combined_w_dims =
phi::make_ddim({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]});
auto* combined_w_desc = mul0_w->Var();
combined_w_desc->SetShape({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]});
combined_w_desc->SetPersistable(true);
phi::DenseTensor tmp_combined_w_tensor;
tmp_combined_w_tensor.Resize(combined_w_dims);
float* tmp_combined_w_data =
dev_ctx->template HostAlloc<float>(&tmp_combined_w_tensor);

std::vector<const float*> w_vec = {wq_data, wk_data, wv_data};
int dims_h = combined_w_dims[0], dims_w = combined_w_dims[2];
// dims_h=in_feature, dims_w=out_feature
// Combine the three fc weights together.
// weight [Hidden_in * 3 * N * H]
for (int i = 0; i < dims_h; i++) {
for (int j = 0; j < 3; j++) {
for (int k = 0; k < dims_w; k++) {
int out_index = i * (3 * dims_w) + j * dims_w + k;
int in_index = i * dims_w + k;
tmp_combined_w_data[out_index] = w_vec[j][in_index];
if (use_trt_fma && weight_is_constant) {
auto* wq_tensor =
scope->FindVar(mul0_w->Name())->GetMutable<phi::DenseTensor>();
auto* wk_tensor =
scope->FindVar(mul1_w->Name())->GetMutable<phi::DenseTensor>();
auto* wv_tensor =
scope->FindVar(mul2_w->Name())->GetMutable<phi::DenseTensor>();
float* wq_data = wq_tensor->data<float>();
float* wk_data = wk_tensor->data<float>();
float* wv_data = wv_tensor->data<float>();
// auto dims = wq_tensor->dims();
// combined_w_dims = [in,3,out]
auto combined_w_dims =
phi::make_ddim({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]});
auto* combined_w_desc = mul0_w->Var();
combined_w_desc->SetShape(
{wq_tensor->dims()[0], 3, wq_tensor->dims()[1]});
combined_w_desc->SetPersistable(true);
phi::DenseTensor tmp_combined_w_tensor;
tmp_combined_w_tensor.Resize(combined_w_dims);
float* tmp_combined_w_data =
dev_ctx->template HostAlloc<float>(&tmp_combined_w_tensor);
std::vector<const float*> w_vec = {wq_data, wk_data, wv_data};
int dims_h = combined_w_dims[0], dims_w = combined_w_dims[2];
// dims_h=in_feature, dims_w=out_feature
// Combine the three fc weights together.
// weight [Hidden_in * 3 * N * H]
for (int i = 0; i < dims_h; i++) {
for (int j = 0; j < 3; j++) {
for (int k = 0; k < dims_w; k++) {
int out_index = i * (3 * dims_w) + j * dims_w + k;
int in_index = i * dims_w + k;
tmp_combined_w_data[out_index] = w_vec[j][in_index];
}
}
}
// clear weight for reuse
wq_tensor->clear();
wq_tensor->Resize(combined_w_dims);
float* new_combined_w_data = dev_ctx->template HostAlloc<float>(
wq_tensor, sizeof(float) * wq_tensor->numel());
memcpy(new_combined_w_data,
tmp_combined_w_data,
sizeof(float) * wq_tensor->numel());

scope->EraseVars({mul1_w->Name(), mul2_w->Name()});
multihead_op_desc.SetInput("W", {mul0_w->Name()});

} else {
multihead_op_desc.SetInput("weight_query", {mul0_w->Name()});
multihead_op_desc.SetInput("weight_key", {mul1_w->Name()});
multihead_op_desc.SetInput("weight_value", {mul2_w->Name()});
}
// clear weight for reuse
wq_tensor->clear();
wq_tensor->Resize(combined_w_dims);

float* new_combined_w_data = dev_ctx->template HostAlloc<float>(
wq_tensor, sizeof(float) * wq_tensor->numel());
memcpy(new_combined_w_data,
tmp_combined_w_data,
sizeof(float) * wq_tensor->numel());

scope->EraseVars({mul1_w->Name(), mul2_w->Name()});

multihead_op_desc.SetInput("W", {mul0_w->Name()});
multihead_op_desc.SetOutput("Out", {reshape2_qkv_out->Name()});
multihead_op_desc.SetAttr("alpha", scale_attr);
multihead_op_desc.SetAttr("scale", scale_attr);
multihead_op_desc.SetAttr("hidden_out", hidden_out);
multihead_op_desc.SetAttr("head_number", head_number);

multihead_op_desc.SetOutput("Out", {reshape2_qkv_out->Name()});
multihead_op_desc.SetAttr("use_trt_fma", use_trt_fma);
multihead_op_desc.SetAttr("weight_is_constant", weight_is_constant);
auto* multihead = graph->CreateOpNode(&multihead_op_desc);
IR_NODE_LINK_TO(input0, multihead);
IR_NODE_LINK_TO(mul0_w, multihead);
if (!use_trt_fma || !weight_is_constant) {
IR_NODE_LINK_TO(mul1_w, multihead);
IR_NODE_LINK_TO(mul2_w, multihead);
}
IR_NODE_LINK_TO(multihead, reshape2_qkv_out);
};
int fusion_count{0};
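The combine loop in the hunk above lays the three projection weights out as [Hidden_in, 3, Hidden_out]: for every input row, the Q, K and V rows are placed back to back. Below is a small self-contained sketch of that layout with toy dimensions, using `std::vector` instead of `phi::DenseTensor`; it mirrors the index math in the pass but is purely illustrative:

```cpp
#include <cassert>
#include <vector>

// Interleave three [in, out] weight matrices into one [in, 3, out] buffer,
// following out_index = i * (3 * out_dim) + j * out_dim + k from the pass.
std::vector<float> CombineQKV(const std::vector<float>& wq,
                              const std::vector<float>& wk,
                              const std::vector<float>& wv,
                              int in_dim, int out_dim) {
  const std::vector<const float*> w = {wq.data(), wk.data(), wv.data()};
  std::vector<float> combined(static_cast<size_t>(in_dim) * 3 * out_dim);
  for (int i = 0; i < in_dim; ++i)
    for (int j = 0; j < 3; ++j)
      for (int k = 0; k < out_dim; ++k)
        combined[i * (3 * out_dim) + j * out_dim + k] = w[j][i * out_dim + k];
  return combined;
}

int main() {
  // Toy sizes: in=2, out=3. Row i of the result is [q_i | k_i | v_i].
  std::vector<float> wq = {1, 1, 1, 2, 2, 2};
  std::vector<float> wk = {3, 3, 3, 4, 4, 4};
  std::vector<float> wv = {5, 5, 5, 6, 6, 6};
  auto c = CombineQKV(wq, wk, wv, 2, 3);
  assert(c[0] == 1 && c[3] == 3 && c[6] == 5);    // first input row: q, k, v blocks
  assert(c[9] == 2 && c[12] == 4 && c[15] == 6);  // second input row
  return 0;
}
```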
@@ -471,11 +494,14 @@ int TrtFlashMultiHeadMatmulFusePass::BuildFlashFusion(
reshape2_qkv,
scale});
// Remove unneeded nodes.
if (!use_trt_fma || !weight_is_constant) {
marked_nodes.erase(mul1_w);
marked_nodes.erase(mul2_w);
}
GraphSafeRemoveNodes(graph, marked_nodes);
++fusion_count;
};
gpd(graph, handler);

return fusion_count;
}
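Note also the scale check earlier in `BuildFlashFusion`: the fusion is only applied when the matmul's `scale_attr` matches 1 / sqrt(head_size) to within 1e-5. A tiny illustration of the expected value, assuming a hypothetical head_size of 64:

```cpp
#include <cmath>
#include <cstdio>

int main() {
  const int head_size = 64;  // example value; in the pass it comes from the reshape shape attr
  const float expected = 1.0f / std::sqrt(static_cast<float>(head_size));
  std::printf("expected scale for head_size=%d: %f\n", head_size, expected);  // 0.125
  return 0;
}
```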

@@ -492,13 +518,6 @@ void TrtFlashMultiHeadMatmulFusePass::ApplyImpl(Graph* graph) const {
"8.5.2.2. Stop this pass";
return;
}
int sm = platform::GetGPUComputeCapability(platform::GetCurrentDeviceId());
if (sm < 80) {
VLOG(3) << "Flash attention oss plugin only available for nvidia gpu with "
"sm >= 80, but got sm = "
<< sm << " . Stop this pass";
return;
}
#else
// if no tensorrt, early stop
return;