PaddlePaddle · Xreki · Sep 4, 2023 · Aug 16, 2023 · Aug 16, 2023 · Aug 17, 2023
diff --git a/paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu b/paddle/phi/kernels/gpu/flash_attn_grad_kernel.cu
@@ -236,11 +236,10 @@ void FlashAttnUnpaddedGradKernel(const Context& ctx,
     const int64_t total_k = k.dims()[0];
     const int64_t num_heads_k = k.dims()[1];
 
-    // TODO(umiswing): add deterministic in fa2.
-    // int num_splits = 0;  // 0 for an internal heuristic, which is optimal
-    // if (FLAGS_cudnn_deterministic) {
-    //   num_splits = 1;
-    // }
+    int num_splits = 0;  // 0 for an internal heuristic, which is optimal
+    if (FLAGS_cudnn_deterministic) {
+      num_splits = 1;
+    }
 
     // TODO(umiswing): add shape check
     PADDLE_ENFORCE_EQ(
@@ -294,6 +293,7 @@ void FlashAttnUnpaddedGradKernel(const Context& ctx,
                                             params.scale,
                                             params.causal,
                                             params.is_bf16,
+                                            num_splits,
                                             stream,
                                             params.seed,
                                             params.offset);
@@ -401,6 +401,11 @@ void FlashAttnGradKernel(const Context& ctx,
     VLOG(10) << "FlashAttn bwd seed: " << params.seed
              << ", offset: " << params.offset;
 
+    int num_splits = 0;  // 0 for an internal heuristic, which is optimal
+    if (FLAGS_cudnn_deterministic) {
+      num_splits = 1;
+    }
+
     bool succ = phi::dynload::flash_attn_bwd(dout.data(),
                                              q.data(),
                                              k.data(),
@@ -426,6 +431,7 @@ void FlashAttnGradKernel(const Context& ctx,
                                              params.scale,
                                              params.causal,
                                              params.is_bf16,
+                                             num_splits,
                                              stream,
                                              params.seed,
                                              params.offset);

diff --git a/test/legacy_test/test_flash_attention.py b/test/legacy_test/test_flash_attention.py
@@ -191,17 +191,13 @@ def test_unpadded(self):
                 fetches_result[0], out_, rtol=5e-03, atol=1e-03
             )
 
-    def test_all(self):
+    def flash_attn_compute(self, query, key, value):
         print(
             f"Test case shape {self.shape} dtype {self.dtype} causal {self.causal}"
         )
         # test dynamic
         paddle.disable_static()
 
-        query = np.random.random(self.shape)
-        key = np.random.random(self.shape)
-        value = np.random.random(self.shape)
-
         q = paddle.to_tensor(
             query, place=self.place, dtype=self.dtype, stop_gradient=False
         )
@@ -306,6 +302,29 @@ def test_all(self):
             np.testing.assert_allclose(
                 fetches_result[0], out_, rtol=5e-03, atol=1e-03
             )
+            return out, out_, fetches_result[0]
+
+    def test_all(self):
+        """Run the flash-attention checks once on random q/k/v inputs."""
+        query, key, value = (np.random.random(self.shape) for _ in range(3))
+        # The helper asserts internally, so its return value is not needed here.
+        self.flash_attn_compute(query, key, value)
+
+    def test_all_flag(self):
+        """Two runs with FLAGS_cudnn_deterministic set must match bitwise."""
+        # NOTE(review): compares returned outputs only; confirm grads are covered.
+        query = np.random.random(self.shape)
+        key = np.random.random(self.shape)
+        value = np.random.random(self.shape)
+        paddle.set_flags({'FLAGS_cudnn_deterministic': 1})
+        try:
+            out1, _, static_out1 = self.flash_attn_compute(query, key, value)
+            out2, _, static_out2 = self.flash_attn_compute(query, key, value)
+            self.assertTrue(np.equal(out1.numpy(), out2.numpy()).all())
+            self.assertTrue(np.equal(static_out1, static_out2).all())
+        finally:
+            # Always clear the flag so a failure cannot leak into later tests.
+            paddle.set_flags({'FLAGS_cudnn_deterministic': 0})
 
 
 @unittest.skipIf(