From cdb8e2c43464c85c4a845b564ed9267c34035954 Mon Sep 17 00:00:00 2001
From: houj04
Date: Fri, 3 Nov 2023 17:58:59 +0800
Subject: [PATCH 1/2] [XPU] add bfloat16 support for gaussian and uniform

---
 cmake/external/xpu.cmake                  |  2 +-
 paddle/phi/backends/xpu/xpu2_op_list.cc   |  9 ++-
 paddle/phi/backends/xpu/xpu3_op_list.cc   |  9 ++-
 paddle/phi/kernels/xpu/gaussian_kernel.cc |  3 +-
 paddle/phi/kernels/xpu/uniform_kernel.cc  | 65 +++++----------
 test/xpu/test_gaussian_random_op_xpu.py   | 34 ++++++++
 test/xpu/test_uniform_random_op_xpu.py    | 99 +++++++++++++++++++----
 7 files changed, 153 insertions(+), 68 deletions(-)

diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index 48979742e4501..34d31d299eb89 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -24,7 +24,7 @@ set(XPU_XFT_LIB_NAME "libxft.so")
 set(XPU_XPTI_LIB_NAME "libxpti.so")
 
 if(NOT DEFINED XPU_BASE_DATE)
-  set(XPU_BASE_DATE "20231025")
+  set(XPU_BASE_DATE "20231103")
 endif()
 set(XPU_XCCL_BASE_VERSION "1.0.53.6")
 if(NOT DEFINED XPU_XFT_BASE_VERSION)
diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc
index 0b22963170998..1ba90c8f1af3b 100644
--- a/paddle/phi/backends/xpu/xpu2_op_list.cc
+++ b/paddle/phi/backends/xpu/xpu2_op_list.cc
@@ -446,7 +446,9 @@ XPUOpMap& get_kl2_ops() {
                    phi::DataType::INT64,
                    phi::DataType::BOOL})},
     {"gaussian_random",
-     XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
+     XPUKernelSet({phi::DataType::FLOAT32,
+                   phi::DataType::FLOAT16,
+                   phi::DataType::BFLOAT16})},
     {"gelu_grad",
      XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
     {"gelu", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
@@ -977,7 +979,10 @@ XPUOpMap& get_kl2_ops() {
     {"update_loss_scaling",
      XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
     {"unbind", XPUKernelSet({phi::DataType::FLOAT32})},
-    {"uniform_random", XPUKernelSet({phi::DataType::FLOAT32})},
+    {"uniform_random",
+     XPUKernelSet({phi::DataType::FLOAT32,
+                   phi::DataType::FLOAT16,
+                   phi::DataType::BFLOAT16})},
     {"unique",
      XPUKernelSet({phi::DataType::FLOAT32,
                    phi::DataType::INT32,
diff --git a/paddle/phi/backends/xpu/xpu3_op_list.cc b/paddle/phi/backends/xpu/xpu3_op_list.cc
index f52f91c911de4..8bd1599128704 100644
--- a/paddle/phi/backends/xpu/xpu3_op_list.cc
+++ b/paddle/phi/backends/xpu/xpu3_op_list.cc
@@ -416,7 +416,9 @@ XPUOpMap& get_kl3_ops() {
                    phi::DataType::INT64,
                    phi::DataType::BOOL})},
     {"gaussian_random",
-     XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
+     XPUKernelSet({phi::DataType::FLOAT32,
+                   phi::DataType::FLOAT16,
+                   phi::DataType::BFLOAT16})},
     {"gelu_grad",
      XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
     {"gelu", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
@@ -942,7 +944,10 @@ XPUOpMap& get_kl3_ops() {
     {"update_loss_scaling",
      XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
     {"unbind", XPUKernelSet({phi::DataType::FLOAT32})},
-    {"uniform_random", XPUKernelSet({phi::DataType::FLOAT32})},
+    {"uniform_random",
+     XPUKernelSet({phi::DataType::FLOAT32,
+                   phi::DataType::FLOAT16,
+                   phi::DataType::BFLOAT16})},
     {"unique",
      XPUKernelSet({phi::DataType::FLOAT32,
                    phi::DataType::INT32,
diff --git a/paddle/phi/kernels/xpu/gaussian_kernel.cc b/paddle/phi/kernels/xpu/gaussian_kernel.cc
index f8058f94e872f..2c4a29b6bfe51 100644
--- a/paddle/phi/kernels/xpu/gaussian_kernel.cc
+++ b/paddle/phi/kernels/xpu/gaussian_kernel.cc
@@ -50,4 +50,5 @@ PD_REGISTER_KERNEL(gaussian,
                    ALL_LAYOUT,
                    phi::GaussianKernel,
                    float,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
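With "gaussian_random" registered for BFLOAT16 in both op lists and phi::dtype::bfloat16 added to the kernel registration above, a bfloat16 gaussian draw becomes dispatchable on XPU. A minimal usage sketch, assuming a Paddle build with XPU support and an 'xpu:0' device available (the snippet is an illustration, not part of the patch):

    import paddle

    # route eager execution to the XPU backend
    paddle.set_device('xpu:0')
    # a bfloat16 default dtype makes gaussian() take the new kernel path
    paddle.framework.set_default_dtype('bfloat16')
    out = paddle.tensor.random.gaussian([2, 3])
    # the same check the updated tests perform further down
    assert out.dtype == paddle.base.core.VarDesc.VarType.BF16

The kernel that backs this call is the uniform/gaussian pair below.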
diff --git a/paddle/phi/kernels/xpu/uniform_kernel.cc b/paddle/phi/kernels/xpu/uniform_kernel.cc
index 99388e31e5881..dd3e0e2931536 100644
--- a/paddle/phi/kernels/xpu/uniform_kernel.cc
+++ b/paddle/phi/kernels/xpu/uniform_kernel.cc
@@ -14,12 +14,9 @@ limitations under the License. */
 
 #include "paddle/phi/kernels/uniform_kernel.h"
 
-#include <random>
-
-#include "paddle/phi/backends/xpu/xpu_context.h"
-#include "paddle/phi/common/memory_utils.h"
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/core/generator.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/funcs/uniform_real_distribution.h"
 
 namespace phi {
 
@@ -31,49 +28,27 @@ void UniformKernel(const Context &dev_ctx,
                    const Scalar &max,
                    int seed,
                    DenseTensor *out) {
-  int diag_num = 0;
-  int diag_step = 0;
-  float diag_val = 0.0f;
   out->Resize(phi::make_ddim(shape.GetData()));
   T *data = dev_ctx.template Alloc<T>(out);
-  int64_t size = out->numel();
-
-  std::unique_ptr<T[]> data_cpu(new T[size]);
-
-  std::shared_ptr<std::mt19937_64> engine;
-  if (seed) {
-    engine = std::make_shared<std::mt19937_64>();
-    engine->seed(seed);
-  } else {
-    engine = dev_ctx.GetGenerator()->GetCPUEngine();
-  }
-  UniformRealDistribution<T>(
-      data_cpu.get(), size, min.to<float>(), max.to<float>(), engine);
-  if (diag_num > 0) {
-    PADDLE_ENFORCE_GT(
-        size,
-        (diag_num - 1) * (diag_step + 1),
-        phi::errors::InvalidArgument(
-            "ShapeInvalid: the diagonal's elements is equal (num-1) "
-            "* (step-1) with num %d, step %d,"
-            "It should be smaller than %d, but received %d",
-            diag_num,
-            diag_step,
-            (diag_num - 1) * (diag_step + 1),
-            size));
-    for (int64_t i = 0; i < diag_num; ++i) {
-      int64_t pos = i * diag_step + i;
-      data_cpu[pos] = diag_val;
-    }
-  }
-
-  memory_utils::Copy(dev_ctx.GetPlace(),
-                     data,
-                     phi::CPUPlace(),
-                     reinterpret_cast<void *>(data_cpu.get()),
-                     size * sizeof(T));
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  int64_t real_seed = seed != 0 ? seed : dev_ctx.GetGenerator()->Random64();
+
+  // int random(Context* ctx, T* x, int64_t len, T min, T max, int64_t seed);
+  int r = xpu::random<XPUType>(dev_ctx.x_context(),
+                               reinterpret_cast<XPUType *>(data),
+                               out->numel(),
+                               static_cast<XPUType>(min.to<float>()),
+                               static_cast<XPUType>(max.to<float>()),
+                               real_seed);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "random");
 }
 
 }  // namespace phi
 
-PD_REGISTER_KERNEL(uniform, XPU, ALL_LAYOUT, phi::UniformKernel, float) {}
+PD_REGISTER_KERNEL(uniform,
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::UniformKernel,
+                   float,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
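The rewrite above replaces the old fill-on-CPU-then-copy path (a std::mt19937_64 engine plus memory_utils::Copy) with a single device-side xdnn call. The seeding contract is worth spelling out: a nonzero seed attribute is used verbatim, while seed == 0 draws a fresh Random64() from the framework generator, so paddle.seed() still makes results reproducible. A sketch of that behavior, assuming an XPU build (illustrative, not part of the patch):

    import paddle

    paddle.set_device('xpu:0')

    paddle.seed(10)  # feeds Random64(), since the op's seed attr defaults to 0
    a = paddle.uniform([4], dtype='float16', min=-5.0, max=10.0)
    paddle.seed(10)
    b = paddle.uniform([4], dtype='float16', min=-5.0, max=10.0)
    # a and b should match elementwise: the device stream is seeded identically

One consequence of generating on-device is that the bit stream no longer matches the CPU implementation, which is why the reworked tests below validate the output distribution rather than exact values.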
diff --git a/test/xpu/test_gaussian_random_op_xpu.py b/test/xpu/test_gaussian_random_op_xpu.py
index abdec498f0a62..7e80bd00ac586 100644
--- a/test/xpu/test_gaussian_random_op_xpu.py
+++ b/test/xpu/test_gaussian_random_op_xpu.py
@@ -26,8 +26,23 @@ from paddle import base
 
 paddle.enable_static()
+from paddle.base import core
 from paddle.tensor import random
 
+typeid_dict = {
+    'int32': int(core.VarDesc.VarType.INT32),
+    'int64': int(core.VarDesc.VarType.INT64),
+    'float32': int(core.VarDesc.VarType.FP32),
+    'float16': int(core.VarDesc.VarType.FP16),
+    'bfloat16': int(core.VarDesc.VarType.BF16),
+    'bool': int(core.VarDesc.VarType.BOOL),
+    'int8': int(core.VarDesc.VarType.INT8),
+    'uint8': int(core.VarDesc.VarType.UINT8),
+    'float64': int(core.VarDesc.VarType.FP64),
+}
+
+from op_test import convert_uint16_to_float
+
 
 class XPUTestGaussianRandomOp(XPUOpTestWrapper):
     def __init__(self):
@@ -52,6 +67,7 @@ def setUp(self):
             "std": self.std,
             "seed": 10,
             "use_mkldnn": self.use_mkldnn,
+            "dtype": typeid_dict[self.in_type_str],
         }
         paddle.seed(10)
 
@@ -67,6 +83,10 @@ def test_check_output(self):
         )
 
     def verify_output(self, outs):
+        # special for bf16
+        if self.in_type_str == "bfloat16":
+            outs = convert_uint16_to_float(outs)
+
         self.assertEqual(outs[0].shape, (123, 92))
         hist, _ = np.histogram(outs[0], range=(-3, 5))
         hist = hist.astype("float32")
@@ -100,6 +120,7 @@ def setUp(self):
             'std': self.std,
             'seed': self.seed,
             'use_mkldnn': self.use_mkldnn,
+            "dtype": typeid_dict[self.in_type_str],
         }
 
         self.inputs = {"ShapeTensorList": shape_tensor_list}
@@ -165,6 +186,7 @@ def setUp(self):
             'std': self.std,
             'seed': self.seed,
             'use_mkldnn': self.use_mkldnn,
+            "dtype": typeid_dict[self.in_type_str],
         }
         self.outputs = {'Out': np.zeros((123, 92), dtype=self.dtype)}
 
@@ -265,6 +287,11 @@ def test_default_fp16():
             out = paddle.tensor.random.gaussian([2, 3])
             self.assertEqual(out.dtype, base.core.VarDesc.VarType.FP16)
 
+        def test_default_bf16():
+            paddle.framework.set_default_dtype('bfloat16')
+            out = paddle.tensor.random.gaussian([2, 3])
+            self.assertEqual(out.dtype, base.core.VarDesc.VarType.BF16)
+
         def test_default_fp32():
             paddle.framework.set_default_dtype('float32')
             out = paddle.tensor.random.gaussian([2, 3])
@@ -278,6 +305,7 @@ def test_default_fp64():
         test_default_fp64()
         test_default_fp32()
         test_default_fp16()
+        test_default_bf16()
 
         paddle.enable_static()
 
@@ -291,6 +319,11 @@ def test_default_fp16():
             out = paddle.tensor.random.standard_normal([2, 3])
             self.assertEqual(out.dtype, base.core.VarDesc.VarType.FP16)
 
+        def test_default_bf16():
+            paddle.framework.set_default_dtype('bfloat16')
+            out = paddle.tensor.random.standard_normal([2, 3])
+            self.assertEqual(out.dtype, base.core.VarDesc.VarType.BF16)
+
         def test_default_fp32():
             paddle.framework.set_default_dtype('float32')
             out = paddle.tensor.random.standard_normal([2, 3])
@@ -304,6 +337,7 @@ def test_default_fp64():
         test_default_fp64()
         test_default_fp32()
         test_default_fp16()
+        test_default_bf16()
 
         paddle.enable_static()
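verify_output needs convert_uint16_to_float because bfloat16 outputs come back from the fetch as raw uint16 bit patterns. Conceptually, the helper widens each pattern into the high 16 bits of an IEEE float32. A self-contained sketch of that conversion (an illustration of the idea, not the actual op_test implementation):

    import numpy as np

    def bf16_bits_to_float32(bits: np.ndarray) -> np.ndarray:
        # bf16 is the top half of a float32, so shift into the high bits
        return (bits.astype(np.uint32) << 16).view(np.float32)

    # 1.5 is exactly representable in bfloat16, so the round trip is lossless
    f32_bits = np.frombuffer(np.float32(1.5).tobytes(), np.uint32)[0]
    bits = np.array([f32_bits >> 16], dtype=np.uint16)
    assert bf16_bits_to_float32(bits)[0] == np.float32(1.5)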
diff --git a/test/xpu/test_uniform_random_op_xpu.py b/test/xpu/test_uniform_random_op_xpu.py
index 24972d64b0eb6..a82f305b047a4 100644
--- a/test/xpu/test_uniform_random_op_xpu.py
+++ b/test/xpu/test_uniform_random_op_xpu.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,32 +16,97 @@
 import unittest
 
 import numpy as np
-from test_uniform_random_op import (
-    TestUniformRandomOp,
-    TestUniformRandomOpSelectedRows,
+from get_test_cover_info import (
+    XPUOpTestWrapper,
+    create_test_class,
+    get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
 
 import paddle
 
 paddle.enable_static()
+from paddle.base import core
 
+typeid_dict = {
+    'int32': int(core.VarDesc.VarType.INT32),
+    'int64': int(core.VarDesc.VarType.INT64),
+    'float32': int(core.VarDesc.VarType.FP32),
+    'float16': int(core.VarDesc.VarType.FP16),
+    'bfloat16': int(core.VarDesc.VarType.BF16),
+    'bool': int(core.VarDesc.VarType.BOOL),
+    'int8': int(core.VarDesc.VarType.INT8),
+    'uint8': int(core.VarDesc.VarType.UINT8),
+    'float64': int(core.VarDesc.VarType.FP64),
+}
 
-class TestXPUUniformRandomOp(TestUniformRandomOp):
-    def test_check_output(self):
-        if paddle.is_compiled_with_xpu():
-            place = paddle.XPUPlace(0)
-            outs = self.calc_output(place)
-            outs = [np.array(out) for out in outs]
-            outs.sort(key=len)
-            self.verify_output(outs)
+def output_hist(out):
+    if out.dtype == np.uint16:
+        out = convert_uint16_to_float(out)
+    hist, _ = np.histogram(out, range=(-5, 10))
+    hist = hist.astype("float32")
+    hist /= float(out.size)
+    prob = 0.1 * np.ones(10)
+    return hist, prob
 
-class TestXPUUniformRandomOpSelectedRows(TestUniformRandomOpSelectedRows):
-    def test_check_output(self):
-        if paddle.is_compiled_with_xpu():
-            place = paddle.XPUPlace(0)
-            self.check_with_place(place)
+from op_test import convert_uint16_to_float
+
+
+class XPUTestUniformRandomOp(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'uniform_random'
+        self.use_dynamic_create_class = False
+
+    class TestUniformRandomOp(XPUOpTest):
+        def init(self):
+            self.dtype = self.in_type
+            self.place = paddle.XPUPlace(0)
+            self.op_type = "uniform_random"
+            self.python_api = paddle.uniform
+
+        def setUp(self):
+            self.init()
+            self.inputs = {}
+            self.use_mkldnn = False
+            self.set_attrs()
+            paddle.seed(10)
+
+            self.outputs = {"Out": np.zeros((1000, 784), dtype=self.dtype)}
+
+        def set_attrs(self):
+            self.attrs = {
+                "shape": [1000, 784],
+                "min": -5.0,
+                "max": 10.0,
+                "dtype": typeid_dict[self.in_type_str],
+            }
+            self.output_hist = output_hist
+
+        def test_check_output(self):
+            self.check_output_with_place_customized(
+                self.verify_output, self.place
+            )
+
+        def verify_output(self, outs):
+            hist, prob = self.output_hist(np.array(outs[0]))
+            np.testing.assert_allclose(hist, prob, rtol=0, atol=0.01)
+
+    class TestMaxMinAreInt(TestUniformRandomOp):
+        def set_attrs(self):
+            self.attrs = {
+                "shape": [1000, 784],
+                "min": -5,
+                "max": 10,
+                "dtype": typeid_dict[self.in_type_str],
+            }
+            self.output_hist = output_hist
+
+
+support_types = get_xpu_op_support_types('uniform_random')
+for stype in support_types:
+    create_test_class(globals(), XPUTestUniformRandomOp, stype)
 
 if __name__ == "__main__":
     unittest.main()
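Because exact values cannot be pinned down across generators, the rewritten test accepts any output whose 10-bin histogram over [-5, 10) is flat to within an absolute tolerance of 0.01. The criterion in output_hist reduces to the following standalone check, with numpy standing in for the op output:

    import numpy as np

    rng = np.random.default_rng(10)
    sample = rng.uniform(-5.0, 10.0, size=(1000, 784))  # the test's shape
    hist, _ = np.histogram(sample, range=(-5, 10))      # 10 equal-width bins
    hist = hist.astype("float32") / sample.size
    np.testing.assert_allclose(hist, 0.1 * np.ones(10), rtol=0, atol=0.01)

With 784,000 samples, each bin's empirical frequency concentrates near 0.1 well inside the 0.01 tolerance, so the check is loose enough to pass for any reasonable generator yet tight enough to catch range or scaling bugs.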
From 1d530a7f0946512568600cf772e7b2407352dc3c Mon Sep 17 00:00:00 2001
From: houj04
Date: Fri, 3 Nov 2023 21:14:17 +0800
Subject: [PATCH 2/2] fix zero dim.

---
 paddle/phi/kernels/xpu/uniform_kernel.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/paddle/phi/kernels/xpu/uniform_kernel.cc b/paddle/phi/kernels/xpu/uniform_kernel.cc
index dd3e0e2931536..ead65b65a8466 100644
--- a/paddle/phi/kernels/xpu/uniform_kernel.cc
+++ b/paddle/phi/kernels/xpu/uniform_kernel.cc
@@ -30,6 +30,10 @@ void UniformKernel(const Context &dev_ctx,
                    DenseTensor *out) {
   out->Resize(phi::make_ddim(shape.GetData()));
   T *data = dev_ctx.template Alloc<T>(out);
+  if (out->numel() == 0) {
+    return;
+  }
+
   using XPUType = typename XPUTypeTrait<T>::Type;
   int64_t real_seed = seed != 0 ? seed : dev_ctx.GetGenerator()->Random64();
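The guard returns before the xpu::random call when the output has no elements, for example a shape containing a zero-size dimension, so xdnn is never asked for a zero-length fill. A quick sketch of the case this covers, again assuming an XPU build (illustrative, not part of the patch):

    import paddle

    paddle.set_device('xpu:0')
    x = paddle.uniform([0, 3], dtype='bfloat16')  # zero-element output
    print(x.shape)  # [0, 3]; the kernel allocates and returns immediately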