From cdb8e2c43464c85c4a845b564ed9267c34035954 Mon Sep 17 00:00:00 2001
From: houj04
Date: Fri, 3 Nov 2023 17:58:59 +0800
Subject: [PATCH 1/2] [XPU] add bfloat16 support for gaussian and uniform

---
 cmake/external/xpu.cmake                  |  2 +-
 paddle/phi/backends/xpu/xpu2_op_list.cc   |  9 ++-
 paddle/phi/backends/xpu/xpu3_op_list.cc   |  9 ++-
 paddle/phi/kernels/xpu/gaussian_kernel.cc |  3 +-
 paddle/phi/kernels/xpu/uniform_kernel.cc  | 65 +++++----------
 test/xpu/test_gaussian_random_op_xpu.py   | 34 ++++++++
 test/xpu/test_uniform_random_op_xpu.py    | 99 +++++++++++++++++++----
 7 files changed, 153 insertions(+), 68 deletions(-)

diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index 48979742e4501..34d31d299eb89 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -24,7 +24,7 @@ set(XPU_XFT_LIB_NAME "libxft.so")
 set(XPU_XPTI_LIB_NAME "libxpti.so")
 
 if(NOT DEFINED XPU_BASE_DATE)
-  set(XPU_BASE_DATE "20231025")
+  set(XPU_BASE_DATE "20231103")
 endif()
 set(XPU_XCCL_BASE_VERSION "1.0.53.6")
 if(NOT DEFINED XPU_XFT_BASE_VERSION)
diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc
index 0b22963170998..1ba90c8f1af3b 100644
--- a/paddle/phi/backends/xpu/xpu2_op_list.cc
+++ b/paddle/phi/backends/xpu/xpu2_op_list.cc
@@ -446,7 +446,9 @@ XPUOpMap& get_kl2_ops() {
                    phi::DataType::INT64,
                    phi::DataType::BOOL})},
     {"gaussian_random",
-     XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
+     XPUKernelSet({phi::DataType::FLOAT32,
+                   phi::DataType::FLOAT16,
+                   phi::DataType::BFLOAT16})},
     {"gelu_grad",
      XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
     {"gelu", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
@@ -977,7 +979,10 @@ XPUOpMap& get_kl2_ops() {
     {"update_loss_scaling",
      XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
     {"unbind", XPUKernelSet({phi::DataType::FLOAT32})},
-    {"uniform_random", XPUKernelSet({phi::DataType::FLOAT32})},
+    {"uniform_random",
+     XPUKernelSet({phi::DataType::FLOAT32,
+                   phi::DataType::FLOAT16,
+                   phi::DataType::BFLOAT16})},
     {"unique",
      XPUKernelSet({phi::DataType::FLOAT32,
                    phi::DataType::INT32,
diff --git a/paddle/phi/backends/xpu/xpu3_op_list.cc b/paddle/phi/backends/xpu/xpu3_op_list.cc
index f52f91c911de4..8bd1599128704 100644
--- a/paddle/phi/backends/xpu/xpu3_op_list.cc
+++ b/paddle/phi/backends/xpu/xpu3_op_list.cc
@@ -416,7 +416,9 @@ XPUOpMap& get_kl3_ops() {
                    phi::DataType::INT64,
                    phi::DataType::BOOL})},
     {"gaussian_random",
-     XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
+     XPUKernelSet({phi::DataType::FLOAT32,
+                   phi::DataType::FLOAT16,
+                   phi::DataType::BFLOAT16})},
     {"gelu_grad",
      XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
     {"gelu", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
@@ -942,7 +944,10 @@ XPUOpMap& get_kl3_ops() {
     {"update_loss_scaling",
      XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
     {"unbind", XPUKernelSet({phi::DataType::FLOAT32})},
-    {"uniform_random", XPUKernelSet({phi::DataType::FLOAT32})},
+    {"uniform_random",
+     XPUKernelSet({phi::DataType::FLOAT32,
+                   phi::DataType::FLOAT16,
+                   phi::DataType::BFLOAT16})},
     {"unique",
      XPUKernelSet({phi::DataType::FLOAT32,
                    phi::DataType::INT32,
diff --git a/paddle/phi/kernels/xpu/gaussian_kernel.cc b/paddle/phi/kernels/xpu/gaussian_kernel.cc
index f8058f94e872f..2c4a29b6bfe51 100644
--- a/paddle/phi/kernels/xpu/gaussian_kernel.cc
+++ b/paddle/phi/kernels/xpu/gaussian_kernel.cc
@@ -50,4 +50,5 @@ PD_REGISTER_KERNEL(gaussian,
                    ALL_LAYOUT,
                    phi::GaussianKernel,
                    float,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
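With "gaussian_random" registered for BFLOAT16 in both op lists and phi::dtype::bfloat16 added to the kernel registration above, a bfloat16 gaussian draw becomes dispatchable on XPU. A minimal usage sketch, assuming a Paddle build with XPU support and an 'xpu:0' device available (the snippet is an illustration, not part of the patch):

    import paddle

    # route eager execution to the XPU backend
    paddle.set_device('xpu:0')
    # a bfloat16 default dtype makes gaussian() take the new kernel path
    paddle.framework.set_default_dtype('bfloat16')
    out = paddle.tensor.random.gaussian([2, 3])
    # the same check the updated tests perform further down
    assert out.dtype == paddle.base.core.VarDesc.VarType.BF16

The kernel that backs this call is the uniform/gaussian pair below.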
diff --git a/paddle/phi/kernels/xpu/uniform_kernel.cc b/paddle/phi/kernels/xpu/uniform_kernel.cc
index 99388e31e5881..dd3e0e2931536 100644
--- a/paddle/phi/kernels/xpu/uniform_kernel.cc
+++ b/paddle/phi/kernels/xpu/uniform_kernel.cc
@@ -14,12 +14,9 @@ limitations under the License. */
 
 #include "paddle/phi/kernels/uniform_kernel.h"
 
-#include <random>
-
-#include "paddle/phi/backends/xpu/xpu_context.h"
-#include "paddle/phi/common/memory_utils.h"
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/core/generator.h"
 #include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/funcs/uniform_real_distribution.h"
 
 namespace phi {
 
@@ -31,49 +28,27 @@ void UniformKernel(const Context &dev_ctx,
                    const Scalar &max,
                    int seed,
                    DenseTensor *out) {
-  int diag_num = 0;
-  int diag_step = 0;
-  float diag_val = 0.0f;
   out->Resize(phi::make_ddim(shape.GetData()));
   T *data = dev_ctx.template Alloc<T>(out);
-  int64_t size = out->numel();
-
-  std::unique_ptr<T[]> data_cpu(new T[size]);
-
-  std::shared_ptr<std::mt19937_64> engine;
-  if (seed) {
-    engine = std::make_shared<std::mt19937_64>();
-    engine->seed(seed);
-  } else {
-    engine = dev_ctx.GetGenerator()->GetCPUEngine();
-  }
-  UniformRealDistribution<T>(
-      data_cpu.get(), size, min.to<float>(), max.to<float>(), engine);
-  if (diag_num > 0) {
-    PADDLE_ENFORCE_GT(
-        size,
-        (diag_num - 1) * (diag_step + 1),
-        phi::errors::InvalidArgument(
-            "ShapeInvalid: the diagonal's elements is equal (num-1) "
-            "* (step-1) with num %d, step %d,"
-            "It should be smaller than %d, but received %d",
-            diag_num,
-            diag_step,
-            (diag_num - 1) * (diag_step + 1),
-            size));
-    for (int64_t i = 0; i < diag_num; ++i) {
-      int64_t pos = i * diag_step + i;
-      data_cpu[pos] = diag_val;
-    }
-  }
-
-  memory_utils::Copy(dev_ctx.GetPlace(),
-                     data,
-                     phi::CPUPlace(),
-                     reinterpret_cast<void *>(data_cpu.get()),
-                     size * sizeof(T));
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  int64_t real_seed = seed != 0 ? seed : dev_ctx.GetGenerator()->Random64();
+
+  // int random(Context* ctx, T* x, int64_t len, T min, T max, int64_t seed);
+  int r = xpu::random<XPUType>(dev_ctx.x_context(),
+                               reinterpret_cast<XPUType *>(data),
+                               out->numel(),
+                               static_cast<XPUType>(min.to<float>()),
+                               static_cast<XPUType>(max.to<float>()),
+                               real_seed);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "random");
 }
 
 }  // namespace phi
 
-PD_REGISTER_KERNEL(uniform, XPU, ALL_LAYOUT, phi::UniformKernel, float) {}
+PD_REGISTER_KERNEL(uniform,
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::UniformKernel,
+                   float,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
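The rewrite above replaces the old fill-on-CPU-then-copy path (a std::mt19937_64 engine plus memory_utils::Copy) with a single device-side xdnn call. The seeding contract is worth spelling out: a nonzero seed attribute is used verbatim, while seed == 0 draws a fresh Random64() from the framework generator, so paddle.seed() still makes results reproducible. A sketch of that behavior, assuming an XPU build (illustrative, not part of the patch):

    import paddle

    paddle.set_device('xpu:0')

    paddle.seed(10)  # feeds Random64(), since the op's seed attr defaults to 0
    a = paddle.uniform([4], dtype='float16', min=-5.0, max=10.0)
    paddle.seed(10)
    b = paddle.uniform([4], dtype='float16', min=-5.0, max=10.0)
    # a and b should match elementwise: the device stream is seeded identically

One consequence of generating on-device is that the bit stream no longer matches the CPU implementation, which is why the reworked tests below validate the output distribution rather than exact values.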
diff --git a/test/xpu/test_gaussian_random_op_xpu.py b/test/xpu/test_gaussian_random_op_xpu.py
index abdec498f0a62..7e80bd00ac586 100644
--- a/test/xpu/test_gaussian_random_op_xpu.py
+++ b/test/xpu/test_gaussian_random_op_xpu.py
@@ -26,8 +26,23 @@ from paddle import base
 
 paddle.enable_static()
+from paddle.base import core
 from paddle.tensor import random
 
+typeid_dict = {
+    'int32': int(core.VarDesc.VarType.INT32),
+    'int64': int(core.VarDesc.VarType.INT64),
+    'float32': int(core.VarDesc.VarType.FP32),
+    'float16': int(core.VarDesc.VarType.FP16),
+    'bfloat16': int(core.VarDesc.VarType.BF16),
+    'bool': int(core.VarDesc.VarType.BOOL),
+    'int8': int(core.VarDesc.VarType.INT8),
+    'uint8': int(core.VarDesc.VarType.UINT8),
+    'float64': int(core.VarDesc.VarType.FP64),
+}
+
+from op_test import convert_uint16_to_float
+
 
 class XPUTestGaussianRandomOp(XPUOpTestWrapper):
     def __init__(self):
@@ -52,6 +67,7 @@ def setUp(self):
             "std": self.std,
             "seed": 10,
             "use_mkldnn": self.use_mkldnn,
+            "dtype": typeid_dict[self.in_type_str],
         }
         paddle.seed(10)
 
@@ -67,6 +83,10 @@ def test_check_output(self):
         )
 
     def verify_output(self, outs):
+        # special for bf16
+        if self.in_type_str == "bfloat16":
+            outs = convert_uint16_to_float(outs)
+
         self.assertEqual(outs[0].shape, (123, 92))
         hist, _ = np.histogram(outs[0], range=(-3, 5))
         hist = hist.astype("float32")
@@ -100,6 +120,7 @@ def setUp(self):
             'std': self.std,
             'seed': self.seed,
             'use_mkldnn': self.use_mkldnn,
+            "dtype": typeid_dict[self.in_type_str],
         }
 
         self.inputs = {"ShapeTensorList": shape_tensor_list}
@@ -165,6 +186,7 @@ def setUp(self):
             'std': self.std,
             'seed': self.seed,
             'use_mkldnn': self.use_mkldnn,
+            "dtype": typeid_dict[self.in_type_str],
         }
         self.outputs = {'Out': np.zeros((123, 92), dtype=self.dtype)}
 
@@ -265,6 +287,11 @@ def test_default_fp16():
             out = paddle.tensor.random.gaussian([2, 3])
             self.assertEqual(out.dtype, base.core.VarDesc.VarType.FP16)
 
+        def test_default_bf16():
+            paddle.framework.set_default_dtype('bfloat16')
+            out = paddle.tensor.random.gaussian([2, 3])
+            self.assertEqual(out.dtype, base.core.VarDesc.VarType.BF16)
+
         def test_default_fp32():
             paddle.framework.set_default_dtype('float32')
             out = paddle.tensor.random.gaussian([2, 3])
@@ -278,6 +305,7 @@ def test_default_fp64():
         test_default_fp64()
         test_default_fp32()
         test_default_fp16()
+        test_default_bf16()
 
         paddle.enable_static()
 
@@ -291,6 +319,11 @@ def test_default_fp16():
             out = paddle.tensor.random.standard_normal([2, 3])
             self.assertEqual(out.dtype, base.core.VarDesc.VarType.FP16)
 
+        def test_default_bf16():
+            paddle.framework.set_default_dtype('bfloat16')
+            out = paddle.tensor.random.standard_normal([2, 3])
+            self.assertEqual(out.dtype, base.core.VarDesc.VarType.BF16)
+
         def test_default_fp32():
             paddle.framework.set_default_dtype('float32')
             out = paddle.tensor.random.standard_normal([2, 3])
@@ -304,6 +337,7 @@ def test_default_fp64():
         test_default_fp64()
         test_default_fp32()
         test_default_fp16()
+        test_default_bf16()
 
         paddle.enable_static()
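verify_output needs convert_uint16_to_float because bfloat16 outputs come back from the fetch as raw uint16 bit patterns. Conceptually, the helper widens each pattern into the high 16 bits of an IEEE float32. A self-contained sketch of that conversion (an illustration of the idea, not the actual op_test implementation):

    import numpy as np

    def bf16_bits_to_float32(bits: np.ndarray) -> np.ndarray:
        # bf16 is the top half of a float32, so shift into the high bits
        return (bits.astype(np.uint32) << 16).view(np.float32)

    # 1.5 is exactly representable in bfloat16, so the round trip is lossless
    f32_bits = np.frombuffer(np.float32(1.5).tobytes(), np.uint32)[0]
    bits = np.array([f32_bits >> 16], dtype=np.uint16)
    assert bf16_bits_to_float32(bits)[0] == np.float32(1.5)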
diff --git a/test/xpu/test_uniform_random_op_xpu.py b/test/xpu/test_uniform_random_op_xpu.py
index 24972d64b0eb6..a82f305b047a4 100644
--- a/test/xpu/test_uniform_random_op_xpu.py
+++ b/test/xpu/test_uniform_random_op_xpu.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,32 +16,97 @@
 import unittest
 
 import numpy as np
-from test_uniform_random_op import (
-    TestUniformRandomOp,
-    TestUniformRandomOpSelectedRows,
+from get_test_cover_info import (
+    XPUOpTestWrapper,
+    create_test_class,
+    get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
 
 import paddle
 
 paddle.enable_static()
+from paddle.base import core
 
+typeid_dict = {
+    'int32': int(core.VarDesc.VarType.INT32),
+    'int64': int(core.VarDesc.VarType.INT64),
+    'float32': int(core.VarDesc.VarType.FP32),
+    'float16': int(core.VarDesc.VarType.FP16),
+    'bfloat16': int(core.VarDesc.VarType.BF16),
+    'bool': int(core.VarDesc.VarType.BOOL),
+    'int8': int(core.VarDesc.VarType.INT8),
+    'uint8': int(core.VarDesc.VarType.UINT8),
+    'float64': int(core.VarDesc.VarType.FP64),
+}
 
-class TestXPUUniformRandomOp(TestUniformRandomOp):
-    def test_check_output(self):
-        if paddle.is_compiled_with_xpu():
-            place = paddle.XPUPlace(0)
-            outs = self.calc_output(place)
-            outs = [np.array(out) for out in outs]
-            outs.sort(key=len)
-            self.verify_output(outs)
+def output_hist(out):
+    if out.dtype == np.uint16:
+        out = convert_uint16_to_float(out)
+    hist, _ = np.histogram(out, range=(-5, 10))
+    hist = hist.astype("float32")
+    hist /= float(out.size)
+    prob = 0.1 * np.ones(10)
+    return hist, prob
 
-class TestXPUUniformRandomOpSelectedRows(TestUniformRandomOpSelectedRows):
-    def test_check_output(self):
-        if paddle.is_compiled_with_xpu():
-            place = paddle.XPUPlace(0)
-            self.check_with_place(place)
+from op_test import convert_uint16_to_float
+
+
+class XPUTestUniformRandomOp(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'uniform_random'
+        self.use_dynamic_create_class = False
+
+    class TestUniformRandomOp(XPUOpTest):
+        def init(self):
+            self.dtype = self.in_type
+            self.place = paddle.XPUPlace(0)
+            self.op_type = "uniform_random"
+            self.python_api = paddle.uniform
+
+        def setUp(self):
+            self.init()
+            self.inputs = {}
+            self.use_mkldnn = False
+            self.set_attrs()
+            paddle.seed(10)
+
+            self.outputs = {"Out": np.zeros((1000, 784), dtype=self.dtype)}
+
+        def set_attrs(self):
+            self.attrs = {
+                "shape": [1000, 784],
+                "min": -5.0,
+                "max": 10.0,
+                "dtype": typeid_dict[self.in_type_str],
+            }
+            self.output_hist = output_hist
+
+        def test_check_output(self):
+            self.check_output_with_place_customized(
+                self.verify_output, self.place
+            )
+
+        def verify_output(self, outs):
+            hist, prob = self.output_hist(np.array(outs[0]))
+            np.testing.assert_allclose(hist, prob, rtol=0, atol=0.01)
+
+    class TestMaxMinAreInt(TestUniformRandomOp):
+        def set_attrs(self):
+            self.attrs = {
+                "shape": [1000, 784],
+                "min": -5,
+                "max": 10,
+                "dtype": typeid_dict[self.in_type_str],
+            }
+            self.output_hist = output_hist
+
+
+support_types = get_xpu_op_support_types('uniform_random')
+for stype in support_types:
+    create_test_class(globals(), XPUTestUniformRandomOp, stype)
 
 if __name__ == "__main__":
     unittest.main()
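Because exact values cannot be pinned down across generators, the rewritten test accepts any output whose 10-bin histogram over [-5, 10) is flat to within an absolute tolerance of 0.01. The criterion in output_hist reduces to the following standalone check, with numpy standing in for the op output:

    import numpy as np

    rng = np.random.default_rng(10)
    sample = rng.uniform(-5.0, 10.0, size=(1000, 784))  # the test's shape
    hist, _ = np.histogram(sample, range=(-5, 10))      # 10 equal-width bins
    hist = hist.astype("float32") / sample.size
    np.testing.assert_allclose(hist, 0.1 * np.ones(10), rtol=0, atol=0.01)

With 784,000 samples, each bin's empirical frequency concentrates near 0.1 well inside the 0.01 tolerance, so the check is loose enough to pass for any reasonable generator yet tight enough to catch range or scaling bugs.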
From 1d530a7f0946512568600cf772e7b2407352dc3c Mon Sep 17 00:00:00 2001
From: houj04
Date: Fri, 3 Nov 2023 21:14:17 +0800
Subject: [PATCH 2/2] fix zero dim.

---
 paddle/phi/kernels/xpu/uniform_kernel.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/paddle/phi/kernels/xpu/uniform_kernel.cc b/paddle/phi/kernels/xpu/uniform_kernel.cc
index dd3e0e2931536..ead65b65a8466 100644
--- a/paddle/phi/kernels/xpu/uniform_kernel.cc
+++ b/paddle/phi/kernels/xpu/uniform_kernel.cc
@@ -30,6 +30,10 @@ void UniformKernel(const Context &dev_ctx,
                    DenseTensor *out) {
   out->Resize(phi::make_ddim(shape.GetData()));
   T *data = dev_ctx.template Alloc<T>(out);
+  if (out->numel() == 0) {
+    return;
+  }
+
   using XPUType = typename XPUTypeTrait<T>::Type;
   int64_t real_seed = seed != 0 ? seed : dev_ctx.GetGenerator()->Random64();
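The guard returns before the xpu::random call when the output has no elements, for example a shape containing a zero-size dimension, so xdnn is never asked for a zero-length fill. A quick sketch of the case this covers, again assuming an XPU build (illustrative, not part of the patch):

    import paddle

    paddle.set_device('xpu:0')
    x = paddle.uniform([0, 3], dtype='bfloat16')  # zero-element output
    print(x.shape)  # [0, 3]; the kernel allocates and returns immediately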