From a3b001e6ee40fea295f2afcd5c21b9de683b34cc Mon Sep 17 00:00:00 2001
From: Qi Shao <17864154871@163.com>
Date: Thu, 13 Jul 2023 10:44:03 +0800
Subject: [PATCH] Modify bf16 and fix the elementwise_max (#54799)

* modify the accuracy checking framework of bf16 optest, including both forward and backward
---
 .../phi/kernels/funcs/elementwise_grad_base.h |  73 ++++-----
 test/legacy_test/eager_op_test.py             | 138 +++++++++++++-----
 test/legacy_test/test_elementwise_div_op.py   |   2 -
 test/legacy_test/test_matmul_v2_op.py         |   4 +
 test/legacy_test/test_pool_max_op.py          |   4 +-
 test/legacy_test/test_unique.py               |  24 ++-
 test/legacy_test/testsuite.py                 |   2 +-
 test/white_list/op_accuracy_white_list.py     |   4 +
 8 files changed, 175 insertions(+), 76 deletions(-)

diff --git a/paddle/phi/kernels/funcs/elementwise_grad_base.h b/paddle/phi/kernels/funcs/elementwise_grad_base.h
index 49f593289a9f1..df9e93474446e 100644
--- a/paddle/phi/kernels/funcs/elementwise_grad_base.h
+++ b/paddle/phi/kernels/funcs/elementwise_grad_base.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include "paddle/phi/backends/context_pool.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/common_shape.h"
@@ -114,41 +115,43 @@ static void ElemwiseGradBroadcast1CPU(const T *x,
                                       DY_OP dy_op,
                                       T *dx,
                                       T *dy) {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
   if (is_xsize_larger) {
-    for (int i = 0; i < h; ++i) {
-      for (int j = 0; j < w; ++j) {
+    for (int j = 0; j < w; ++j) {
+      MPType sum_y = static_cast<MPType>(0);
+      for (int i = 0; i < h; ++i) {
         int x_offset = i * w + j;
         if (dx != nullptr) {
           dx[x_offset] =
               dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
         }
         if (dy != nullptr) {
-          T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
-          if (i == 0) {
-            dy[j] = tmp;
-          } else {
-            dy[j] += tmp;
-          }
+          sum_y += static_cast<MPType>(
+              dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]));
         }
       }
+      if (dy != nullptr) {
+        dy[j] = static_cast<T>(sum_y);
+      }
     }
-  } else {  // x.dims < y.dims, broadcast for x.
-    for (int i = 0; i < h; ++i) {
-      for (int j = 0; j < w; ++j) {
+  } else {
+    for (int j = 0; j < w; ++j) {
+      MPType sum_x = static_cast<MPType>(0);
+      for (int i = 0; i < h; ++i) {
         int y_offset = i * w + j;
         if (dy != nullptr) {
           dy[y_offset] =
               dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]);
         }
         if (dx != nullptr) {
-          T tmp = dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]);
-          if (i == 0) {
-            dx[j] = tmp;
-          } else {
-            dx[j] += tmp;
-          }
+          sum_x += static_cast<MPType>(
+              dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]));
         }
       }
+      if (dx != nullptr) {
+        dx[j] = static_cast<T>(sum_x);
+      }
     }
   }
 }
@@ -166,9 +169,12 @@ static void ElemwiseGradBroadcast2CPU(const T *x,
                                       DY_OP dy_op,
                                       T *dx,
                                       T *dy) {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
   if (is_xsize_larger) {
-    for (int i = 0; i < pre; ++i) {
-      for (int j = 0; j < n; ++j) {
+    for (int j = 0; j < n; ++j) {
+      MPType sum_y = static_cast<MPType>(0);
+      for (int i = 0; i < pre; ++i) {
         for (int k = 0; k < post; ++k) {
           int x_offset = i * n * post + j * post + k;
           if (dx != nullptr) {
@@ -176,19 +182,19 @@ static void ElemwiseGradBroadcast2CPU(const T *x,
                 dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
           }
           if (dy != nullptr) {
-            T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]);
-            if (i == 0 && k == 0) {
-              dy[j] = tmp;
-            } else {
-              dy[j] += tmp;
-            }
+            sum_y += static_cast<MPType>(
+                dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]));
           }
         }
       }
+      if (dy != nullptr) {
+        dy[j] = static_cast<T>(sum_y);
+      }
     }
-  } else {  // x.dims < y.dims, broadcast for x.
-    for (int i = 0; i < pre; ++i) {
-      for (int j = 0; j < n; ++j) {
+  } else {
+    for (int j = 0; j < n; ++j) {
+      MPType sum_x = static_cast<MPType>(0);
+      for (int i = 0; i < pre; ++i) {
         for (int k = 0; k < post; ++k) {
           int y_offset = i * n * post + j * post + k;
           if (dy != nullptr) {
@@ -196,15 +202,14 @@ static void ElemwiseGradBroadcast2CPU(const T *x,
                 dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]);
           }
           if (dx != nullptr) {
-            T tmp = dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]);
-            if (i == 0 && k == 0) {
-              dx[j] = tmp;
-            } else {
-              dx[j] += tmp;
-            }
+            sum_x += static_cast<MPType>(
+                dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]));
           }
         }
       }
+      if (dx != nullptr) {
+        dx[j] = static_cast<T>(sum_x);
+      }
     }
   }
 }
diff --git a/test/legacy_test/eager_op_test.py b/test/legacy_test/eager_op_test.py
index 04f9c0240f305..e4d1ff08d99af 100644
--- a/test/legacy_test/eager_op_test.py
+++ b/test/legacy_test/eager_op_test.py
@@ -552,8 +552,20 @@ def is_fp16_compared_with_fp32(self):
             not in op_accuracy_white_list.NO_FP16_COMPARED_WITH_FP32_OP_LIST
         )
 
+    def is_bf16_compared_with_fp32(self):
+        return self.is_bfloat16_op() and (
+            self.op_type
+            not in op_accuracy_white_list.NO_BF16_COMPARED_WITH_FP32_OP_LIST
+        )
+
+    def is_compared_with_fp32(self):
+        return (
+            self.is_fp16_compared_with_fp32()
+            or self.is_bf16_compared_with_fp32()
+        )
+
     def enable_cal_ref_output(self):
-        self.is_calc_ref = self.is_fp16_compared_with_fp32()
+        self.is_calc_ref = True
 
     def disable_cal_ref_output(self):
         self.is_calc_ref = False
@@ -654,20 +666,47 @@ def feed_var(self, input_vars, place):
                     if isinstance(np_value, tuple):
                         tensor.set(np_value[0], place)
                         dtype = np.array(np_value[1]).dtype
-                        if self.is_calc_ref and dtype == np.float16:
-                            if isinstance(np_value[1], list):
-                                tensor.set_recursive_sequence_lengths(
-                                    np.array(np_value[1]).astype(np.float32)
-                                )
+
+                        if self.is_calc_ref:
+                            # convert the float16 to float by numpy.astype
+                            if dtype == np.float16:
+                                if isinstance(np_value[1], list):
+                                    tensor.set_recursive_sequence_lengths(
np.array(np_value[1]).astype(np.float32) + ) + else: + tensor.set_recursive_sequence_lengths( + np_value[1].astype(np.float32) + ) + # convert the bfloat16 to float by convert_uint16_to_float + # provided in this file + elif dtype == np.uint16: + if isinstance(np_value[1], list): + tensor.set_recursive_sequence_lengths( + convert_uint16_to_float( + np.array(np_value[1]) + ) + ) + else: + tensor.set_recursive_sequence_lengths( + convert_uint16_to_float(np_value[1]) + ) else: tensor.set_recursive_sequence_lengths( - np_value[1].astype(np.float32) + np_value[1] ) else: tensor.set_recursive_sequence_lengths(np_value[1]) else: - if self.is_calc_ref and np_value.dtype == np.float16: - tensor.set(np_value.astype(np.float32), place) + if self.is_calc_ref: + if np_value.dtype == np.float16: + tensor.set(np_value.astype(np.float32), place) + elif np_value.dtype == np.uint16: + tensor.set( + convert_uint16_to_float(np_value), place + ) + else: + tensor.set(np_value, place) else: tensor.set(np_value, place) feed_map[name] = tensor @@ -675,25 +714,57 @@ def feed_var(self, input_vars, place): tensor = core.LoDTensor() if isinstance(self.inputs[var_name], tuple): tensor.set(self.inputs[var_name][0], place) - if ( - self.is_calc_ref - and self.inputs[var_name][1].dtype == np.float16 - ): - tensor.set_recursive_sequence_lengths( - self.inputs[var_name][1].astype(np.float32) - ) + if self.is_calc_ref: + if isinstance(self.inputs[var_name][1], list): + dtype = np.array(self.inputs[var_name][1]).dtype + if dtype == np.float16: + tensor.set_recursive_sequence_lengths( + np.array(self.inputs[var_name][1]).astype( + np.float32 + ) + ) + elif dtype == np.uint16: + tensor.set_recursive_sequence_lengths( + convert_uint16_to_float( + np.array(self.inputs[var_name][1]) + ) + ) + else: + tensor.set_recursive_sequence_lengths( + self.inputs[var_name][1] + ) + + elif self.inputs[var_name][1].dtype == np.float16: + tensor.set_recursive_sequence_lengths( + self.inputs[var_name][1].astype(np.float32) + ) + elif self.inputs[var_name][1].dtype == np.uint16: + tensor.set_recursive_sequence_lengths( + convert_uint16_to_float( + self.inputs[var_name][1] + ) + ) + else: + tensor.set_recursive_sequence_lengths( + self.inputs[var_name][1] + ) else: tensor.set_recursive_sequence_lengths( self.inputs[var_name][1] ) else: - if ( - self.is_calc_ref - and self.inputs[var_name].dtype == np.float16 - ): - tensor.set( - self.inputs[var_name].astype(np.float32), place - ) + if self.is_calc_ref: + if self.inputs[var_name].dtype == np.float16: + tensor.set( + self.inputs[var_name].astype(np.float32), place + ) + elif self.inputs[var_name].dtype == np.uint16: + tensor.set( + convert_uint16_to_float(self.inputs[var_name]), + place, + ) + else: + tensor.set(self.inputs[var_name], place) else: tensor.set(self.inputs[var_name], place) feed_map[var_name] = tensor @@ -711,7 +782,8 @@ def _append_ops(self, block): self.__class__.use_xpu = True op_proto = OpProtoHolder.instance().get_op_proto(self.op_type) - "infer datatype from inputs and outputs for this test case" + # "infer datatype from inputs and outputs for this test case" + if self.is_float16_op(): self.dtype = np.float16 self.__class__.dtype = self.dtype @@ -722,6 +794,7 @@ def _append_ops(self, block): self.output_dtype = np.uint16 else: self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs) + inputs = append_input_output( block, op_proto, self.inputs, True, self.dtype, self.is_calc_ref ) @@ -1809,7 +1882,7 @@ def _compare_list(self, name, actual, expect): def 
compare_single_output_with_expect(self, name, expect): actual, actual_np = self.find_actual_value(name) # expect_np = expect[0] if isinstance(expect, tuple) else expect - if self.op_test.is_fp16_compared_with_fp32(): + if self.op_test.is_compared_with_fp32(): expect, expect_np = self.find_expect_value(name) else: expect_np = ( @@ -1864,7 +1937,7 @@ def calculate_output(self): ) self.outputs = outs self.fetch_list = fetch_list - if self.op_test.is_fp16_compared_with_fp32(): + if self.op_test.is_compared_with_fp32(): self.op_test.enable_cal_ref_output() ref_outs, ref_fetch_list = self.op_test._calc_output( place, no_check_set=no_check_set @@ -1931,7 +2004,7 @@ def calculate_output(self): place, no_check_set=no_check_set ) self.outputs = dygraph_outs - if self.op_test.is_fp16_compared_with_fp32(): + if self.op_test.is_compared_with_fp32(): self.op_test.enable_cal_ref_output() self.is_python_api_test = True self.ref_outputs = self.op_test._calc_python_api_output( @@ -2460,9 +2533,7 @@ def check_grad_with_place( if self.is_bfloat16_op(): if self.is_mkldnn_op(): check_dygraph = False - atol = 1e-2 if atol < 1e-2 else atol - else: - atol = 1e-1 if atol < 1e-1 else atol + atol = 1e-2 if atol < 1e-2 else atol if self.is_float16_op(): atol = 1e-3 if atol < 1e-3 else atol @@ -2492,7 +2563,6 @@ def check_grad_with_place( if "use_mkldnn" in op_attrs and op_attrs["use_mkldnn"]: op_attrs["use_mkldnn"] = False use_onednn = True - self.op = create_op( self.scope, self.op_type, @@ -2538,8 +2608,9 @@ def check_grad_with_place( if numeric_place is None: numeric_place = place - if user_defined_grads is None and self.is_fp16_compared_with_fp32(): + if user_defined_grads is None and self.is_compared_with_fp32(): self.enable_cal_ref_output() + numeric_grads = self._get_gradient( inputs_to_check, place, @@ -2573,6 +2644,7 @@ def check_grad_with_place( ) # comparison of bf16 results will happen as fp32 # loop over list of grads and convert bf16 to fp32 + fp32_analytic_grads = [] for grad in analytic_grads: if grad.dtype == np.uint16: @@ -2869,7 +2941,7 @@ def _get_gradient( feed_dict = self.feed_var(inputs, place) if user_defined_grad_outputs is None: - if self.dtype == np.uint16: + if self.dtype == np.uint16 and not self.is_calc_ref: cast_inputs = list(map(block.var, output_names)) if self.op_type in ["broadcast_tensors", "meshgrid"]: output_names = self.cast_bf16_output(block, cast_inputs) diff --git a/test/legacy_test/test_elementwise_div_op.py b/test/legacy_test/test_elementwise_div_op.py index eba60625668b2..5972f8089fd55 100644 --- a/test/legacy_test/test_elementwise_div_op.py +++ b/test/legacy_test/test_elementwise_div_op.py @@ -212,8 +212,6 @@ def test_check_gradient(self): check_args = [check_option['grad'], 'Out'] check_kwargs = { 'no_grad_set': check_option['no_grad'], - 'user_defined_grads': check_option['val_grad'], - 'user_defined_grad_outputs': [self.grad_out], 'check_dygraph': self.check_dygraph, } if self.place is None: diff --git a/test/legacy_test/test_matmul_v2_op.py b/test/legacy_test/test_matmul_v2_op.py index 6adc3603fb03e..7869042f507e3 100644 --- a/test/legacy_test/test_matmul_v2_op.py +++ b/test/legacy_test/test_matmul_v2_op.py @@ -441,6 +441,8 @@ def test_check_grad_x(self): ['X'], 'Out', no_grad_set={'Y'}, + max_relative_error=3e-2, + atol=3e-2, user_defined_grads=[numeric_grads], check_cinn=self.check_cinn if hasattr(self, 'check_cinn') @@ -455,6 +457,8 @@ def test_check_grad_y(self): ['Y'], 'Out', no_grad_set={'X'}, + max_relative_error=3e-2, + atol=3e-2, 
user_defined_grads=[numeric_grads], check_cinn=self.check_cinn if hasattr(self, 'check_cinn') diff --git a/test/legacy_test/test_pool_max_op.py b/test/legacy_test/test_pool_max_op.py index 16295c575b19b..70b37da5ebb39 100644 --- a/test/legacy_test/test_pool_max_op.py +++ b/test/legacy_test/test_pool_max_op.py @@ -317,7 +317,9 @@ def test_check_grad(self): numeric_grads = self.get_numeric_grad(place, 'X') if core.is_bfloat16_supported(place): self.check_grad_with_place( - place, {'X'}, ['Out'], user_defined_grads=[numeric_grads] + place, + {'X'}, + ['Out'], ) cls_name = "{}_{}".format(parent.__name__, "BF16OP") diff --git a/test/legacy_test/test_unique.py b/test/legacy_test/test_unique.py index 97566042720f7..438a50df403cc 100644 --- a/test/legacy_test/test_unique.py +++ b/test/legacy_test/test_unique.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from eager_op_test import OpTest, paddle_static_guard +from eager_op_test import OpTest, convert_float_to_uint16, paddle_static_guard import paddle from paddle.fluid import core @@ -147,7 +147,14 @@ def init_dtype(self): self.dtype = np.float64 def init_config(self): - self.inputs = {'X': np.array([2, 3, 3, 1, 5, 3], dtype=self.dtype)} + if self.dtype == np.uint16: + self.inputs = { + 'X': convert_float_to_uint16( + np.array([2, 3, 3, 1, 5, 3], dtype=np.float32) + ) + } + else: + self.inputs = {'X': np.array([2, 3, 3, 1, 5, 3], dtype=self.dtype)} unique, indices, inverse, count = np.unique( self.inputs['X'], return_index=True, @@ -197,9 +204,16 @@ def init_dtype(self): self.dtype = np.float64 def init_config(self): - self.inputs = { - 'X': np.random.randint(0, 100, (4, 7, 10)).astype(self.dtype) - } + if self.dtype == np.uint16: + self.inputs = { + 'X': convert_float_to_uint16( + np.random.randint(0, 100, (4, 7, 10)).astype(np.float32) + ) + } + else: + self.inputs = { + 'X': np.random.randint(0, 100, (4, 7, 10)).astype(self.dtype) + } unique, indices, inverse, counts = np.unique( self.inputs['X'], return_index=True, diff --git a/test/legacy_test/testsuite.py b/test/legacy_test/testsuite.py index 7f2cd09fb97b0..9c591fdaafad0 100644 --- a/test/legacy_test/testsuite.py +++ b/test/legacy_test/testsuite.py @@ -120,7 +120,7 @@ def create_var(block, name, np_list, var_proto, is_calc_ref=False): if is_input: shape = list(np_value.shape) lod_level = 0 - if is_calc_ref and dtype == np.float16: + if is_calc_ref and (dtype == np.float16 or dtype == np.uint16): dtype = np.float32 return block.create_var( dtype=dtype, shape=shape, lod_level=lod_level, name=name diff --git a/test/white_list/op_accuracy_white_list.py b/test/white_list/op_accuracy_white_list.py index d7613f7b284e8..12a97a160aab6 100644 --- a/test/white_list/op_accuracy_white_list.py +++ b/test/white_list/op_accuracy_white_list.py @@ -94,3 +94,7 @@ 'fake_quantize_moving_average_abs_max', 'p_norm', ] + +NO_BF16_COMPARED_WITH_FP32_OP_LIST = [ + 'dequantize', +]
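
Taken together, the patch applies one idea in two places: on the kernel side, ElemwiseGradBroadcast1CPU/2CPU accumulate the reduced gradient in MPTypeTrait<T>::Type (float when T is bfloat16 or float16) and cast back to T only once per output element; on the test side, bf16 operators are now checked against an fp32 reference run, with the uint16-stored bf16 data decoded via convert_uint16_to_float / convert_float_to_uint16. The sketch below is a standalone illustration of why that matters, not code from the patch: float_to_uint16 / uint16_to_float are simplified stand-ins for the framework helpers (plain truncation here; the real converters may round), and the tolerances are only examples.

import numpy as np


def float_to_uint16(x):
    # bfloat16 keeps only the high 16 bits of an IEEE float32 bit pattern;
    # plain truncation is used for brevity (convert_float_to_uint16 in
    # eager_op_test.py may differ in rounding behaviour).
    x = np.ascontiguousarray(x, dtype=np.float32)
    return (x.view(np.uint32) >> 16).astype(np.uint16)


def uint16_to_float(x):
    # Re-attach 16 zero mantissa bits to recover the float32 value that a
    # bf16 bit pattern encodes (same spirit as convert_uint16_to_float).
    x = np.ascontiguousarray(x, dtype=np.uint16)
    return (x.astype(np.uint32) << 16).view(np.float32)


rng = np.random.default_rng(0)
grads = rng.standard_normal(4096).astype(np.float32)

# Old kernel behaviour: the running reduction is re-rounded to bf16 after
# every addition, so low-order bits are discarded at each step.
acc_bf16 = np.float32(0.0)
for g in grads:
    acc_bf16 = uint16_to_float(float_to_uint16([acc_bf16 + g]))[0]

# New kernel behaviour: accumulate in float32 (the MPType) and cast to bf16
# only once, when the per-output-element sum is written back.
acc_mp = uint16_to_float(float_to_uint16([grads.sum(dtype=np.float32)]))[0]

reference = grads.sum(dtype=np.float64)
print("bf16 accumulator:", acc_bf16, "abs error", abs(acc_bf16 - reference))
print("fp32 accumulator:", acc_mp, "abs error", abs(acc_mp - reference))

# Framework-side check, in the same spirit as is_bf16_compared_with_fp32():
# decode the uint16 outputs of a bf16 kernel to float32, then compare them
# with an fp32 reference under a loosened tolerance (illustrative values,
# not the framework's defaults).
bf16_kernel_output = float_to_uint16(grads)
fp32_reference_output = grads
np.testing.assert_allclose(
    uint16_to_float(bf16_kernel_output),
    fp32_reference_output,
    rtol=1e-2,
    atol=1e-2,
)

Accumulating in float32 and casting once keeps the summed gradient close to the float64 reference, while re-rounding to bf16 at every step drifts; that gap is exactly what the fp32-reference comparison added to eager_op_test.py is meant to expose.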