From 5a56ea0bfd58dd7d81aaa39db98ff697b637549f Mon Sep 17 00:00:00 2001 From: CaiZhi Date: Tue, 15 Aug 2023 01:21:00 +0800 Subject: [PATCH] [MTAI-484] fix(build): modify format for MUSA --- paddle/phi/backends/gpu/gpu_resources.cc | 38 +-- paddle/phi/kernels/funcs/math_function.cu | 38 +-- paddle/phi/kernels/funcs/softmax.cu | 4 +- .../phi/kernels/gpu/batch_norm_grad_kernel.cu | 45 +-- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 14 +- .../kernels/gpu/instance_norm_grad_kernel.cu | 35 +- paddle/phi/kernels/gpu/rnn_functor.h | 43 +-- paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc | 22 +- paddle/phi/kernels/gpu/rnn_kernel.cu.cc | 28 +- paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 16 +- paddle/phi/kernels/gpudnn/conv_kernel.cu | 9 +- .../gpudnn/conv_transpose_grad_kernel.cu | 9 +- paddle/phi/kernels/impl/matmul_kernel_impl.h | 319 +++++++++--------- 13 files changed, 227 insertions(+), 393 deletions(-) diff --git a/paddle/phi/backends/gpu/gpu_resources.cc b/paddle/phi/backends/gpu/gpu_resources.cc index 01c307bd9e29e2..98ea78a3e51098 100644 --- a/paddle/phi/backends/gpu/gpu_resources.cc +++ b/paddle/phi/backends/gpu/gpu_resources.cc @@ -155,9 +155,10 @@ void InitGpuProperties(Place place, "version."; } #elif defined(PADDLE_WITH_MUSA) - // TODO(@caizhi): enable dynload module + // TODO(@caizhi): mudnnGetVersion is not supported for MUSA now. + // Requests have been submitted to Mudnn. // size_t mudnn_dso_ver = dynload::mudnnGetVersion(); - size_t mudnn_dso_ver = 0; + size_t mudnn_dso_ver = 1100; LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place.device) << ", muDNN Version: " << mudnn_dso_ver / 1000 << "." << (mudnn_dso_ver % 1000) / 100 << "."; @@ -168,21 +169,20 @@ void InitGpuProperties(Place place, auto compile_musa_version = (MUSA_VERSION / 1000) * 10 + (MUSA_VERSION % 100) / 10; #if defined(__linux__) - // TODO(@caizhi): enable dynload module - //PADDLE_ENFORCE_EQ( - // (local_musa_version / 10 < compile_musa_version / 10) && - // (mudnn_dso_ver / 1000 < MUDNN_VERSION / 1000), - // false, - // phi::errors::InvalidArgument( - // "The installed Paddle is compiled with MUDA%d/muDNN%d," - // "but MUSA/muDNN version in your machine is MUSA%d/muDNN%d. " - // "which will cause serious incompatible bug. " - // "Please recompile or reinstall Paddle with compatible MUSA/muDNN " - // "version.", - // compile_musa_version / 10, - // MUDNN_VERSION / 1000, - // local_musa_version / 10, - // mudnn_dso_ver / 1000)); + PADDLE_ENFORCE_EQ( + (local_musa_version / 10 < compile_musa_version / 10) && + (mudnn_dso_ver / 1000 < MUDNN_VERSION / 1000), + false, + phi::errors::InvalidArgument( + "The installed Paddle is compiled with MUDA%d/muDNN%d," + "but MUSA/muDNN version in your machine is MUSA%d/muDNN%d. " + "which will cause serious incompatible bug. 
" + "Please recompile or reinstall Paddle with compatible MUSA/muDNN " + "version.", + compile_musa_version / 10, + MUDNN_VERSION / 1000, + local_musa_version / 10, + mudnn_dso_ver / 1000)); #endif if (local_musa_version < compile_musa_version) { LOG_FIRST_N(WARNING, 1) @@ -335,9 +335,7 @@ void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place) { } PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreate(handle)); PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetStream(*handle, stream)); -#elif defined(PADDLE_WITH_MUSA) - -#else +#elif defined(PADDLE_WITH_CUDA) auto local_cudnn_version = phi::dynload::cudnnGetVersion() / 100; auto compile_cudnn_version = CUDNN_VERSION / 100; if (local_cudnn_version < static_cast(compile_cudnn_version)) { diff --git a/paddle/phi/kernels/funcs/math_function.cu b/paddle/phi/kernels/funcs/math_function.cu index 8dac85bb3c5856..5b9fecbd43b30f 100644 --- a/paddle/phi/kernels/funcs/math_function.cu +++ b/paddle/phi/kernels/funcs/math_function.cu @@ -430,16 +430,15 @@ void ColwiseSum::operator()( SetConstant set; set(context, &one, static_cast(1.0)); - // TODO(@caizhi): enable blas modules - //phi::funcs::GetBlas(context).GEMV( - // true, - // static_cast(in_dims[0]), - // static_cast(in_dims[1]), - // 1.0, - // input.data(), - // one.data(), - // 0.0, - // vector->data()); + phi::funcs::GetBlas(context).GEMV( + true, + static_cast(in_dims[0]), + static_cast(in_dims[1]), + 1.0, + input.data(), + one.data(), + 0.0, + vector->data()); } template struct RowwiseSum; @@ -469,16 +468,15 @@ void RowwiseSum::operator()( SetConstant set; set(context, &one, static_cast(1.0)); - // TODO(@caizhi): enable blas modules - //phi::funcs::GetBlas(context).GEMV( - // true, - // static_cast(in_dims[1]), - // static_cast(in_dims[0]), - // 1.0, - // one.data(), - // input.data(), - // 0.0, - // vector->data()); + phi::funcs::GetBlas(context).GEMV( + true, + static_cast(in_dims[1]), + static_cast(in_dims[0]), + 1.0, + one.data(), + input.data(), + 0.0, + vector->data()); } template struct RowwiseMean; diff --git a/paddle/phi/kernels/funcs/softmax.cu b/paddle/phi/kernels/funcs/softmax.cu index 11ee9c23fa2ffc..fb76ab3017179a 100644 --- a/paddle/phi/kernels/funcs/softmax.cu +++ b/paddle/phi/kernels/funcs/softmax.cu @@ -21,6 +21,7 @@ limitations under the License. 
*/ namespace phi { namespace funcs { + using ScopedTensorDescriptor = phi::backends::gpu::ScopedTensorDescriptor; using DataLayout = phi::backends::gpu::DataLayout; template @@ -117,8 +118,6 @@ void SoftmaxGradCUDNNFunctor::operator()( MIOPEN_SOFTMAX_ACCURATE, MIOPEN_SOFTMAX_MODE_INSTANCE)); #elif defined(PADDLE_WITH_MUSA) - // TODO -#else cudnnTensorDescriptor_t cudnn_y_desc = yDesc.descriptor(layout, cudnn_tensor_dims); cudnnTensorDescriptor_t cudnn_xgrad_desc = @@ -154,6 +153,7 @@ template class SoftmaxGradCUDNNFunctor; template class SoftmaxCUDNNFunctor; template class SoftmaxGradCUDNNFunctor; #endif + template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index 19930eb28add5f..7dc5194e2c150d 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -568,7 +568,7 @@ void BatchNormGradRawKernel(const Context &ctx, scale.dims()[0])); auto dtype = phi::backends::gpu::CudnnDataType::type; -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#ifdef PADDLE_WITH_HIP auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC : DataLayout::kNCHW; @@ -650,8 +650,7 @@ void BatchNormGradRawKernel(const Context &ctx, // platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); // PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); -#elif defined(PADDLE_WITH_MUSA) -#else +#elif defined(PADDLE_WITH_CUDA) cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; cudnnBatchNormMode_t mode_; @@ -697,16 +696,7 @@ void BatchNormGradRawKernel(const Context &ctx, // PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, // data_desc_, mode_)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnSetTensorNdDescriptor( - data_desc_, - CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, - dims.data(), - strides.data())); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnDeriveBNTensorDescriptor( - bn_param_desc_, data_desc_, mode_)); -#else +#elif defined(PADDLE_WITH_CUDA) PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, @@ -789,9 +779,7 @@ void BatchNormGradRawKernel(const Context &ctx, // d_bias->template mutable_data>( // ctx.GetPlace()), // epsilon, saved_mean_data, saved_var_data)); -#elif defined(PADDLE_WITH_MUSA) - -#else +#elif defined(PADDLE_WITH_CUDA) } // CUDNN only support small batch size bool use_native_nhwc = @@ -1127,12 +1115,7 @@ void BatchNormGradRawKernel(const Context &ctx, // platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); // PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mudnnDestroyTensorDescriptor(bn_param_desc_)); -#else +#elif defined(PADDLE_WITH_CUDA) // clean when exit. 
PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::cudnnDestroyTensorDescriptor(data_desc_)); @@ -1392,21 +1375,7 @@ PD_REGISTER_KERNEL(batch_norm_grad_raw, phi::BatchNormGradRawKernel, float, phi::dtype::float16) {} -#elif defined(PADDLE_WITH_MUSA) -PD_REGISTER_KERNEL(batch_norm_grad, - GPU, - ALL_LAYOUT, - phi::BatchNormGradKernel, - float, - phi::dtype::float16) {} - -PD_REGISTER_KERNEL(batch_norm_grad_raw, - GPU, - ALL_LAYOUT, - phi::BatchNormGradRawKernel, - float, - phi::dtype::float16) {} -#else +#elif defined(PADDLE_WITH_CUDA) #if CUDNN_VERSION_MIN(8, 1, 0) PD_REGISTER_KERNEL(batch_norm_grad, @@ -1440,7 +1409,7 @@ PD_REGISTER_KERNEL(batch_norm_grad_raw, kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // bias_grad } } -#else // CUDA & MUSA +#else // CUDA PD_REGISTER_KERNEL(batch_norm_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index d6a1fb761719ed..c3750162637b05 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -554,7 +554,7 @@ void BatchNormKernel(const Context &ctx, auto dtype = phi::backends::gpu::CudnnDataType::type; -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#ifdef PADDLE_WITH_HIP auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC : DataLayout::kNCHW; @@ -597,6 +597,7 @@ void BatchNormKernel(const Context &ctx, // PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); #elif defined(PADDLE_WITH_MUSA) + #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; @@ -615,9 +616,11 @@ void BatchNormKernel(const Context &ctx, } epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#ifdef PADDLE_WITH_HIP // TODO(wangran16): wait for MIOpen to improve the performance of BN // mode_ = miopenBNSpatial; +#elif defined(PADDLE_WITH_MUSA) + #elif CUDNN_VERSION_MIN(7, 0, 1) if (FLAGS_cudnn_batchnorm_spatial_persistent) { mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; @@ -1210,12 +1213,7 @@ void BatchNormKernel(const Context &ctx, // platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); // PADDLE_ENFORCE_GPU_SUCCESS( // platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mudnnDestroyTensorDescriptor(bn_param_desc_)); -#else +#elif defined(PADDLE_WITH_CUDA) // clean when exit. 
PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::cudnnDestroyTensorDescriptor(data_desc_)); diff --git a/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu index 3c8e4274474e14..5e48ed565f353a 100644 --- a/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu @@ -401,15 +401,7 @@ void InstanceNormGradKernel(const Context &dev_ctx, phi::dynload::miopenCreateTensorDescriptor(&data_desc_)); PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); -#elif defined(PADDLE_WITH_MUSA) - mudnnTensorDescriptor_t data_desc_; - mudnnTensorDescriptor_t in_param_desc_; - - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mudnnCreateTensorDescriptor(&in_param_desc_)); -#else +#elif defined(PADDLE_WITH_CUDA) cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t in_param_desc_; @@ -435,16 +427,7 @@ void InstanceNormGradKernel(const Context &dev_ctx, const_cast(strides.data()))); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDeriveBNTensorDescriptor( in_param_desc_, data_desc_, miopenBNSpatial)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnSetTensorDescriptor( - data_desc_, - CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, - const_cast(dims.data()), - const_cast(strides.data()))); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnDeriveBNTensorDescriptor( - in_param_desc_, data_desc_, miopenBNSpatial)); -#else +#elif defined(PADDLE_WITH_CUDA) PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, @@ -481,14 +464,9 @@ void InstanceNormGradKernel(const Context &dev_ctx, epsilon, saved_mean_data, saved_var_data)); -#else -#ifdef PADDLE_WITH_MUSA - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnBatchNormalizationBackward( - dev_ctx.mudnn_handle(), -#else +#elif defined(PADDLE_WITH_CUDA) PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBatchNormalizationBackward( dev_ctx.cudnn_handle(), -#endif CUDNN_BATCHNORM_SPATIAL, CudnnDataType::kOne(), CudnnDataType::kZero(), @@ -533,12 +511,7 @@ void InstanceNormGradKernel(const Context &dev_ctx, phi::dynload::miopenDestroyTensorDescriptor(data_desc_)); PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mudnnDestroyTensorDescriptor(in_param_desc_)); -#else +#elif defined(PADDLE_WITH_CUDA) PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::cudnnDestroyTensorDescriptor(data_desc_)); PADDLE_ENFORCE_GPU_SUCCESS( diff --git a/paddle/phi/kernels/gpu/rnn_functor.h b/paddle/phi/kernels/gpu/rnn_functor.h index e351c29138ee82..3bf38383e9bda3 100644 --- a/paddle/phi/kernels/gpu/rnn_functor.h +++ b/paddle/phi/kernels/gpu/rnn_functor.h @@ -107,10 +107,7 @@ class RNNDescriptors { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::miopenDropoutGetStatesSize(handle, &state_size)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mudnnDropoutGetStatesSize(handle, &state_size)); -#else +#elif defined(PADDLE_WITH_CUDA) PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::cudnnDropoutGetStatesSize(handle, &state_size)); #endif @@ -150,10 +147,7 @@ class RNNDescriptors { mode_, CUDNN_RNN_ALGO_STANDARD, cudnn_type)); -#else -#ifdef PADDLE_WITH_MUSA - 
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnSetRNNDescriptor( -#else +#elif defined(PADDLE_WITH_CUDA) PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetRNNDescriptor( #endif rnn_desc_.desc(), @@ -178,10 +172,7 @@ class RNNDescriptors { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenGetRNNParamsSize( handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnGetRNNParamsSize( - handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type)); -#else +#elif defined(PADDLE_WITH_CUDA) PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnGetRNNParamsSize( handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type)); #endif @@ -205,16 +196,7 @@ class RNNDescriptors { workspace_size)); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenGetRNNTrainingReserveSize( handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), reserve_size)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mudnnGetRNNWorkspaceSize(handle, - rnn_desc_.desc(), - seq_length_, - x_descs_.data(), - workspace_size)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnGetRNNTrainingReserveSize( - handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), reserve_size)); -#else +#elif defined(PADDLE_WITH_CUDA) PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::cudnnGetRNNWorkspaceSize(handle, rnn_desc_.desc(), @@ -235,17 +217,7 @@ class RNNDescriptors { miopenRNNDescriptor_t rnn_desc() { return rnn_desc_.desc(); } miopenDropoutDescriptor_t dropout_desc() { return dropout_desc_.desc(); } miopenTensorDescriptor_t weight_desc() { return weight_desc_.desc(); } -#elif defined(PADDLE_WITH_MUSA) - mudnnTensorDescriptor_t *x_descs() { return x_descs_.data(); } - mudnnTensorDescriptor_t *y_descs() { return y_descs_.data(); } - mudnnTensorDescriptor_t init_h_desc() { return init_h_desc_.desc(); } - mudnnTensorDescriptor_t init_c_desc() { return init_c_desc_.desc(); } - mudnnTensorDescriptor_t last_h_desc() { return last_h_desc_.desc(); } - mudnnTensorDescriptor_t last_c_desc() { return last_c_desc_.desc(); } - mudnnRNNDescriptor_t rnn_desc() { return rnn_desc_.desc(); } - mudnnDropoutDescriptor_t dropout_desc() { return dropout_desc_.desc(); } - mudnnTensorDescriptor_t weight_desc() { return weight_desc_.desc(); } -#else +#elif defined(PADDLE_WITH_CUDA) cudnnTensorDescriptor_t *x_descs() { return x_descs_.data(); } cudnnTensorDescriptor_t *y_descs() { return y_descs_.data(); } #if CUDNN_VERSION >= 7201 @@ -276,10 +248,7 @@ class RNNDescriptors { #ifdef PADDLE_WITH_HIP std::vector x_descs_; std::vector y_descs_; -#elif defined(PADDLE_WITH_HIP) - std::vector x_descs_; - std::vector y_descs_; -#else +#elif defined(PADDLE_WITH_CUDA) std::vector x_descs_; std::vector y_descs_; #endif diff --git a/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc b/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc index 44bca2124770ab..5bd1af8f789e52 100644 --- a/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc +++ b/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc @@ -105,17 +105,7 @@ void RnnGradKernel(const Context &dev_ctx, rnn_mode = miopenRNNRELU; else if (mode == "RNN_TANH") rnn_mode = miopenRNNTANH; -#elif defined(PADDLE_WITH_MUSA) - mudnnRNNMode_t rnn_mode = MUDNN_LSTM; - if (mode == "LSTM") - rnn_mode = MUDNN_LSTM; - else if (mode == "GRU") - rnn_mode = MUDNN_GRU; - else if (mode == "RNN_RELU") - rnn_mode = MUDNN_RNN_RELU; - else if (mode == "RNN_TANH") - rnn_mode = MUDNN_RNN_TANH; -#else +#elif defined(PADDLE_WITH_CUDA) cudnnRNNMode_t 
rnn_mode = CUDNN_LSTM; if (mode == "LSTM") rnn_mode = CUDNN_LSTM; @@ -205,9 +195,7 @@ void RnnGradKernel(const Context &dev_ctx, T *init_c_grad_data = nullptr; #ifdef PADDLE_WITH_HIP if (rnn_mode == miopenLSTM) { -#elif defined(PADDLE_WITH_MUSA) - if (rnn_mode == MUDNN_LSTM) -#else +#elif defined(PADDLE_WITH_CUDA) if (rnn_mode == CUDNN_LSTM) { #endif init_c_data = pre_state[1]->data(); @@ -353,12 +341,8 @@ void RnnGradKernel(const Context &dev_ctx, // permute weight grad list from weight grad tensor TensorToPermutedWeight( place, stream, weight_grad, &weight_grad_list, rnn_mode, is_bidirec); -#else -#ifdef PADDLE_WITH_MUSA - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mudnnRNNBackwardWeights( -#else +#elif defined(PADDLE_WITH_CUDA) PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardWeights( -#endif handle, rnn.rnn_desc(), seq_length, diff --git a/paddle/phi/kernels/gpu/rnn_kernel.cu.cc b/paddle/phi/kernels/gpu/rnn_kernel.cu.cc index 601c1a524c4023..eb2a1b9d4bce7c 100644 --- a/paddle/phi/kernels/gpu/rnn_kernel.cu.cc +++ b/paddle/phi/kernels/gpu/rnn_kernel.cu.cc @@ -63,13 +63,9 @@ void RNNInferece(bool has_seq_length, last_c_data, workspace_data->data(), workspace_size)); -#else +#elif defined(PADDLE_WITH_CUDA) PADDLE_ENFORCE_GPU_SUCCESS( -#ifdef PADDLE_WITH_MUSA - phi::dynload::mudnnRNNForwardInference(handle, -#else phi::dynload::cudnnRNNForwardInference(handle, -#endif rnn->rnn_desc(), seq_length, rnn->x_descs(), @@ -158,17 +154,7 @@ void RnnKernel(const Context &dev_ctx, rnn_mode = miopenRNNRELU; else if (mode == "RNN_TANH") rnn_mode = miopenRNNTANH; -#elif defined(PADDLE_WITH_MUSA) - gpuRNNMode_t rnn_mode = MUDNN_LSTM; - if (mode == "LSTM") - rnn_mode = MUDNN_LSTM; - else if (mode == "GRU") - rnn_mode = MUDNN_GRU; - else if (mode == "RNN_RELU") - rnn_mode = MUDNN_RNN_RELU; - else if (mode == "RNN_TANH") - rnn_mode = MUDNN_RNN_TANH; -#else +#elif defined(PADDLE_WITH_CUDA) gpuRNNMode_t rnn_mode = CUDNN_LSTM; if (mode == "LSTM") rnn_mode = CUDNN_LSTM; @@ -202,9 +188,7 @@ void RnnKernel(const Context &dev_ctx, T *last_c_data = nullptr; #ifdef PADDLE_WITH_HIP if (rnn_mode == miopenLSTM) { -#elif defined(PADDLE_WITH_MUSA) - if (rnn_mode == MUDNN_LSTM) { -#else +#elif defined(PADDLE_WITH_CUDA) if (rnn_mode == CUDNN_LSTM) { #endif init_c_data = pre_state[1]->data(); @@ -347,13 +331,9 @@ void RnnKernel(const Context &dev_ctx, workspace_size, reserve_data, reserve_size)); -#else +#elif defined(PADDLE_WITH_CUDA) PADDLE_ENFORCE_GPU_SUCCESS( -#ifdef PADDLE_WITH_MUSA - phi::dynload::mudnnRNNForwardTraining(handle, -#else phi::dynload::cudnnRNNForwardTraining(handle, -#endif rnn.rnn_desc(), seq_length, rnn.x_descs(), diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index 7dde591c950426..681af7a28d1d8a 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -140,10 +140,7 @@ void ConvCudnnGradKernelImplV7( #ifdef PADDLE_WITH_HIP SearchResult bwd_result; SearchResult filter_result; -#elif defined(PADDLE_WITH_MUSA) - SearchResult bwd_result; - SearchResult filter_result; -#else +#elif defined(PADDLE_WITH_CUDA) SearchResult bwd_result; SearchResult filter_result; #endif @@ -151,7 +148,7 @@ void ConvCudnnGradKernelImplV7( int iwo_groups = groups; int c_groups = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) iwo_groups = 1; c_groups = groups; groups = 1; @@ -1009,7 +1006,7 @@ void 
ConvCudnnGradGradKernel( int iwo_group = groups; int c_group = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) iwo_group = 1; c_group = groups; groups = 1; @@ -1066,12 +1063,7 @@ void ConvCudnnGradGradKernel( SearchResult fwd_result2; SearchResult data_result; SearchResult filter_result; -#elif defined(PADDLE_WITH_MUSA) - SearchResult fwd_result1; - SearchResult fwd_result2; - SearchResult data_result; - SearchResult filter_result; -#else +#elif defined(PADDLE_WITH_CUDA) SearchResult fwd_result1; SearchResult fwd_result2; SearchResult data_result; diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index 2193050bdc909f..a78b61690259ee 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -153,12 +153,7 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, workspace_size = search::GetWorkspaceSize(args); fwd_result.algo = search::Find( args, exhaustive_search, deterministic, workspace_size, ctx); -#elif defined(PADDLE_WITH_MUSA) - SearchResult fwd_result; - using search = SearchAlgorithm; - fwd_result = search::Find(ctx, args, exhaustive_search, deterministic); - workspace_size = fwd_result.workspace_size; -#else +#elif defined(PADDLE_WITH_CUDA) SearchResult fwd_result; using search = SearchAlgorithm; fwd_result = search::Find(ctx, args, exhaustive_search, deterministic); @@ -370,7 +365,7 @@ void ConvCudnnKernel(const Context& ctx, const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); auto dtype = phi::backends::gpu::CudnnDataType::type; -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_HIP) // HIP MIOPEN ONLY SUPPORT NCHW format auto compute_format = phi::backends::gpu::DataLayout::kNCHW; #else diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu index bcfd4aa1e6d3bc..8e71d9f619a5f5 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu @@ -613,7 +613,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( int iwo_group = groups; int c_group = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) iwo_group = 1; c_group = groups; groups = 1; @@ -670,12 +670,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( SearchResult bwd_result2; SearchResult filter_result; SearchResult fwd_result; -#elif defined(PADDLE_WITH_MUSA) - SearchResult bwd_result1; - SearchResult bwd_result2; - SearchResult filter_result; - SearchResult fwd_result; -#else +#elif defined(PADDLE_WITH_CUDA) SearchResult bwd_result1; SearchResult bwd_result2; SearchResult filter_result; diff --git a/paddle/phi/kernels/impl/matmul_kernel_impl.h b/paddle/phi/kernels/impl/matmul_kernel_impl.h index f9f861b67c740f..b88c1dffdd0185 100644 --- a/paddle/phi/kernels/impl/matmul_kernel_impl.h +++ b/paddle/phi/kernels/impl/matmul_kernel_impl.h @@ -128,17 +128,16 @@ void MatMulFunctionImplWithBlas( VLOG(3) << "MatMul's case 1"; Out->Resize(phi::make_ddim({})); dev_ctx.template Alloc(Out); - // TODO(@caizhi): enable it - //blas.GEMM(CblasNoTrans, - // CblasTrans, - // 1, - // 1, - // M, - // static_cast(1), - // y_data, - // x_data, - // static_cast(flag), - // dev_ctx.template Alloc(Out)); + blas.GEMM(CblasNoTrans, + CblasTrans, + 1, + 1, + M, 
+ static_cast(1), + y_data, + x_data, + static_cast(flag), + dev_ctx.template Alloc(Out)); return; } @@ -179,45 +178,42 @@ void MatMulFunctionImplWithBlas( if (trans_y) { const int M = Y.numel() / N; VLOG(3) << "MatMul's case 2"; - // TODO(@caizhi): enable it - //blas.GEMV(false, - // M, - // N, - // static_cast(1), - // y_data, - // x_data, - // static_cast(flag), - // dev_ctx.template Alloc(Out)); + blas.GEMV(false, + M, + N, + static_cast(1), + y_data, + x_data, + static_cast(flag), + dev_ctx.template Alloc(Out)); } else { const int M = y_dims[y_ndim - 1]; const int batch_size = Y.numel() / (M * N); if (batch_size == 1) { VLOG(3) << "MatMul's case 3"; - // TODO(@caizhi): enable it - //blas.GEMV(true, - // N, - // M, - // static_cast(1), - // y_data, - // x_data, - // static_cast(flag), - // dev_ctx.template Alloc(Out)); + blas.GEMV(true, + N, + M, + static_cast(1), + y_data, + x_data, + static_cast(flag), + dev_ctx.template Alloc(Out)); } else { VLOG(3) << "MatMul's case 4"; - // TODO(@caizhi): enable it - //blas.BatchedGEMM(CblasTrans, - // CblasNoTrans, - // M, - // 1, - // N, - // static_cast(1), - // y_data, - // x_data, - // static_cast(flag), - // dev_ctx.template Alloc(Out), - // batch_size, - // M * N, - // 0); + blas.BatchedGEMM(CblasTrans, + CblasNoTrans, + M, + 1, + N, + static_cast(1), + y_data, + x_data, + static_cast(flag), + dev_ctx.template Alloc(Out), + batch_size, + M * N, + 0); } } return; @@ -263,44 +259,41 @@ void MatMulFunctionImplWithBlas( const int batch_size = X.numel() / (M * N); if (batch_size == 1) { VLOG(3) << "MatMul's case 5"; - // TODO(@caizhi): enable it - //blas.GEMV(true, - // N, - // M, - // static_cast(1), - // x_data, - // y_data, - // static_cast(flag), - // dev_ctx.template Alloc(Out)); + blas.GEMV(true, + N, + M, + static_cast(1), + x_data, + y_data, + static_cast(flag), + dev_ctx.template Alloc(Out)); } else { VLOG(3) << "MatMul's case 6"; - // TODO(@caizhi): enable it - //blas.BatchedGEMM(CblasTrans, - // CblasNoTrans, - // M, - // 1, - // N, - // static_cast(1), - // x_data, - // y_data, - // static_cast(flag), - // dev_ctx.template Alloc(Out), - // batch_size, - // M * N, - // 0); + blas.BatchedGEMM(CblasTrans, + CblasNoTrans, + M, + 1, + N, + static_cast(1), + x_data, + y_data, + static_cast(flag), + dev_ctx.template Alloc(Out), + batch_size, + M * N, + 0); } } else { const int M = X.numel() / N; VLOG(3) << "MatMul's case 7"; - // TODO(@caizhi): enable it - //blas.GEMV(false, - // M, - // N, - // static_cast(1), - // x_data, - // y_data, - // static_cast(flag), - // dev_ctx.template Alloc(Out)); + blas.GEMV(false, + M, + N, + static_cast(1), + x_data, + y_data, + static_cast(flag), + dev_ctx.template Alloc(Out)); } return; } @@ -374,93 +367,87 @@ void MatMulFunctionImplWithBlas( if (out_batch_size == 0) return; if (x_batch_size == 1 && y_batch_size == 1) { VLOG(3) << "MatMul's case 8"; - // TODO(@caizhi): enable it - //blas.GEMM(trans_x ? CblasTrans : CblasNoTrans, - // trans_y ? CblasTrans : CblasNoTrans, - // M, - // N, - // K, - // static_cast(1), - // x_data, - // y_data, - // static_cast(flag), - // dev_ctx.template Alloc(Out)); + blas.GEMM(trans_x ? CblasTrans : CblasNoTrans, + trans_y ? 
CblasTrans : CblasNoTrans, + M, + N, + K, + static_cast(1), + x_data, + y_data, + static_cast(flag), + dev_ctx.template Alloc(Out)); } else if (x_batch_size == 1) { if (M == 1 && trans_y) { VLOG(3) << "MatMul's case 9"; - // TODO(@caizhi): enable it - //blas.GEMV(false, - // y_batch_size * N, - // K, - // static_cast(1), - // y_data, - // x_data, - // static_cast(flag), - // dev_ctx.template Alloc(Out)); + blas.GEMV(false, + y_batch_size * N, + K, + static_cast(1), + y_data, + x_data, + static_cast(flag), + dev_ctx.template Alloc(Out)); } else { VLOG(3) << "MatMul's case 10"; - // TODO(@caizhi): enable it - //blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, - // trans_y ? CblasTrans : CblasNoTrans, - // M, - // N, - // K, - // static_cast(1), - // x_data, - // y_data, - // static_cast(flag), - // dev_ctx.template Alloc(Out), - // out_batch_size, - // 0, - // K * N); + blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, + trans_y ? CblasTrans : CblasNoTrans, + M, + N, + K, + static_cast(1), + x_data, + y_data, + static_cast(flag), + dev_ctx.template Alloc(Out), + out_batch_size, + 0, + K * N); } } else if (y_batch_size == 1) { if (!trans_x) { VLOG(3) << "MatMul's case 11"; - // TODO(@caizhi): enable it - //blas.GEMM(CblasNoTrans, - // trans_y ? CblasTrans : CblasNoTrans, - // x_batch_size * M, - // N, - // K, - // static_cast(1), - // x_data, - // y_data, - // static_cast(flag), - // dev_ctx.template Alloc(Out)); + blas.GEMM(CblasNoTrans, + trans_y ? CblasTrans : CblasNoTrans, + x_batch_size * M, + N, + K, + static_cast(1), + x_data, + y_data, + static_cast(flag), + dev_ctx.template Alloc(Out)); } else { VLOG(3) << "MatMul's case 12"; - // TODO(@caizhi): enable it - //blas.BatchedGEMM(CblasTrans, - // trans_y ? CblasTrans : CblasNoTrans, - // M, - // N, - // K, - // static_cast(1), - // x_data, - // y_data, - // static_cast(flag), - // dev_ctx.template Alloc(Out), - // out_batch_size, - // M * K, - // 0); + blas.BatchedGEMM(CblasTrans, + trans_y ? CblasTrans : CblasNoTrans, + M, + N, + K, + static_cast(1), + x_data, + y_data, + static_cast(flag), + dev_ctx.template Alloc(Out), + out_batch_size, + M * K, + 0); } } else if (!is_broadcast_dims) { VLOG(3) << "MatMul's case 13"; - // TODO(@caizhi): enable it - //blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, - // trans_y ? CblasTrans : CblasNoTrans, - // M, - // N, - // K, - // static_cast(1), - // x_data, - // y_data, - // static_cast(flag), - // dev_ctx.template Alloc(Out), - // out_batch_size, - // M * K, - // K * N); + blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, + trans_y ? CblasTrans : CblasNoTrans, + M, + N, + K, + static_cast(1), + x_data, + y_data, + static_cast(flag), + dev_ctx.template Alloc(Out), + out_batch_size, + M * K, + K * N); } else { // in the case, can't use stridedgemm std::vector x_ptr(out_batch_size); @@ -480,18 +467,17 @@ void MatMulFunctionImplWithBlas( IndexIncreaseFromDims(batch_dim, out_broadcast_dims.data(), index.data()); } VLOG(3) << "MatMul's case 14"; - // TODO(@caizhi): enable it - //blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, - // trans_y ? CblasTrans : CblasNoTrans, - // M, - // N, - // K, - // static_cast(1), - // x_ptr.data(), - // y_ptr.data(), - // static_cast(flag), - // out_ptr.data(), - // out_batch_size); + blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, + trans_y ? 
CblasTrans : CblasNoTrans,
+                     M,
+                     N,
+                     K,
+                     static_cast<T>(1),
+                     x_ptr.data(),
+                     y_ptr.data(),
+                     static_cast<T>(flag),
+                     out_ptr.data(),
+                     out_batch_size);
   }
 }
 
@@ -913,8 +899,8 @@ struct MatMulDispatcher {
                    bool trans_x,
                    bool trans_y,
                    bool flag = false) {
-    //MatMulFunctionImplWithBlas<Context, T>(
-    //    ctx, x, y, x_dims, y_dims, out, trans_x, trans_y, flag);
+    MatMulFunctionImplWithBlas<Context, T>(
+        ctx, x, y, x_dims, y_dims, out, trans_x, trans_y, flag);
   }
 };
 
@@ -997,9 +983,8 @@ void MatmulKernel(const Context& ctx,
                           " but reviced dims size is 0. "));
   const std::vector<std::int64_t> x_dims = vectorize(x.dims());
   const std::vector<std::int64_t> y_dims = vectorize(y.dims());
-  // TODO(@caizhi): enable it
-  //MatMulFunction<Context, T>(
-  //    ctx, x, y, x_dims, y_dims, out, transpose_x, transpose_y);
+  MatMulFunction<Context, T>(
+      ctx, x, y, x_dims, y_dims, out, transpose_x, transpose_y);
 }
 
 template <typename T, typename Context>
 void MatmulWithFlattenKernel(const Context& dev_ctx,
@@ -1022,9 +1007,7 @@ void MatmulWithFlattenKernel(const Context& dev_ctx,
 
   auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
 
-  (void)blas;
-  // TODO(@caizhi): enable it
-  //blas.MatMul(x_matrix, y_matrix, out);
+  blas.MatMul(x_matrix, y_matrix, out);
   if (z_dim.size() != 2) {
     out->Resize(z_dim);
   }
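
Note on the version check re-enabled in gpu_resources.cc: until dynload::mudnnGetVersion() is available, mudnn_dso_ver is pinned to 1100, and the PADDLE_ENFORCE_EQ only raises InvalidArgument when both the local MUSA major version and the local muDNN major version are older than the ones Paddle was compiled against. The standalone C++ sketch below only illustrates that version-encoding arithmetic; the sample numbers and the local constants standing in for the MUSA_VERSION and MUDNN_VERSION macros are assumptions chosen for illustration, not values from any particular toolkit release.

// Standalone illustration of the version arithmetic used by the check in
// paddle/phi/backends/gpu/gpu_resources.cc. Not part of the patch.
#include <cstdio>

int main() {
  // Assumed stand-ins for the build-time macros (illustrative values only).
  const int kMusaVersion = 11020;   // encodes MUSA 11.2, CUDA_VERSION style
  const int kMudnnVersion = 1100;   // encodes muDNN 1.1

  // Same encoding as the patch: major * 10 + minor, so 11020 -> 112.
  const int compile_musa_version =
      (kMusaVersion / 1000) * 10 + (kMusaVersion % 100) / 10;

  // Runtime values; the patch currently pins mudnn_dso_ver to 1100 until
  // dynload::mudnnGetVersion() can be called.
  const int local_musa_version = 110;  // assume the runtime reports MUSA 11.0
  const int mudnn_dso_ver = 1100;      // pinned value from the patch

  // The enforce only fires when BOTH majors on the machine are older than
  // the majors Paddle was compiled with.
  const bool incompatible =
      (local_musa_version / 10 < compile_musa_version / 10) &&
      (mudnn_dso_ver / 1000 < kMudnnVersion / 1000);

  std::printf(
      "compiled MUSA %d.%d / muDNN %d.%d, local MUSA %d.%d / muDNN %d.%d -> %s\n",
      compile_musa_version / 10, compile_musa_version % 10,
      kMudnnVersion / 1000, (kMudnnVersion % 1000) / 100,
      local_musa_version / 10, local_musa_version % 10,
      mudnn_dso_ver / 1000, (mudnn_dso_ver % 1000) / 100,
      incompatible ? "incompatible" : "ok");
  return 0;
}

With these sample values the MUSA majors match (11 vs 11), so the hard check stays quiet even though the minor versions differ; that case instead falls through to the warning-only branch that follows it in the patch.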