[PHI decoupling] remove "gpu_primitives.h" in fluid #48063

Merged: 5 commits, Nov 18, 2022
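This PR is one step of the PHI decoupling: every fluid file touched below stops including `paddle/fluid/platform/device/gpu/gpu_primitives.h` and includes `paddle/phi/backends/gpu/gpu_primitives.h` instead, retargeting `PADDLE_CUDA_NUM_THREADS`, `CudaAtomicAdd`, and `fastAtomicAdd` from `paddle::platform` to `phi`. A minimal before/after sketch of the pattern, assuming a CUDA build of Paddle (the kernel and its names are illustrative, not code from this PR):

```cpp
// After this PR: pull GPU primitives from phi instead of fluid/platform.
// Old include: "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"

// Old: platform::PADDLE_CUDA_NUM_THREADS
const int CUDA_NUM_THREADS = phi::PADDLE_CUDA_NUM_THREADS;

// Hypothetical kernel, shown only to illustrate the namespace change.
__global__ void AccumulateKernel(float* sum, const float* src, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // Old: paddle::platform::CudaAtomicAdd(sum, src[i]);
    phi::CudaAtomicAdd(sum, src[i]);
  }
}
```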
41 changes: 20 additions & 21 deletions paddle/fluid/framework/fleet/heter_ps/feature_value.cu
@@ -13,12 +13,12 @@ limitations under the License. */

#ifdef PADDLE_WITH_HETERPS
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"

namespace paddle {
namespace framework {

-const int CUDA_NUM_THREADS = platform::PADDLE_CUDA_NUM_THREADS;
+const int CUDA_NUM_THREADS = phi::PADDLE_CUDA_NUM_THREADS;
#define GET_BLOCK(N) ((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS)
#define CUDA_BLOCK(N) GET_BLOCK(N), CUDA_NUM_THREADS, 0

@@ -45,7 +45,7 @@ __global__ void PullCopy(float** dest,
int x = low;
int y = i - (x ? len[x - 1] : 0);
float* feature_value_ptr =
-(float*)((char*)src + uint64_t(i) * uint64_t(max_val_size));
+(float*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); // NOLINT
int mf_dim = gpu_dim[x] - 3;
gpu_accessor.Select(
dest[x] + y * (mf_dim + 3), feature_value_ptr, keys[x] + y, mf_dim);
@@ -79,7 +79,7 @@ __global__ void PullDedupCopy(const size_t N,
return;
}

-float* src_ptr = (float*)((char*)src + uint64_t(restore_idx[i]) *
+float* src_ptr = (float*)((char*)src + uint64_t(restore_idx[i]) * // NOLINT
uint64_t(max_val_size));
switch (off) {
case 0:
@@ -125,9 +125,10 @@ __global__ void PushCopyWithPool(float* dest,
}
int x = low;
int y = i - (x ? len[low - 1] : 0);
-float* cur = (float*)((char*)dest + i * grad_value_size);
+float* cur = (float*)((char*)dest + i * grad_value_size); // NOLINT

-cur[gpu_accessor.common_push_value.SlotIndex()] = (float)slot_vector[x];
+cur[gpu_accessor.common_push_value.SlotIndex()] =
+    (float)slot_vector[x]; // NOLINT
int mf_dim = mf_dim_vector[x];
cur[gpu_accessor.common_push_value.MfDimIndex()] = mf_dim;

@@ -170,31 +171,29 @@ __global__ void PushMergeCopyAtomic(const size_t N,
int y = i - slot_lens[x];

const float* ptr = src[x] + y * hidden;
-float* cur = (float*)((char*)dest + d_restore_idx[i] * grad_value_size);
+float* cur =
+    (float*)((char*)dest + d_restore_idx[i] * grad_value_size); // NOLINT
int mf_dim = slot_dims[x] - 3;
switch (off) {
case 0:
-cur[accessor.SlotIndex()] = (float)slot_vector[x];
+cur[accessor.SlotIndex()] = (float)slot_vector[x]; // NOLINT
cur[accessor.MfDimIndex()] = mf_dim;
-paddle::platform::CudaAtomicAdd(&cur[accessor.ShowIndex()],
-                                *(ptr + off));
+phi::CudaAtomicAdd(&cur[accessor.ShowIndex()], *(ptr + off));
break;
case 1:
-paddle::platform::CudaAtomicAdd(&cur[accessor.ClickIndex()],
-                                *(ptr + off));
+phi::CudaAtomicAdd(&cur[accessor.ClickIndex()], *(ptr + off));
break;
case 2:
-paddle::platform::CudaAtomicAdd(&cur[accessor.EmbedGIndex()],
-                                *(ptr + off) * -1. * bs);
+phi::CudaAtomicAdd(&cur[accessor.EmbedGIndex()],
+                   *(ptr + off) * -1. * bs);
break;
default:
int embedx_idx = off - 3;
if (mf_dim < embedx_idx) {
return;
}
-paddle::platform::CudaAtomicAdd(
-    &cur[accessor.EmbedxGIndex() + embedx_idx],
-    *(ptr + off) * -1. * bs);
+phi::CudaAtomicAdd(&cur[accessor.EmbedxGIndex() + embedx_idx],
+                   *(ptr + off) * -1. * bs);
break;
}
}
@@ -228,7 +227,7 @@ __global__ void PushMergeCopy(const size_t N,
int i = idx / hidden;
int off = idx % hidden;
// filter 0 keys
-float* cur = (float*)((char*)dest + i * grad_value_size);
+float* cur = (float*)((char*)dest + i * grad_value_size); // NOLINT

if (total_keys[i] == 0) {
switch (off) {
@@ -262,7 +261,7 @@

switch (off) {
case 0:
-cur[accessor.SlotIndex()] = (float)slot_vector[x];
+cur[accessor.SlotIndex()] = (float)slot_vector[x]; // NOLINT
cur[accessor.MfDimIndex()] = mf_dim;
SUM_GRAD_VALUE
cur[accessor.ShowIndex()] = val;
@@ -331,8 +330,8 @@ void AccessorWrapper<GPUAccessor>::CopyForPushImpl(
const uint64_t total_length,
const int batch_size,
size_t grad_value_size,
-std::vector<int>& slot_vector,
-std::vector<int>& slot_mf_dim_vector) {
+std::vector<int>& slot_vector,        // NOLINT
+std::vector<int>& slot_mf_dim_vector) {  // NOLINT
auto stream = dynamic_cast<phi::GPUContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place))
->stream();
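Worth noting for reviewers: the `GET_BLOCK`/`CUDA_BLOCK` helpers kept at the top of this file do a ceiling division so every element gets a thread, and `CUDA_BLOCK(N)` expands to the first three arguments of a `<<<grid, block, shared_mem>>>` launch. A sketch of how a call site uses them, assuming the same macros (the kernel and launch below are illustrative, not code from this PR):

```cpp
// Same helpers as feature_value.cu; CUDA_BLOCK(N) supplies grid size,
// block size, and shared-memory bytes for a <<<...>>> launch.
const int CUDA_NUM_THREADS = phi::PADDLE_CUDA_NUM_THREADS;
#define GET_BLOCK(N) ((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS)
#define CUDA_BLOCK(N) GET_BLOCK(N), CUDA_NUM_THREADS, 0

// Hypothetical kernel, shown only to illustrate the macro expansion.
__global__ void ScaleKernel(float* data, float factor, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

void Scale(float* data, float factor, int n, cudaStream_t stream) {
  // Expands to ScaleKernel<<<GET_BLOCK(n), CUDA_NUM_THREADS, 0, stream>>>(...)
  ScaleKernel<<<CUDA_BLOCK(n), stream>>>(data, factor, n);
}
```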
4 changes: 2 additions & 2 deletions paddle/fluid/framework/fleet/ps_gpu_wrapper.cu
@@ -22,12 +22,12 @@ limitations under the License. */
#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"

namespace paddle {
namespace framework {

-const int CUDA_NUM_THREADS = platform::PADDLE_CUDA_NUM_THREADS;
+const int CUDA_NUM_THREADS = phi::PADDLE_CUDA_NUM_THREADS;
#define GET_BLOCK(N) ((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS)
#define CUDA_BLOCK(N) GET_BLOCK(N), CUDA_NUM_THREADS, 0

@@ -20,8 +20,8 @@

#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"

#include "paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h"
#include "paddle/fluid/operators/fused_token_prune_op.cu.h"
@@ -149,7 +149,7 @@ __global__ void ReduceSum2<half>(
}

if (tid == 0) {
-platform::fastAtomicAdd<platform::float16>(
+phi::fastAtomicAdd<platform::float16>(
reinterpret_cast<platform::float16*>(dst),
static_cast<size_t>(batch * max_seq_len + col),
static_cast<size_t>(bsz * max_seq_len),
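The `ReduceSum2<half>` specialization above switches to `phi::fastAtomicAdd`, the `float16` helper that takes the element index and total element count alongside the value; knowing the buffer bounds lets it widen the add to an aligned 32-bit atomic where possible instead of a per-half CAS loop. A hedged sketch of that four-argument call shape, assuming the signature seen in the hunk (the kernel and buffer names are invented):

```cpp
// Sketch only: per-thread float16 accumulation into a flat buffer,
// mirroring the (array, index, numel, value) call above.
__global__ void AccumulateHalf(paddle::platform::float16* dst,
                               const paddle::platform::float16* src,
                               size_t numel) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < numel) {
    phi::fastAtomicAdd<paddle::platform::float16>(dst, i, numel, src[i]);
  }
}
```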
2 changes: 1 addition & 1 deletion paddle/fluid/operators/affine_channel_op.cu
@@ -23,7 +23,7 @@ namespace cub = hipcub;

#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"

namespace paddle {
namespace operators {
4 changes: 2 additions & 2 deletions paddle/fluid/operators/assign_pos_op.cu
@@ -23,8 +23,8 @@ We retain the following license from the original files:

#include "paddle/fluid/operators/assign_pos_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"

DECLARE_bool(avoid_op_randomness);

@@ -47,7 +47,7 @@ __global__ void AssignPos(T* cum_count,
CUDA_KERNEL_LOOP(i, limit) {
int number_idx = numbers[i];
if (number_idx > -1) {
-int p = platform::CudaAtomicAdd(cum_count + number_idx, -1);
+int p = phi::CudaAtomicAdd(cum_count + number_idx, -1);
out[p - 1] = i;
}
}
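The `AssignPos` change above is a good example of why the atomic's return value matters: `CudaAtomicAdd` returns the value held before the add, so atomically decrementing `cum_count[number_idx]` hands each thread a unique slot `p - 1` to write into. A minimal sketch of the same slot-reservation pattern (names are illustrative):

```cpp
// Each thread reserves a unique output slot by atomically decrementing
// a shared cursor; the returned OLD value identifies the slot.
__global__ void CompactPositive(const int* numbers, int* cursor,
                                int64_t* out, int limit) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < limit && numbers[i] > -1) {
    int p = phi::CudaAtomicAdd(cursor, -1);  // old value before the -1
    out[p - 1] = i;                          // race-free: p is unique
  }
}
```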
2 changes: 1 addition & 1 deletion paddle/fluid/operators/batch_fc_op.cu
@@ -17,7 +17,7 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/batch_fc_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"

namespace paddle {
2 changes: 1 addition & 1 deletion paddle/fluid/operators/bilateral_slice_op.cu
@@ -14,7 +14,7 @@

#include "paddle/fluid/operators/bilateral_slice_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"

namespace paddle {
namespace operators {
6 changes: 3 additions & 3 deletions paddle/fluid/operators/center_loss_op.cu
@@ -16,11 +16,11 @@ limitations under the License. */

#include "paddle/fluid/operators/center_loss_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
namespace paddle {
namespace operators {

-using platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;

template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
__global__ void ComputeDifferent(T *centers_diff,
@@ -75,7 +75,7 @@ __global__ void UpdateCenters(T *centers,
const T *diff = centers_diff + idy * D;
T *cent = centers + id * D;
for (int i = idx; i < D; i += BlockDimX) {
-paddle::platform::CudaAtomicAdd(&cent[i], alpha[0] * diff[i] / count);
+phi::CudaAtomicAdd(&cent[i], alpha[0] * diff[i] / count);
}
idy += BlockDimY * GridDimX;
}
5 changes: 2 additions & 3 deletions paddle/fluid/operators/collective/c_embedding_op.cu
@@ -16,8 +16,8 @@ limitations under the License. */
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"

namespace paddle {
namespace operators {
@@ -77,8 +77,7 @@ __global__ void CEmbeddingGrad(T *table,
auto id = ids[row];
if (id >= start_idx && id < end_idx) {
auto real_idx = id - start_idx;
-paddle::platform::CudaAtomicAdd(&table[real_idx * columns + col],
-                                output[i]);
+phi::CudaAtomicAdd(&table[real_idx * columns + col], output[i]);
}
}
}
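In `CEmbeddingGrad` above, the atomic is what makes the backward scatter safe: several rows of the output gradient can map to the same local table row (duplicate ids), so plain stores would lose updates. A minimal sketch of that shard-aware scatter-add (all names are illustrative, not code from this PR):

```cpp
// Scatter grad rows into the local shard [start_idx, end_idx) of an
// embedding table; atomics resolve duplicate ids hitting the same row.
template <typename T>
__global__ void ScatterAddRows(T* table, const T* grad, const int64_t* ids,
                               int64_t rows, int64_t columns,
                               int64_t start_idx, int64_t end_idx) {
  int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < rows * columns) {
    int64_t row = i / columns, col = i % columns;
    int64_t id = ids[row];
    if (id >= start_idx && id < end_idx) {
      phi::CudaAtomicAdd(&table[(id - start_idx) * columns + col], grad[i]);
    }
  }
}
```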
2 changes: 1 addition & 1 deletion paddle/fluid/operators/collective/c_split_op.cu
@@ -16,7 +16,7 @@ limitations under the License. */

#include "paddle/fluid/operators/collective/c_split_op.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"

namespace paddle {
namespace operators {
2 changes: 1 addition & 1 deletion paddle/fluid/operators/conv_shift_op.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/conv_shift_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/math_function.h"

namespace paddle {
4 changes: 2 additions & 2 deletions paddle/fluid/operators/cvm_op.cu
@@ -16,12 +16,12 @@ limitations under the License. */
#include "paddle/fluid/operators/cvm_op.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"

namespace paddle {
namespace operators {

-using platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;
using Tensor = phi::DenseTensor;
using LoDTensor = phi::DenseTensor;

4 changes: 2 additions & 2 deletions paddle/fluid/operators/data_norm_op.cu
@@ -17,7 +17,7 @@ limitations under the License. */

#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/operators/data_norm_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
@@ -29,7 +29,7 @@ namespace operators {
using Tensor = phi::DenseTensor;
using LoDTensor = phi::DenseTensor;
using DataLayout = phi::DataLayout;
-using platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;

inline int GET_BLOCKS(const int N) {
return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS;
28 changes: 12 additions & 16 deletions paddle/fluid/operators/deformable_psroi_pooling_op.cu
@@ -32,7 +32,7 @@
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/deformable_psroi_pooling_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/math_function.h"

@@ -41,7 +41,7 @@ namespace operators {

using Tensor = phi::DenseTensor;
using LoDTensor = phi::DenseTensor;
-using paddle::platform::PADDLE_CUDA_NUM_THREADS;
+using phi::PADDLE_CUDA_NUM_THREADS;

static inline int GET_BLOCKS(const int N) {
return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS;
@@ -447,18 +447,14 @@ __global__ void DeformablePSROIPoolBackwardAccKernel(

// compute gradient of input
if (bottom_data_diff) {
-platform::CudaAtomicAdd(
-    bottom_data_diff + bottom_index + y0 * width + x0,
-    q00 * diff_val);
-platform::CudaAtomicAdd(
-    bottom_data_diff + bottom_index + y1 * width + x0,
-    q01 * diff_val);
-platform::CudaAtomicAdd(
-    bottom_data_diff + bottom_index + y0 * width + x1,
-    q10 * diff_val);
-platform::CudaAtomicAdd(
-    bottom_data_diff + bottom_index + y1 * width + x1,
-    q11 * diff_val);
+phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y0 * width + x0,
+                   q00 * diff_val);
+phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y1 * width + x0,
+                   q01 * diff_val);
+phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y0 * width + x1,
+                   q10 * diff_val);
+phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y1 * width + x1,
+                   q11 * diff_val);
}

// compute gradient of trans
@@ -478,8 +474,8 @@
u00 * (1 - dist_x)) *
trans_std * diff_val;
diff_y *= roi_height;
-platform::CudaAtomicAdd(bottom_trans_diff + trans_index_x, diff_x);
-platform::CudaAtomicAdd(bottom_trans_diff + trans_index_y, diff_y);
+phi::CudaAtomicAdd(bottom_trans_diff + trans_index_x, diff_x);
+phi::CudaAtomicAdd(bottom_trans_diff + trans_index_y, diff_y);
}
}
}
3 changes: 2 additions & 1 deletion paddle/fluid/operators/dequantize_log_op.cu
@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/dequantize_log_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/kernels/funcs/math.h"

namespace paddle {
namespace operators {
2 changes: 1 addition & 1 deletion paddle/fluid/operators/detection/box_clip_op.cu
@@ -15,7 +15,7 @@ limitations under the License. */

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detection/box_clip_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/core/hostdevice.h"
#include "paddle/phi/kernels/funcs/math_function.h"

@@ -11,7 +11,7 @@ limitations under the License. */

#include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"

namespace paddle {
namespace operators {
4 changes: 2 additions & 2 deletions paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
@@ -26,8 +26,8 @@ namespace cub = hipcub;
#include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h"
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/kernels/funcs/gather.cu.h"

namespace paddle {
@@ -50,7 +50,7 @@ static __global__ void GetLengthLoD(const int nthreads,
const int* batch_ids,
int* length_lod) {
CUDA_KERNEL_LOOP(i, nthreads) {
-platform::CudaAtomicAdd(length_lod + batch_ids[i], 1);
+phi::CudaAtomicAdd(length_lod + batch_ids[i], 1);
}
}
