Move reduceSum to elementwise_op_function.h #9773

Merged

Changes from 1 commit
47 changes: 24 additions & 23 deletions paddle/fluid/operators/elementwise_op_function.h
@@ -13,14 +13,15 @@ See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
+#include <algorithm>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/transform.h"

#ifdef __NVCC__
#include <thrust/iterator/iterator_adaptor.h>
-#include "paddle/fluid/platform/cuda_helper.h"
+#include "paddle/fluid/platform/cuda_device_functions.h"
constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
#endif

@@ -43,35 +44,35 @@ namespace operators {
 */
inline void get_mid_dims(const framework::DDim& x_dims,
                         const framework::DDim& y_dims, const int axis,
-                         int& pre, int& n, int& post) {
-  pre = 1;
-  n = 1;
-  post = 1;
+                         int* pre, int* n, int* post) {
+  *pre = 1;
+  *n = 1;
+  *post = 1;
  for (int i = 0; i < axis; ++i) {
-    pre *= x_dims[i];
+    (*pre) *= x_dims[i];
  }

  for (int i = 0; i < y_dims.size(); ++i) {
    PADDLE_ENFORCE_EQ(x_dims[i + axis], y_dims[i],
                      "Broadcast dimension mismatch.");
-    n *= y_dims[i];
+    (*n) *= y_dims[i];
  }

  for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
-    post *= x_dims[i];
+    (*post) *= x_dims[i];
  }
}

-inline void trim_trailing_singular_dims(framework::DDim& dims) {
+inline void trim_trailing_singular_dims(framework::DDim* dims) {
  // Remove trailing dimensions of size 1 for y
-  auto actual_dims_size = dims.size();
+  auto actual_dims_size = dims->size();
  for (; actual_dims_size != 0; --actual_dims_size) {
-    if (dims[actual_dims_size - 1] != 1) break;
+    if ((*dims)[actual_dims_size - 1] != 1) break;
  }
-  if (actual_dims_size != dims.size()) {
-    auto actual_dims = framework::vectorize(dims);
+  if (actual_dims_size != dims->size()) {
+    auto actual_dims = framework::vectorize(*dims);
    actual_dims.resize(actual_dims_size);
-    dims = framework::make_ddim(actual_dims);
+    *dims = framework::make_ddim(actual_dims);
  }
}

@@ -159,7 +160,7 @@ class RowwiseTransformIterator<T, platform::CUDADeviceContext>
      RowwiseTransformIterator<T, platform::CUDADeviceContext>, const T*>
      super_t;
  HOSTDEVICE RowwiseTransformIterator(const T* x, int n)
-      : super_t(x), begin_(x), n_(n){};
+      : super_t(x), begin_(x), n_(n) {}
  friend class thrust::iterator_core_access;

 private:
@@ -179,7 +180,7 @@ class MidWiseTransformIterator<T, platform::CUDADeviceContext>
      MidWiseTransformIterator<T, platform::CUDADeviceContext>, const T*>
      super_t;
  HOSTDEVICE MidWiseTransformIterator(const T* x, int n, int post)
-      : super_t(x), begin_(x), n_(n), post_(post){};
+      : super_t(x), begin_(x), n_(n), post_(post) {}
  friend class thrust::iterator_core_access;

 private:
@@ -472,11 +473,11 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
  auto y_dim = y.dims();

  axis = (axis == -1 ? x_dim.size() - y_dim.size() : axis);
-  trim_trailing_singular_dims(y_dim);
+  trim_trailing_singular_dims(&y_dim);
  axis = (y_dim.size() == 0) ? x_dim.size() : axis;

  int pre, n, post;
-  get_mid_dims(x_dim, y_dim, axis, pre, n, post);
+  get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post);
  if (post == 1) {
    int h = pre;
    int w = n;
@@ -514,7 +515,7 @@ void ElemwiseGradCompute(const framework::ExecutionContext& ctx,
      }
    }
  }
-};
+}

template <typename DeviceContext, typename T, typename functor,
typename broadcastfunctor, typename broadcast2functor>
@@ -543,11 +544,11 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
  }

  axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
-  trim_trailing_singular_dims(y_dims);
+  trim_trailing_singular_dims(&y_dims);
  axis = (y_dims.size() == 0) ? x_dims.size() : axis;

  int pre, n, post;
-  get_mid_dims(x_dims, y_dims, axis, pre, n, post);
+  get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);

  if (post == 1) {
    broadcastfunctor f;
@@ -582,11 +583,11 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx,
  axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
  PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
                 "Axis should be in range [0, x_dims)");
-  trim_trailing_singular_dims(y_dims);
+  trim_trailing_singular_dims(&y_dims);
  axis = (y_dims.size() == 0) ? x_dims.size() : axis;

  int pre, n, post;
-  get_mid_dims(x_dims, y_dims, axis, pre, n, post);
+  get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
  if (post == 1) {
    functor.RunRowWise(n, pre);
    return;
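Note on the signature changes above: passing `pre`, `n`, `post`, and `dims` by pointer instead of by non-const reference follows the Google C++ style rule that output parameters be pointers, so the writes are visible at each call site. Below is a minimal sketch of the new calling convention; the shapes, the expected values, and the `BroadcastDimsExample` wrapper are illustrative and not part of this PR.

```cpp
#include "paddle/fluid/operators/elementwise_op_function.h"

// Hypothetical caller (not in the PR): broadcast y over x starting at axis 1.
void BroadcastDimsExample() {
  namespace f = paddle::framework;

  f::DDim x_dims = f::make_ddim({2, 3, 4, 5});
  f::DDim y_dims = f::make_ddim({3, 4, 1});  // trailing 1 will be trimmed

  paddle::operators::trim_trailing_singular_dims(&y_dims);  // y_dims is now {3, 4}

  int pre, n, post;
  // Outputs are now passed by pointer, so the mutation is explicit here.
  paddle::operators::get_mid_dims(x_dims, y_dims, /*axis=*/1, &pre, &n, &post);
  // pre == 2 (x dims before axis), n == 12 (3 * 4), post == 5 (x dims after y).
}
```

This mirrors how `ElemwiseGradCompute`, `ElementwiseGradCompute`, and `ElementwiseComputeEx` invoke the two helpers in the hunks above.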
70 changes: 70 additions & 0 deletions paddle/fluid/platform/cuda_device_functions.h
@@ -0,0 +1,70 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <cuda.h>
+
+namespace paddle {
+namespace platform {
+
+// __shfl_down has been deprecated as of CUDA 9.0.
+#if CUDA_VERSION < 9000
+template <typename T>
+__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
+  return __shfl_down(val, delta);
+}
+#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
+#else
+#define FULL_WARP_MASK 0xFFFFFFFF
+#define CREATE_SHFL_MASK(mask, predicate) \
+  mask = __ballot_sync(FULL_WARP_MASK, (predicate))
+#endif
+
+template <typename T>
+__device__ T reduceSum(T val, int tid, int len) {
+  // TODO(zcd): The warp size should be taken from the
+  // parameters of the GPU rather than hard-coded as 32.
+  // To make reduceSum more efficient, this implementation
+  // uses warp-level parallelism and assumes the warp size
+  // is 32, which may differ between GPUs, although most
+  // cards use a warp size of 32.
+  __shared__ T shm[32];
+  const int warpSize = 32;
+  unsigned mask = 0u;
+  CREATE_SHFL_MASK(mask, tid < len);
+
+  for (int offset = warpSize / 2; offset > 0; offset /= 2)
+    val += __shfl_down_sync(mask, val, offset);
+
+  if (tid < warpSize) shm[tid] = 0;
+
+  __syncthreads();
+
+  if (tid % warpSize == 0) {
+    shm[tid / warpSize] = val;
+  }
+
+  CREATE_SHFL_MASK(mask, tid < warpSize);
+
+  if (tid < warpSize) {
+    val = shm[tid];
+    for (int offset = warpSize / 2; offset > 0; offset /= 2)
+      val += __shfl_down_sync(mask, val, offset);
+  }
+
+  return val;
+}
+
+}  // namespace platform
+}  // namespace paddle
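The new header is a verbatim move of the shuffle shim and `reduceSum` out of `cuda_helper.h` (see the deletion below). As a hedged sketch of how the relocated `reduceSum` can be driven: the kernel here is not part of this PR, and it assumes a single block of at most `ELEMWISE_MAX_BLOCK_DIM` (1024) threads, i.e. at most 32 warps, which is what the 32-slot `shm` buffer above can hold.

```cpp
#include "paddle/fluid/platform/cuda_device_functions.h"

// Hypothetical single-block kernel (not in the PR): sums `len` floats.
__global__ void SumKernel(const float* data, int len, float* out) {
  int tid = threadIdx.x;
  // Threads beyond `len` contribute zero so every warp reduces safely.
  float val = (tid < len) ? data[tid] : 0.0f;
  val = paddle::platform::reduceSum(val, tid, len);
  if (tid == 0) *out = val;  // thread 0 ends up holding the full sum
}
```

Inside `reduceSum`, each warp first folds its own values with `__shfl_down_sync`, the warp leaders (`tid % warpSize == 0`) park their partial sums in `shm`, and the first warp then folds those partials, which is why only thread 0 holds the final result.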
48 changes: 0 additions & 48 deletions paddle/fluid/platform/cuda_helper.h
@@ -62,53 +62,5 @@ CUDA_ATOMIC_WRAPPER(Add, double) {
}
#endif

-// __shfl_down has been deprecated as of CUDA 9.0.
-#if CUDA_VERSION < 9000
-template <typename T>
-__forceinline__ __device__ T __shfl_down_sync(unsigned, T val, int delta) {
-  return __shfl_down(val, delta);
-}
-#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
-#else
-#define FULL_WARP_MASK 0xFFFFFFFF
-#define CREATE_SHFL_MASK(mask, predicate) \
-  mask = __ballot_sync(FULL_WARP_MASK, (predicate))
-#endif
-
-template <typename T>
-__device__ T reduceSum(T val, int tid, int len) {
-  // TODO(zcd): The warp size should be taken from the
-  // parameters of the GPU rather than hard-coded as 32.
-  // To make reduceSum more efficient, this implementation
-  // uses warp-level parallelism and assumes the warp size
-  // is 32, which may differ between GPUs, although most
-  // cards use a warp size of 32.
-  __shared__ T shm[32];
-  const int warpSize = 32;
-  unsigned mask = 0u;
-  CREATE_SHFL_MASK(mask, tid < len);
-
-  for (int offset = warpSize / 2; offset > 0; offset /= 2)
-    val += __shfl_down_sync(mask, val, offset);
-
-  if (tid < warpSize) shm[tid] = 0;
-
-  __syncthreads();
-
-  if (tid % warpSize == 0) {
-    shm[tid / warpSize] = val;
-  }
-
-  CREATE_SHFL_MASK(mask, tid < warpSize);
-
-  if (tid < warpSize) {
-    val = shm[tid];
-    for (int offset = warpSize / 2; offset > 0; offset /= 2)
-      val += __shfl_down_sync(mask, val, offset);
-  }
-
-  return val;
-}
-
}  // namespace platform
}  // namespace paddle
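The 48 lines deleted here are exactly the block added to `cuda_device_functions.h` above, so existing call sites only need to switch includes. For what the compatibility layer provides: on CUDA < 9.0 the shim maps `__shfl_down_sync(mask, ...)` onto the legacy `__shfl_down(...)` and `CREATE_SHFL_MASK` degenerates to `mask = 0u`, while on CUDA >= 9.0 the mask is a real `__ballot_sync` of the active lanes. A small hedged sketch; `warp_max` is a hypothetical helper, not part of the PR:

```cpp
#include "paddle/fluid/platform/cuda_device_functions.h"

// Hypothetical helper (not in the PR), written once against the CUDA 9
// "sync" shuffle API; the shim makes the same source build on CUDA 8.
__device__ float warp_max(float val, int tid, int len) {
  unsigned mask = 0u;
  CREATE_SHFL_MASK(mask, tid < len);  // ballot of active lanes on CUDA >= 9
  for (int offset = 16; offset > 0; offset /= 2)  // assumes warp size 32
    val = fmaxf(val, __shfl_down_sync(mask, val, offset));
  return val;  // lane 0 of each warp holds that warp's maximum
}
```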