Merge pull request apache#45 from antinucleon/master
SoftmaxGrad and Standalone BLAS
antinucleon committed Aug 22, 2015
2 parents 9819128 + 7d42e84 commit ccf4b93
Showing 6 changed files with 134 additions and 12 deletions.
1 change: 1 addition & 0 deletions make/mshadow.mk
@@ -47,6 +47,7 @@ endif
else
MSHADOW_CFLAGS += -DMSHADOW_USE_CBLAS=1 -DMSHADOW_USE_MKL=0
endif

ifeq ($(USE_BLAS), openblas)
MSHADOW_LDFLAGS += -lopenblas
else ifeq ($(USE_BLAS), atlas)
54 changes: 43 additions & 11 deletions mshadow/cuda/tensor_gpu-inl.cuh
@@ -32,7 +32,7 @@ const int kMaxGridNum = 65535;
/*! \brief suggested grid number for mapping kernel */
const int kBaseGridNum = 1024;
/*! \brief get the aligned stride for a given size in the x dimension */
inline index_t GetAlignStride(index_t xsize) {
if (xsize >= MSHADOW_MIN_PAD_RATIO * 32) {
return ((xsize + kMemUnit - 1) >> kMemUnitBits) << kMemUnitBits;
} else {
@@ -45,7 +45,7 @@ inline void CheckLaunchParam(dim3 dimGrid, dim3 dimBlock, const char *estr = "")
dimGrid.x > 65535 || dimGrid.y > 65535) {
fprintf(stderr, "%s[%u,%u,%u]:", estr, dimBlock.x, dimBlock.y, dimBlock.z);
utils::Error("too large launch parameter\n");
}
}
template<typename Saver, typename DstPlan,
typename Plan, int block_dim_bits>
@@ -68,7 +68,7 @@ __global__ void MapPlanKernel(DstPlan dst, index_t xstride,
template<typename Saver, int block_dim_bits, int grid_size,
typename DstPlan, typename Plan>
__global__ void MapPlanLargeKernel(DstPlan dst, index_t xstride,
Shape<2> dshape, const Plan exp, int repeat) {
for (int i = 0; i < repeat; ++i) {
MapPlanProc<Saver, DstPlan, Plan, block_dim_bits>
(dst, xstride, dshape, exp, blockIdx.x + i * grid_size);
@@ -83,7 +83,7 @@ inline void MapPlan(expr::Plan<DstExp, DType> dst,
const index_t xstride = GetAlignStride(dshape[1]);
const int num_block = (dshape[0] * xstride + kBaseThreadNum-1) / kBaseThreadNum;
dim3 dimBlock(kBaseThreadNum, 1, 1);

if (num_block < kMaxGridNum) {
dim3 dimGrid(num_block, 1, 1);
MapPlanKernel<Saver, kBaseThreadBits,
@@ -150,15 +150,15 @@ template<typename Saver, typename Reducer, int block_dim_bits,
__global__ void MapReduceKeepDim1Kernel(DstPlan dst, Plan plan, DType scale, Shape<4> pshape) {
const int block_size = 1 << block_dim_bits;
__shared__ DType s_rec[block_size];
const int c = blockIdx.x;
const index_t tot = pshape[3] * pshape[2] * pshape[0];

DType res; Reducer::SetInitValue(res);
for (index_t i_offset = 0; i_offset < tot; i_offset += block_size) {
index_t i = i_offset + threadIdx.x;
if (i < tot) {
const index_t x = i % pshape[3];
i /= pshape[3];
const index_t y = i % pshape[2];
const index_t n = i / pshape[2];
Reducer::Reduce(res, plan.Eval((n * pshape[1] + c) * pshape[2] + y, x));
@@ -186,14 +186,28 @@ inline void MapReduceKeepDim1(expr::Plan<DstExp, DType> dst,
<<<dimGrid, dimBlock, 0, stream>>>(dst, plan, scale, pshape);
}

template<int x_bits, typename DType, typename DstPlan, typename SrcPlan1, typename SrcPlan2>
__global__ void SoftmaxGradKernel(DstPlan dst, SrcPlan1 src, SrcPlan2 label, index_t xmax) {
  const unsigned x_size = 1 << x_bits;
  const int y = blockIdx.x;
  // label is a 1-D plan, so it is indexed as (0, y)
  const int k = static_cast<int>(label.Eval(0, y));
  // write the whole row, striding by the block size so rows wider
  // than one block are still covered
  for (unsigned x = threadIdx.x; x < xmax; x += x_size) {
    if (x == static_cast<unsigned>(k)) {
      dst.REval(y, x) = src.Eval(y, x) - 1.0f;
    } else {
      dst.REval(y, x) = src.Eval(y, x);
    }
  }
}
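For reference, the identity this kernel implements: with row-wise softmax output p = softmax(z) and ground-truth class k, the gradient of the cross-entropy loss L = -\log p_k with respect to the logits is

\frac{\partial L}{\partial z_i} = p_i - \delta_{ik},
\qquad p_i = \frac{e^{z_i}}{\sum_j e^{z_j}},

so every column of a row must be written: src minus one at the labelled column, and src unchanged everywhere else.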

template<int x_bits, typename DType, typename DstPlan, typename SrcPlan>
__global__ void SoftmaxKernel(DstPlan dst, SrcPlan src, index_t xmax) {
const unsigned x_size = 1 << x_bits;
const int y = blockIdx.x;
__shared__ DType s_rec[x_size];
// step 1: get max
if (threadIdx.x < xmax) {
s_rec[threadIdx.x] = src.Eval(y, threadIdx.x);
}
for (unsigned x = x_size; x < xmax; x += x_size) {
if (x + threadIdx.x < xmax) {
@@ -212,7 +226,7 @@ __global__ void SoftmaxKernel(DstPlan dst, SrcPlan src, index_t xmax) {
__syncthreads();
s_rec[threadIdx.x] = 0.0f;
__syncthreads();

// calculate normalizer, with writeback
for (unsigned x = 0; x < xmax; x += x_size) {
if (x + threadIdx.x < xmax) {
@@ -227,7 +241,7 @@ __global__ void SoftmaxKernel(DstPlan dst, SrcPlan src, index_t xmax) {
Reduce1D<red::sum, x_bits>(s_rec);
__syncthreads();
DType ssum = s_rec[0];

for (unsigned x = 0; x < xmax; x += x_size) {
if (x + threadIdx.x < xmax) {
dst.REval(y, x + threadIdx.x) /= ssum;
@@ -248,6 +262,24 @@ inline void Softmax(Tensor<gpu, 2, DType> &dst,
expr::MakePlan(src),
dst.size(1));
}
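SoftmaxKernel above is the usual numerically stable scheme: per row, find the maximum, exponentiate the shifted values, then normalize by their sum, with a shared-memory reduction at each step. A minimal single-threaded sketch of the same three passes, for reference only (plain pointers instead of mshadow plans):

#include <algorithm>
#include <cmath>

// reference-only sketch of the three passes in SoftmaxKernel,
// applied to one row of length n
inline void SoftmaxRow(float *row, int n) {
  float mmax = row[0];  // pass 1: row maximum
  for (int i = 1; i < n; ++i) mmax = std::max(mmax, row[i]);
  float sum = 0.0f;     // pass 2: shifted exponentials
  for (int i = 0; i < n; ++i) {
    row[i] = std::exp(row[i] - mmax);
    sum += row[i];
  }
  for (int i = 0; i < n; ++i) {
    row[i] /= sum;      // pass 3: normalize
  }
}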

template<typename DType>
inline void SoftmaxGrad(Tensor<gpu, 2, DType> &dst,
const Tensor<gpu, 2, DType> &src,
const Tensor<gpu, 1, DType> &label) {
dim3 dimBlock(kBaseThreadNum);
dim3 dimGrid(dst.size(0));
utils::Check(dst.shape_ == src.shape_, "SoftmaxGrad: shape mismatch");
utils::Check(dst.size(0) == label.size(0), "SoftmaxGrad: label shape mismatch");
CheckLaunchParam(dimGrid, dimBlock, "SoftmaxGrad");
cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
SoftmaxGradKernel<kBaseThreadBits, DType>
<<<dimGrid, dimBlock, 0, stream>>>
(expr::MakePlan(dst),
expr::MakePlan(src),
expr::MakePlan(label),
dst.size(1));
}
} // namespace cuda
} // namespace mshadow
#endif // MSHADOW_CUDA_TENSOR_GPU_INL_CUH_
51 changes: 50 additions & 1 deletion mshadow/dot_engine-inl.h
@@ -75,7 +75,56 @@ struct BLASEngine<cpu> {
cblas_dger(CblasColMajor, m, n, alpha, X, incX, Y, incY, A, lda);
}
};
#elif MSHADOW_STAND_ALONE == 1
template<>
struct BLASEngine<cpu> {
inline static bool GetT(bool t) {
return t;
}
inline static void SetStream(Stream<cpu> *stream) {
}
inline static void gemm(Stream<cpu> *stream,
bool transa, bool transb,
int m, int n, int k, float alpha,
const float *A, int lda, const float *B, int ldb,
float beta, float *C, int ldc) {
utils::Error("Not implmented!");
}
inline static void gemm(Stream<cpu> *stream,
bool transa, bool transb,
int m, int n, int k, double alpha,
const double *A, int lda, const double *B, int ldb,
double beta, double *C, int ldc) {
utils::Error("Not implmented!");
}
inline static void gemv(Stream<cpu> *stream,
bool trans, int m, int n,
float alpha, const float *A, int lda,
const float *X, int incX,
float beta, float *Y, int incY) {
utils::Error("Not implmented!");
}
inline static void gemv(Stream<cpu> *stream,
bool trans, int m, int n, double alpha,
const double *A, int lda,
const double *X, int incX,
double beta, double *Y, int incY) {
utils::Error("Not implmented!");
}
inline static void ger(Stream<cpu> *stream,
int m, int n, float alpha,
const float *X, int incX,
const float *Y, int incY, float *A, int lda) {
utils::Error("Not implmented!");
}
inline static void ger(Stream<cpu> *stream,
int m, int n, double alpha,
const double *X, int incX,
const double *Y, int incY, double *A, int lda) {
utils::Error("Not implmented!");
}
};
#endif // MSHADOW_USE_CBLAS || MSHADOW_USE_MKL || MSHADOW_STAND_ALONE
// CuBLAS redirect code
#if MSHADOW_USE_CUDA
// All cuBLAS calls go through here; uses the legacy API, which is not thread-safe
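The stand-alone stubs above abort at runtime, which lets MSHADOW_STAND_ALONE builds compile without any BLAS while making dot() unusable. If a slow fallback were wanted instead, a naive column-major gemm along these lines could stand in for the float stub; this is a sketch, not part of the commit, with the column-major and lda/ldb/ldc conventions taken from the cblas_* calls above:

// naive fallback: C = alpha * op(A) * op(B) + beta * C, column-major;
// op(A) is m x k, op(B) is k x n, C is m x n
inline void NaiveGemm(bool transa, bool transb,
                      int m, int n, int k, float alpha,
                      const float *A, int lda, const float *B, int ldb,
                      float beta, float *C, int ldc) {
  for (int j = 0; j < n; ++j) {
    for (int i = 0; i < m; ++i) {
      float sum = 0.0f;
      for (int p = 0; p < k; ++p) {
        const float a = transa ? A[p + i * lda] : A[i + p * lda];
        const float b = transb ? B[j + p * ldb] : B[p + j * ldb];
        sum += a * b;
      }
      C[i + j * ldc] = alpha * sum + beta * C[i + j * ldc];
    }
  }
}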
21 changes: 21 additions & 0 deletions mshadow/tensor.h
@@ -610,6 +610,27 @@ inline void Softmax(Tensor<cpu, 2, DType> dst, const Tensor<cpu, 2, DType> &ener
*/
template<typename DType>
inline void Softmax(Tensor<gpu, 2, DType> dst, const Tensor<gpu, 2, DType> &energy);

/*!
 * \brief CPU/GPU: softmax gradient, dst = src - one_hot(label)
 * \param dst destination gradient, same shape as src
 * \param src softmax output
 * \param label ground-truth class index for each row of src
 */
template<typename DType>
inline void SoftmaxGrad(Tensor<cpu, 2, DType> dst,
const Tensor<cpu, 2, DType> &src,
const Tensor<cpu, 1, DType> &label);
/*!
 * \brief CPU/GPU: softmax gradient, dst = src - one_hot(label)
 * \param dst destination gradient, same shape as src
 * \param src softmax output
 * \param label ground-truth class index for each row of src
 */
template<typename DType>
inline void SoftmaxGrad(Tensor<gpu, 2, DType> dst,
const Tensor<gpu, 2, DType> &src,
const Tensor<gpu, 1, DType> &label);
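A minimal CPU usage sketch of the pair of declarations above, assuming mshadow's NewTensor/FreeSpace helpers (illustrative only, not part of this commit):

// illustrative only: softmax forward, then gradient w.r.t. the logits
Tensor<cpu, 2, float> energy = NewTensor<cpu>(Shape2(4, 10), 0.0f);
Tensor<cpu, 2, float> prob = NewTensor<cpu>(Shape2(4, 10), 0.0f);
Tensor<cpu, 2, float> grad = NewTensor<cpu>(Shape2(4, 10), 0.0f);
Tensor<cpu, 1, float> label = NewTensor<cpu>(Shape1(4), 0.0f);
label[0] = 3; label[1] = 7; label[2] = 0; label[3] = 9;  // class indices
Softmax(prob, energy);           // prob = row-wise softmax of energy
SoftmaxGrad(grad, prob, label);  // grad = prob - one_hot(label)
FreeSpace(&energy); FreeSpace(&prob);
FreeSpace(&grad); FreeSpace(&label);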
// function declarations that support the expression templates;
// they are not meant to be called directly
/*!
11 changes: 11 additions & 0 deletions mshadow/tensor_cpu-inl.h
@@ -253,6 +253,17 @@ inline void Softmax(Tensor<cpu, 1, DType> dst,
dst[x] /= sum;
}
}

template<typename DType>
inline void SoftmaxGrad(Tensor<cpu, 2, DType> dst,
                        const Tensor<cpu, 2, DType> &src,
                        const Tensor<cpu, 1, DType> &label) {
  for (index_t y = 0; y < dst.size(0); ++y) {
    const int k = static_cast<int>(label[y]);
    // gradient of cross-entropy w.r.t. softmax input: p - one_hot(k),
    // so copy the whole row, then subtract one at the labelled column
    for (index_t x = 0; x < dst.size(1); ++x) {
      dst[y][x] = src[y][x];
    }
    dst[y][k] -= 1.0f;
  }
}
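A cheap sanity check on this gradient, offered as a sketch only: when src holds proper softmax rows (each row sums to one), every row of dst must sum to zero, since sum_i (p_i - delta_ik) = 1 - 1 = 0.

// sketch: each gradient row should sum to (row sum of src) - 1,
// i.e. approximately zero when src rows are softmax distributions
template<typename DType>
inline bool SoftmaxGradRowsSumToZero(const Tensor<cpu, 2, DType> &dst, DType tol) {
  for (index_t y = 0; y < dst.size(0); ++y) {
    DType s = 0;
    for (index_t x = 0; x < dst.size(1); ++x) s += dst[y][x];
    if (s > tol || s < -tol) return false;
  }
  return true;
}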

template<typename DType>
inline void Softmax(Tensor<cpu, 2, DType> dst,
const Tensor<cpu, 2, DType> &energy) {
8 changes: 8 additions & 0 deletions mshadow/tensor_gpu-inl.h
@@ -167,6 +167,14 @@ inline void Softmax(Tensor<gpu, 2, DType> dst,
const Tensor<gpu, 2, DType>& src) {
cuda::Softmax(dst, src);
}

template<typename DType>
inline void SoftmaxGrad(Tensor<gpu, 2, DType> dst,
const Tensor<gpu, 2, DType> &src,
const Tensor<gpu, 1, DType> &label) {
cuda::SoftmaxGrad(dst, src, label);
}

} // namespace mshadow
#endif // __CUDACC__
#endif // MSHADOW_TENSOR_GPU_INL_H_
