Merge pull request apache#45 from antinucleon/master
SoftmaxGrad and Standalone BLAS
antinucleon committed Aug 22, 2015
2 parents 9819128 + 7d42e84 commit ccf4b93
Showing 6 changed files with 134 additions and 12 deletions.
1 change: 1 addition & 0 deletions make/mshadow.mk
@@ -47,6 +47,7 @@ endif
else
MSHADOW_CFLAGS += -DMSHADOW_USE_CBLAS=1 -DMSHADOW_USE_MKL=0
endif

ifeq ($(USE_BLAS), openblas)
MSHADOW_LDFLAGS += -lopenblas
else ifeq ($(USE_BLAS), atlas)
54 changes: 43 additions & 11 deletions mshadow/cuda/tensor_gpu-inl.cuh
@@ -32,7 +32,7 @@ const int kMaxGridNum = 65535;
/*! \brief suggested grid number for mapping kernel */
const int kBaseGridNum = 1024;
/*! \brief get the aligned stride for a given size in the x dimension */
inline index_t GetAlignStride(index_t xsize) {
if (xsize >= MSHADOW_MIN_PAD_RATIO * 32) {
return ((xsize + kMemUnit - 1) >> kMemUnitBits) << kMemUnitBits;
} else {
@@ -45,7 +45,7 @@ inline void CheckLaunchParam(dim3 dimGrid, dim3 dimBlock, const char *estr = "")
dimGrid.x > 65535 || dimGrid.y > 65535) {
fprintf(stderr, "%s[%u,%u,%u]:", estr, dimBlock.x, dimBlock.y, dimBlock.z);
utils::Error("too large launch parameter\n");
}
}
template<typename Saver, typename DstPlan,
typename Plan, int block_dim_bits>
@@ -68,7 +68,7 @@ __global__ void MapPlanKernel(DstPlan dst, index_t xstride,
template<typename Saver, int block_dim_bits, int grid_size,
typename DstPlan, typename Plan>
__global__ void MapPlanLargeKernel(DstPlan dst, index_t xstride,
Shape<2> dshape, const Plan exp, int repeat) {
for (int i = 0; i < repeat; ++i) {
MapPlanProc<Saver, DstPlan, Plan, block_dim_bits>
(dst, xstride, dshape, exp, blockIdx.x + i * grid_size);
@@ -83,7 +83,7 @@ inline void MapPlan(expr::Plan<DstExp, DType> dst,
const index_t xstride = GetAlignStride(dshape[1]);
const int num_block = (dshape[0] * xstride + kBaseThreadNum-1) / kBaseThreadNum;
dim3 dimBlock(kBaseThreadNum, 1, 1);

if (num_block < kMaxGridNum) {
dim3 dimGrid(num_block, 1, 1);
MapPlanKernel<Saver, kBaseThreadBits,
@@ -150,15 +150,15 @@ template<typename Saver, typename Reducer, int block_dim_bits,
__global__ void MapReduceKeepDim1Kernel(DstPlan dst, Plan plan, DType scale, Shape<4> pshape) {
const int block_size = 1 << block_dim_bits;
__shared__ DType s_rec[block_size];
const int c = blockIdx.x;
const index_t tot = pshape[3] * pshape[2] * pshape[0];

DType res; Reducer::SetInitValue(res);
for (index_t i_offset = 0; i_offset < tot; i_offset += block_size) {
index_t i = i_offset + threadIdx.x;
if (i < tot) {
const index_t x = i % pshape[3];
i /= pshape[3];
const index_t y = i % pshape[2];
const index_t n = i / pshape[2];
Reducer::Reduce(res, plan.Eval((n * pshape[1] + c) * pshape[2] + y, x));
@@ -186,14 +186,28 @@ inline void MapReduceKeepDim1(expr::Plan<DstExp, DType> dst,
<<<dimGrid, dimBlock, 0, stream>>>(dst, plan, scale, pshape);
}

template<int x_bits, typename DType, typename DstPlan, typename SrcPlan1, typename SrcPlan2>
__global__ void SoftmaxGradKernel(DstPlan dst, SrcPlan1 src, SrcPlan2 label, index_t xmax) {
  const unsigned x_size = 1 << x_bits;
  const int y = blockIdx.x;
  // label is a 1-D plan, so it is indexed as (0, y)
  const int k = static_cast<int>(label.Eval(0, y));
  // write the whole row, striding by the block size so rows wider
  // than one block are still covered
  for (unsigned x = threadIdx.x; x < xmax; x += x_size) {
    if (x == static_cast<unsigned>(k)) {
      dst.REval(y, x) = src.Eval(y, x) - 1.0f;
    } else {
      dst.REval(y, x) = src.Eval(y, x);
    }
  }
}
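For reference, the identity this kernel implements: with row-wise softmax output p = softmax(z) and ground-truth class k, the gradient of the cross-entropy loss L = -\log p_k with respect to the logits is

\frac{\partial L}{\partial z_i} = p_i - \delta_{ik},
\qquad p_i = \frac{e^{z_i}}{\sum_j e^{z_j}},

so every column of a row must be written: src minus one at the labelled column, and src unchanged everywhere else.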

template<int x_bits, typename DType, typename DstPlan, typename SrcPlan>
__global__ void SoftmaxKernel(DstPlan dst, SrcPlan src, index_t xmax) {
const unsigned x_size = 1 << x_bits;
const int y = blockIdx.x;
__shared__ DType s_rec[x_size];
// step 1: get max
if (threadIdx.x < xmax) {
s_rec[threadIdx.x] = src.Eval(y, threadIdx.x);
}
for (unsigned x = x_size; x < xmax; x += x_size) {
if (x + threadIdx.x < xmax) {
@@ -212,7 +226,7 @@ __global__ void SoftmaxKernel(DstPlan dst, SrcPlan src, index_t xmax) {
__syncthreads();
s_rec[threadIdx.x] = 0.0f;
__syncthreads();

// calculate normalizer, with writeback
for (unsigned x = 0; x < xmax; x += x_size) {
if (x + threadIdx.x < xmax) {
@@ -227,7 +241,7 @@ __global__ void SoftmaxKernel(DstPlan dst, SrcPlan src, index_t xmax) {
Reduce1D<red::sum, x_bits>(s_rec);
__syncthreads();
DType ssum = s_rec[0];

for (unsigned x = 0; x < xmax; x += x_size) {
if (x + threadIdx.x < xmax) {
dst.REval(y, x + threadIdx.x) /= ssum;
@@ -248,6 +262,24 @@ inline void Softmax(Tensor<gpu, 2, DType> &dst,
expr::MakePlan(src),
dst.size(1));
}
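SoftmaxKernel above is the usual numerically stable scheme: per row, find the maximum, exponentiate the shifted values, then normalize by their sum, with a shared-memory reduction at each step. A minimal single-threaded sketch of the same three passes, for reference only (plain pointers instead of mshadow plans):

#include <algorithm>
#include <cmath>

// reference-only sketch of the three passes in SoftmaxKernel,
// applied to one row of length n
inline void SoftmaxRow(float *row, int n) {
  float mmax = row[0];  // pass 1: row maximum
  for (int i = 1; i < n; ++i) mmax = std::max(mmax, row[i]);
  float sum = 0.0f;     // pass 2: shifted exponentials
  for (int i = 0; i < n; ++i) {
    row[i] = std::exp(row[i] - mmax);
    sum += row[i];
  }
  for (int i = 0; i < n; ++i) {
    row[i] /= sum;      // pass 3: normalize
  }
}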

template<typename DType>
inline void SoftmaxGrad(Tensor<gpu, 2, DType> &dst,
const Tensor<gpu, 2, DType> &src,
const Tensor<gpu, 1, DType> &label) {
dim3 dimBlock(kBaseThreadNum);
dim3 dimGrid(dst.size(0));
utils::Check(dst.shape_ == src.shape_, "SoftmaxGrad: shape mismatch");
utils::Check(dst.size(0) == label.size(0), "SoftmaxGrad: label shape mismatch");
CheckLaunchParam(dimGrid, dimBlock, "SoftmaxGrad");
cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
SoftmaxGradKernel<kBaseThreadBits, DType>
<<<dimGrid, dimBlock, 0, stream>>>
(expr::MakePlan(dst),
expr::MakePlan(src),
expr::MakePlan(label),
dst.size(1));
}
} // namespace cuda
} // namespace mshadow
#endif // MSHADOW_CUDA_TENSOR_GPU_INL_CUH_
51 changes: 50 additions & 1 deletion mshadow/dot_engine-inl.h
@@ -75,7 +75,56 @@ struct BLASEngine<cpu> {
cblas_dger(CblasColMajor, m, n, alpha, X, incX, Y, incY, A, lda);
}
};
#elif MSHADOW_STAND_ALONE == 1
template<>
struct BLASEngine<cpu> {
inline static bool GetT(bool t) {
return t;
}
inline static void SetStream(Stream<cpu> *stream) {
}
inline static void gemm(Stream<cpu> *stream,
bool transa, bool transb,
int m, int n, int k, float alpha,
const float *A, int lda, const float *B, int ldb,
float beta, float *C, int ldc) {
utils::Error("Not implmented!");
}
inline static void gemm(Stream<cpu> *stream,
bool transa, bool transb,
int m, int n, int k, double alpha,
const double *A, int lda, const double *B, int ldb,
double beta, double *C, int ldc) {
utils::Error("Not implmented!");
}
inline static void gemv(Stream<cpu> *stream,
bool trans, int m, int n,
float alpha, const float *A, int lda,
const float *X, int incX,
float beta, float *Y, int incY) {
utils::Error("Not implmented!");
}
inline static void gemv(Stream<cpu> *stream,
bool trans, int m, int n, double alpha,
const double *A, int lda,
const double *X, int incX,
double beta, double *Y, int incY) {
utils::Error("Not implmented!");
}
inline static void ger(Stream<cpu> *stream,
int m, int n, float alpha,
const float *X, int incX,
const float *Y, int incY, float *A, int lda) {
utils::Error("Not implmented!");
}
inline static void ger(Stream<cpu> *stream,
int m, int n, double alpha,
const double *X, int incX,
const double *Y, int incY, double *A, int lda) {
utils::Error("Not implmented!");
}
};
#endif // MSHADOW_USE_CBLAS || MSHADOW_USE_MKL || MSHADOW_STAND_ALONE
// CuBLAS redirect code
#if MSHADOW_USE_CUDA
// All cuBLAS calls go through here; uses the legacy API, which is not thread-safe
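The stand-alone stubs above abort at runtime, which lets MSHADOW_STAND_ALONE builds compile without any BLAS while making dot() unusable. If a slow fallback were wanted instead, a naive column-major gemm along these lines could stand in for the float stub; this is a sketch, not part of the commit, with the column-major and lda/ldb/ldc conventions taken from the cblas_* calls above:

// naive fallback: C = alpha * op(A) * op(B) + beta * C, column-major;
// op(A) is m x k, op(B) is k x n, C is m x n
inline void NaiveGemm(bool transa, bool transb,
                      int m, int n, int k, float alpha,
                      const float *A, int lda, const float *B, int ldb,
                      float beta, float *C, int ldc) {
  for (int j = 0; j < n; ++j) {
    for (int i = 0; i < m; ++i) {
      float sum = 0.0f;
      for (int p = 0; p < k; ++p) {
        const float a = transa ? A[p + i * lda] : A[i + p * lda];
        const float b = transb ? B[j + p * ldb] : B[p + j * ldb];
        sum += a * b;
      }
      C[i + j * ldc] = alpha * sum + beta * C[i + j * ldc];
    }
  }
}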
21 changes: 21 additions & 0 deletions mshadow/tensor.h
@@ -610,6 +610,27 @@ inline void Softmax(Tensor<cpu, 2, DType> dst, const Tensor<cpu, 2, DType> &ener
*/
template<typename DType>
inline void Softmax(Tensor<gpu, 2, DType> dst, const Tensor<gpu, 2, DType> &energy);

/*!
 * \brief CPU/GPU: softmax gradient, dst = src - one_hot(label)
 * \param dst destination gradient, same shape as src
 * \param src softmax output
 * \param label ground-truth class index for each row of src
 */
template<typename DType>
inline void SoftmaxGrad(Tensor<cpu, 2, DType> dst,
const Tensor<cpu, 2, DType> &src,
const Tensor<cpu, 1, DType> &label);
/*!
 * \brief CPU/GPU: softmax gradient, dst = src - one_hot(label)
 * \param dst destination gradient, same shape as src
 * \param src softmax output
 * \param label ground-truth class index for each row of src
 */
template<typename DType>
inline void SoftmaxGrad(Tensor<gpu, 2, DType> dst,
const Tensor<gpu, 2, DType> &src,
const Tensor<gpu, 1, DType> &label);
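A minimal CPU usage sketch of the pair of declarations above, assuming mshadow's NewTensor/FreeSpace helpers (illustrative only, not part of this commit):

// illustrative only: softmax forward, then gradient w.r.t. the logits
Tensor<cpu, 2, float> energy = NewTensor<cpu>(Shape2(4, 10), 0.0f);
Tensor<cpu, 2, float> prob = NewTensor<cpu>(Shape2(4, 10), 0.0f);
Tensor<cpu, 2, float> grad = NewTensor<cpu>(Shape2(4, 10), 0.0f);
Tensor<cpu, 1, float> label = NewTensor<cpu>(Shape1(4), 0.0f);
label[0] = 3; label[1] = 7; label[2] = 0; label[3] = 9;  // class indices
Softmax(prob, energy);           // prob = row-wise softmax of energy
SoftmaxGrad(grad, prob, label);  // grad = prob - one_hot(label)
FreeSpace(&energy); FreeSpace(&prob);
FreeSpace(&grad); FreeSpace(&label);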
// function declarations that support the expression templates;
// they are not meant to be called directly
/*!
11 changes: 11 additions & 0 deletions mshadow/tensor_cpu-inl.h
@@ -253,6 +253,17 @@ inline void Softmax(Tensor<cpu, 1, DType> dst,
dst[x] /= sum;
}
}

template<typename DType>
inline void SoftmaxGrad(Tensor<cpu, 2, DType> dst,
                        const Tensor<cpu, 2, DType> &src,
                        const Tensor<cpu, 1, DType> &label) {
  for (index_t y = 0; y < dst.size(0); ++y) {
    const int k = static_cast<int>(label[y]);
    // gradient of cross-entropy w.r.t. softmax input: p - one_hot(k),
    // so copy the whole row, then subtract one at the labelled column
    for (index_t x = 0; x < dst.size(1); ++x) {
      dst[y][x] = src[y][x];
    }
    dst[y][k] -= 1.0f;
  }
}
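A cheap sanity check on this gradient, offered as a sketch only: when src holds proper softmax rows (each row sums to one), every row of dst must sum to zero, since sum_i (p_i - delta_ik) = 1 - 1 = 0.

// sketch: each gradient row should sum to (row sum of src) - 1,
// i.e. approximately zero when src rows are softmax distributions
template<typename DType>
inline bool SoftmaxGradRowsSumToZero(const Tensor<cpu, 2, DType> &dst, DType tol) {
  for (index_t y = 0; y < dst.size(0); ++y) {
    DType s = 0;
    for (index_t x = 0; x < dst.size(1); ++x) s += dst[y][x];
    if (s > tol || s < -tol) return false;
  }
  return true;
}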

template<typename DType>
inline void Softmax(Tensor<cpu, 2, DType> dst,
const Tensor<cpu, 2, DType> &energy) {
8 changes: 8 additions & 0 deletions mshadow/tensor_gpu-inl.h
@@ -167,6 +167,14 @@ inline void Softmax(Tensor<gpu, 2, DType> dst,
const Tensor<gpu, 2, DType>& src) {
cuda::Softmax(dst, src);
}

template<typename DType>
inline void SoftmaxGrad(Tensor<gpu, 2, DType> dst,
const Tensor<gpu, 2, DType> &src,
const Tensor<gpu, 1, DType> &label) {
cuda::SoftmaxGrad(dst, src, label);
}

} // namespace mshadow
#endif // __CUDACC__
#endif // MSHADOW_TENSOR_GPU_INL_H_
