diff --git a/src/operator/tensor/dot-inl.cuh b/src/operator/tensor/dot-inl.cuh index 50b8164ca249..98528d427b34 100644 --- a/src/operator/tensor/dot-inl.cuh +++ b/src/operator/tensor/dot-inl.cuh @@ -40,7 +40,7 @@ namespace op { * \brief GPU scalar kernel of dot(csr, dns1) = dns2 * Parallelization by output matrix elements: 1 thread/element */ -template +template struct DotCsrDnsDnsScalarKernel { /*! * \brief This function represents performing an inner product between a row of lhs @@ -63,12 +63,12 @@ struct DotCsrDnsDnsScalarKernel { const nnvm::dim_t num_cols_r) { const nnvm::dim_t irow = tid / num_cols_r; // row id of the lhs const nnvm::dim_t icol = tid % num_cols_r; // col id of the rhs - DType sum = 0; + AType sum = 0; for (IType j = indptr_l[irow]; j < indptr_l[irow+1]; ++j) { const CType cur_col = col_idx_l[j]; // corresponding row id of the rhs sum += data_l[j] * data_r[cur_col*num_cols_r+icol]; } - KERNEL_ASSIGN(out[tid], req, sum); + KERNEL_ASSIGN(out[tid], req, static_cast(sum)); } }; @@ -76,7 +76,7 @@ struct DotCsrDnsDnsScalarKernel { * \brief GPU vector kernel of dot(csr, dns1) = dns2 * Parallelization by output matrix elements: 1 warp/element */ -template +template struct DotCsrDnsDnsVectorKernel { /*! * \brief see DotCsrDnsDnsScalarKernel Map for documentation. @@ -90,7 +90,7 @@ struct DotCsrDnsDnsVectorKernel { const DType* data_r, const nnvm::dim_t num_cols_r) { using nnvm::dim_t; - __shared__ volatile DType vals[mshadow::cuda::kBaseThreadNum]; + __shared__ volatile AType vals[mshadow::cuda::kBaseThreadNum]; const dim_t warp_id = tid / 32; // global warp id const dim_t lane = tid & (32-1); // local thread id within warp const dim_t irow = warp_id / num_cols_r; // lhs row that this warp computes @@ -101,9 +101,9 @@ struct DotCsrDnsDnsVectorKernel { const dim_t high = static_cast(indptr_l[irow+1]); // Compute running sum per thread - DType sum = 0; + AType sum = 0; for (dim_t j = low+lane; j < high; j+=32) { - sum += data_l[j] * data_r[col_idx_l[j]*num_cols_r + kcol]; + sum += static_cast(data_l[j]) * static_cast(data_r[col_idx_l[j]*num_cols_r + kcol]); } vals[threadIdx.x] = sum; __syncwarp(); @@ -115,7 +115,7 @@ struct DotCsrDnsDnsVectorKernel { if (lane < 1) {vals[threadIdx.x] += vals[threadIdx.x+ 1];} __syncwarp(); if (lane == 0) { - KERNEL_ASSIGN(out[irow*num_cols_r+kcol], req, vals[threadIdx.x]); + KERNEL_ASSIGN(out[irow*num_cols_r+kcol], req, static_cast(vals[threadIdx.x])); } } }; @@ -418,7 +418,7 @@ inline void DotCsrDnsDnsImpl(const OpContext& ctx, const TBlob& data_r = rhs; const TBlob data_out = *ret; - MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, { // data type + MXNET_REAL_ACC_TYPE_SWITCH(data_l.type_flag_, DType, AType, { // data type and accelerator type MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, { // indptr type MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type if (kWriteTo == req) { @@ -513,14 +513,14 @@ inline void DotCsrDnsDnsImpl(const OpContext& ctx, if (num_cols_r > 4) { num_threads = data_out.Size(); MXNET_ASSIGN_REQ_SWITCH(req, ReqType, { - Kernel, gpu>::Launch(s, num_threads, + Kernel, gpu>::Launch(s, num_threads, data_out.dptr(), csc_data_ptr, csc_indptr_ptr, csc_indices_ptr, data_r.dptr(), num_cols_r); }); } else { num_threads = threads_per_warp * num_rows_l * num_cols_r; MXNET_ASSIGN_REQ_SWITCH(req, ReqType, { - Kernel, gpu>::Launch(s, num_threads, + Kernel, gpu>::Launch(s, num_threads, data_out.dptr(), csc_data_ptr, csc_indptr_ptr, csc_indices_ptr, data_r.dptr(), num_cols_r); }); @@ -529,14 +529,14 @@ inline void DotCsrDnsDnsImpl(const OpContext& ctx, if (num_cols_r > 4) { num_threads = data_out.Size(); MXNET_ASSIGN_REQ_SWITCH(req, ReqType, { - Kernel, gpu>::Launch(s, num_threads, + Kernel, gpu>::Launch(s, num_threads, data_out.dptr(), data_l.dptr(), indptr_l.dptr(), col_idx_l.dptr(), data_r.dptr(), num_cols_r); }); } else { num_threads = threads_per_warp * num_rows_l * num_cols_r; MXNET_ASSIGN_REQ_SWITCH(req, ReqType, { - Kernel, gpu>::Launch(s, num_threads, + Kernel, gpu>::Launch(s, num_threads, data_out.dptr(), data_l.dptr(), indptr_l.dptr(), col_idx_l.dptr(), data_r.dptr(), num_cols_r); }); diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index a017b8ce59c2..9378480d9af6 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -24,6 +24,8 @@ import numpy as np import unittest from nose.tools import assert_raises +import scipy.sparse as sps +import mxnet.ndarray.sparse as mxsps from mxnet.test_utils import check_consistency, set_default_context, assert_almost_equal, assert_allclose from mxnet.base import MXNetError from mxnet import autograd @@ -2548,6 +2550,18 @@ def test_arange_like_dtype(): for v in out: assert v.dtype == t + +def test_fp16_spmm(): + inp = mxsps.csr_matrix(sps.coo_matrix(([2.0], ([150], [100000]))).tocsr()) + inp = inp.astype('float16', copy=False) + weight = mx.nd.random.randn(100001, 151) + weight = weight.astype('float16', copy=False) + out = mxsps.dot(inp, weight) + out_np = mx.nd.dot(inp, weight) + assert_almost_equal(out.asnumpy(), out_np, rtol=1e-3, atol=1e-5) + + if __name__ == '__main__': import nose nose.runmodule() +