@@ -112,32 +112,31 @@ struct SGDDnsRspKernel {
 
 template<typename xpu>
 inline void SGDUpdateDnsRspImpl(const SGDParam& param,
-                                const OpContext &ctx,
-                                const std::vector<NDArray> &inputs,
-                                const std::vector<OpReqType> &req,
-                                const std::vector<NDArray> &outputs) {
+                                const OpContext &ctx,
+                                const TBlob& weight,
+                                const NDArray& grad,
+                                const OpReqType& req,
+                                TBlob *out) {
   using namespace mshadow;
   using namespace mshadow::expr;
   using namespace mshadow_op;
+  using namespace mxnet_op;
   Stream<xpu>* s = ctx.get_stream<xpu>();
-  auto &weight = inputs[0];
-  auto &grad = inputs[1];
-  auto &out = outputs[0];
-  CHECK_EQ(weight.storage_type(), kDefaultStorage);
   CHECK_EQ(grad.storage_type(), kRowSparseStorage);
-  if (!grad.storage_initialized()) return;
+  // if gradients are zeros, no weights are updated
+  if (!grad.storage_initialized() || req == kNullOp) return;
+  CHECK_GT(weight.shape_.Size(), 0);
 
-  MSHADOW_REAL_TYPE_SWITCH(weight.dtype(), DType, {
+  MSHADOW_REAL_TYPE_SWITCH(weight.type_flag_, DType, {
     MSHADOW_INT_TYPE_SWITCH(grad.aux_type(rowsparse::kIdx), IType, {
-      MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
-        auto weight_data = weight.data().FlatTo2D<xpu, DType>(s);
-        auto grad_idx = grad.aux_data(rowsparse::kIdx).FlatTo1D<xpu, IType>(s);
-        auto grad_val = grad.data().FlatTo2D<xpu, DType>(s);
-        auto out_data = out.data().FlatTo2D<xpu, DType>(s);
+      MXNET_ASSIGN_REQ_SWITCH(req, req_type, {
+        auto weight_data = weight.dptr<DType>();
+        auto grad_idx = grad.aux_data(rowsparse::kIdx).dptr<IType>();
+        auto grad_val = grad.data().dptr<DType>();
         auto num_rows = grad.aux_shape(rowsparse::kIdx)[0];
-        auto width = weight.shape().ProdShape(1, weight.shape().ndim());
-        mxnet_op::Kernel<SGDDnsRspKernel<req_type>, xpu>::Launch(s, num_rows, width,
-          out_data.dptr_, weight_data.dptr_, grad_idx.dptr_, grad_val.dptr_,
+        auto width = weight.shape_.ProdShape(1, weight.ndim());
+        Kernel<SGDDnsRspKernel<req_type>, xpu>::Launch(s, num_rows, width,
+          out->dptr<DType>(), weight_data, grad_idx, grad_val,
          static_cast<DType>(param.clip_gradient),
          static_cast<DType>(param.lr), static_cast<DType>(param.wd),
          static_cast<DType>(param.rescale_grad));
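
Aside: a minimal standalone sketch (plain C++, outside MXNet) of the per-row update that SGDDnsRspKernel is launched to perform above. Only the rows listed in the gradient's kIdx aux array are touched; the clip/weight-decay formula below follows MXNet's usual dense SGD step and is an assumption for illustration, not a copy of the kernel body, and all names are hypothetical.

    // Illustrative sketch only (not the MXNet kernel): apply the SGD step to the
    // rows of a dense weight matrix named by a row-sparse gradient.
    #include <algorithm>
    #include <cstdint>
    #include <vector>

    void sgd_dns_rsp_sketch(std::vector<float>* weight, size_t width,
                            const std::vector<int64_t>& grad_idx,   // kIdx aux data
                            const std::vector<float>& grad_val,     // compact gradient rows
                            float clip_gradient, float lr, float wd,
                            float rescale_grad) {
      for (size_t i = 0; i < grad_idx.size(); ++i) {      // one iteration per stored row
        for (size_t j = 0; j < width; ++j) {
          const size_t w_off = grad_idx[i] * width + j;   // offset in the dense weight
          const size_t g_off = i * width + j;             // offset in the compact gradient
          float g = rescale_grad * grad_val[g_off];
          if (clip_gradient >= 0.0f) {
            g = std::max(-clip_gradient, std::min(g, clip_gradient));
          }
          // assumed update: w = (1 - lr*wd) * w - lr * g
          (*weight)[w_off] = (1.f - lr * wd) * (*weight)[w_off] - lr * g;
        }
      }
    }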
@@ -146,6 +145,29 @@ inline void SGDUpdateDnsRspImpl(const SGDParam& param,
   });
 }
 
+template<typename xpu>
+inline void SGDUpdateRspRspImpl(const SGDParam& param,
+                                const OpContext& ctx,
+                                const NDArray& weight,
+                                const NDArray& grad,
+                                const OpReqType& req,
+                                NDArray *out) {
+  if (weight.storage_shape()[0] == weight.shape()[0] &&
+      out->storage_shape()[0] == out->shape()[0]) {
+    // TODO(haibin) this is a temporary solution, due to the fact that imperative_invoke only
+    // feed in kWriteTo as req for all operators.
+    // For sgd we don't want to assign zeros to the output values when req == kWriteTo
+    auto out_req = req;
+    if (out_req == kWriteTo) out_req = kWriteInplace;
+    // reuse dns rsp implementation when storage_shape == shape
+    TBlob out_blob = out->data();
+    SGDUpdateDnsRspImpl<xpu>(param, ctx, weight.data(), grad, out_req, &out_blob);
+  } else {
+    LOG(FATAL) << "SGDUpdate for RowSparse weights is only implemented when "
+               << "weights.values.shape == weights.shape";
+  }
+}
+
 template<typename xpu>
 inline void SGDUpdateEx(const nnvm::NodeAttrs& attrs,
                         const OpContext &ctx,
@@ -159,7 +181,11 @@ inline void SGDUpdateEx(const nnvm::NodeAttrs& attrs,
   auto weight_stype = inputs[0].storage_type();
   auto grad_stype = inputs[1].storage_type();
   if (weight_stype == kDefaultStorage && grad_stype == kRowSparseStorage) {
-    SGDUpdateDnsRspImpl<xpu>(param, ctx, inputs, req, outputs);
+    TBlob out = outputs[0].data();
+    SGDUpdateDnsRspImpl<xpu>(param, ctx, inputs[0].data(), inputs[1], req[0], &out);
+  } else if (weight_stype == kRowSparseStorage && grad_stype == kRowSparseStorage) {
+    NDArray out = outputs[0];
+    SGDUpdateRspRspImpl<xpu>(param, ctx, inputs[0], inputs[1], req[0], &out);
   } else if (weight_stype == kDefaultStorage && grad_stype == kDefaultStorage) {
     FCompExFallback<xpu>(attrs, ctx, inputs, req, outputs, SGDUpdate<xpu>, "SGDUpdate");
   }
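
Aside: the new SGDUpdateRspRspImpl path only handles row-sparse weights whose values array physically stores every row (storage_shape()[0] == shape()[0]); in that case the values blob has exactly the layout of the dense weight matrix, which is why it can hand out->data() to the dense implementation with the request downgraded from kWriteTo to kWriteInplace. A toy illustration of that precondition follows; the container and names are hypothetical, not MXNet types.

    // Toy row-sparse container, used only to show why storage_shape[0] == shape[0]
    // lets the values blob be treated as a dense matrix.
    #include <cstdint>
    #include <vector>

    struct ToyRowSparse {
      std::vector<int64_t> idx;    // indices of the rows actually stored
      std::vector<float> values;   // stored rows, row-major, idx.size() x width
      int64_t num_rows = 0;        // logical shape[0]
      int64_t width = 0;           // logical shape[1]
    };

    // When every logical row is stored (idx = 0, 1, ..., num_rows-1), 'values'
    // is laid out exactly like the dense num_rows x width matrix, so a dense
    // update can write into it in place.
    bool all_rows_present(const ToyRowSparse& t) {
      return static_cast<int64_t>(t.idx.size()) == t.num_rows;
    }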
@@ -262,30 +288,31 @@ struct SGDMomDnsRspDnsKernel {
 
 template<typename xpu>
 inline void SGDMomUpdateDnsRspDnsImpl(const SGDMomParam& param,
-                                      const OpContext &ctx,
-                                      const std::vector<NDArray> &inputs,
-                                      const std::vector<OpReqType> &req,
-                                      const std::vector<NDArray> &outputs) {
+                                      const OpContext& ctx,
+                                      const TBlob& weight,
+                                      const NDArray& grad,
+                                      const TBlob& mom,
+                                      const OpReqType& req,
+                                      TBlob *out) {
   using namespace mxnet_op;
+  using namespace rowsparse;
   Stream<xpu>* s = ctx.get_stream<xpu>();
-  auto &weight = inputs[0];
-  auto &grad = inputs[1];
-  auto &mom = inputs[2];
-  auto &out = outputs[0];
-  if (!grad.storage_initialized()) return;
+  if (!grad.storage_initialized() || req == kNullOp) return;
+  CHECK_GT(weight.shape_.Size(), 0);
+  CHECK_GT(mom.shape_.Size(), 0);
 
-  MSHADOW_REAL_TYPE_SWITCH(weight.dtype(), DType, {
-    MSHADOW_INT_TYPE_SWITCH(grad.aux_type(rowsparse::kIdx), IType, {
-      MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
-        auto weight_data = weight.data().FlatTo2D<xpu, DType>(s);
-        auto grad_idx = grad.aux_data(rowsparse::kIdx).FlatTo1D<xpu, IType>(s);
-        auto grad_val = grad.data().FlatTo2D<xpu, DType>(s);
-        auto mom_data = mom.data().FlatTo2D<xpu, DType>(s);
-        auto out_data = out.data().FlatTo2D<xpu, DType>(s);
-        auto num_rows = grad.aux_shape(rowsparse::kIdx)[0];
-        auto width = weight.shape().ProdShape(1, weight.shape().ndim());
+  MSHADOW_REAL_TYPE_SWITCH(weight.type_flag_, DType, {
+    MSHADOW_INT_TYPE_SWITCH(grad.aux_type(kIdx), IType, {
+      MXNET_ASSIGN_REQ_SWITCH(req, req_type, {
+        auto weight_data = weight.dptr<DType>();
+        auto grad_idx = grad.aux_data(kIdx).dptr<IType>();
+        auto grad_val = grad.data().dptr<DType>();
+        auto mom_data = mom.dptr<DType>();
+        auto out_data = out->dptr<DType>();
+        auto num_rows = grad.aux_shape(kIdx)[0];
+        auto width = weight.shape_.ProdShape(1, weight.ndim());
         Kernel<SGDMomDnsRspDnsKernel<req_type>, xpu>::Launch(s, num_rows, width,
-          out_data.dptr_, mom_data.dptr_, weight_data.dptr_, grad_idx.dptr_, grad_val.dptr_,
+          out_data, mom_data, weight_data, grad_idx, grad_val,
          static_cast<DType>(param.clip_gradient), static_cast<DType>(param.momentum),
          static_cast<DType>(param.lr), static_cast<DType>(param.wd),
          static_cast<DType>(param.rescale_grad));
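
Aside: as before, a minimal standalone sketch of the per-row update that SGDMomDnsRspDnsKernel is launched to perform above, again touching only the rows named in the gradient's kIdx array. The momentum formula is assumed to follow MXNet's usual sgd_mom_update (v = momentum*v - lr*(g + wd*w); w += v) and is illustrative, not copied from the kernel; all names are hypothetical.

    // Illustrative sketch only (not the MXNet kernel): SGD with momentum applied
    // to the dense weight/momentum rows named by a row-sparse gradient.
    #include <algorithm>
    #include <cstdint>
    #include <vector>

    void sgd_mom_dns_rsp_sketch(std::vector<float>* weight, std::vector<float>* mom,
                                size_t width,
                                const std::vector<int64_t>& grad_idx,
                                const std::vector<float>& grad_val,
                                float clip_gradient, float momentum,
                                float lr, float wd, float rescale_grad) {
      for (size_t i = 0; i < grad_idx.size(); ++i) {
        for (size_t j = 0; j < width; ++j) {
          const size_t w_off = grad_idx[i] * width + j;   // offset in weight/momentum
          const size_t g_off = i * width + j;             // offset in compact gradient
          float g = rescale_grad * grad_val[g_off];
          if (clip_gradient >= 0.0f) {
            g = std::max(-clip_gradient, std::min(g, clip_gradient));
          }
          // assumed update: v = momentum*v - lr*(g + wd*w);  w = w + v
          (*mom)[w_off] = momentum * (*mom)[w_off]
                          - lr * (g + wd * (*weight)[w_off]);
          (*weight)[w_off] += (*mom)[w_off];
        }
      }
    }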
@@ -294,6 +321,50 @@ inline void SGDMomUpdateDnsRspDnsImpl(const SGDMomParam& param,
   });
 }
 
+template<typename xpu>
+inline void SGDMomUpdateRspRspRspImpl(const SGDMomParam& param,
+                                      const OpContext& ctx,
+                                      const NDArray& weight,
+                                      const NDArray& grad,
+                                      const NDArray& mom,
+                                      const OpReqType& req,
+                                      NDArray *out) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  using namespace mxnet_op;
+  using namespace rowsparse;
+  if (weight.storage_shape()[0] == weight.shape()[0] &&
+      out->storage_shape()[0] == out->shape()[0]) {
+    Stream<xpu>* s = ctx.get_stream<xpu>();
+    // fill mom with zero values in order to reuse the sgd mom dns impl
+    if (!mom.storage_initialized()) {
+      MSHADOW_REAL_TYPE_SWITCH(mom.dtype(), DType, {
+        MSHADOW_INT_TYPE_SWITCH(mom.aux_type(kIdx), IType, {
+          auto num_rows = mom.shape()[0];
+          mom.CheckAndAlloc({Shape1(num_rows)});
+          auto mom_idx = mom.aux_data(kIdx).FlatTo1D<xpu, IType>(s);
+          auto mom_val = mom.data();
+          // TODO(haibin) this is single-thread execution
+          Kernel<set_zero, xpu>::Launch(s, mom_val.Size(), mom_val.dptr<DType>());
+          ASSIGN_DISPATCH(mom_idx, kWriteTo, range<IType>(0, num_rows, 1, 1))
+        });
+      });
+    }
+    // TODO(haibin) this is a temporary solution, due to the fact that imperative_invoke only
+    // feed in kWriteTo as req for all operators.
+    // For sgd we don't want to assign zeros to the output values when req == kWriteTo
+    auto out_req = req;
+    if (out_req == kWriteTo) out_req = kWriteInplace;
+    TBlob out_blob = out->data();
+    // reuse dns rsp implementation when storage_shape == shape
+    SGDMomUpdateDnsRspDnsImpl<xpu>(param, ctx, weight.data(), grad,
+                                   mom.data(), out_req, &out_blob);
+  } else {
+    LOG(FATAL) << "SGDUpdate for RowSparse weights is only implemented when "
+               << "weights.values.shape == weights.shape";
+  }
+}
+
 template<typename xpu>
 inline void SGDMomUpdateEx(const nnvm::NodeAttrs& attrs,
                            const OpContext &ctx,
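
Aside: when the row-sparse momentum state has never been written, SGDMomUpdateRspRspRspImpl above materializes it as "all rows present, all values zero" (CheckAndAlloc, a set_zero kernel over the values, and a range over the indices) before handing it to the dense implementation. A toy standalone sketch of that initialization, with hypothetical names:

    // Illustrative only: lazily materialize an empty row-sparse momentum state
    // as "indices 0..num_rows-1, all values zero".
    #include <cstdint>
    #include <numeric>
    #include <vector>

    struct ToyRowSparseMom {
      std::vector<int64_t> idx;    // stored row indices
      std::vector<float> values;   // stored rows, row-major
    };

    void init_full_zero_mom(ToyRowSparseMom* mom, int64_t num_rows, int64_t width) {
      if (!mom->idx.empty()) return;                    // already initialized
      mom->idx.resize(num_rows);
      std::iota(mom->idx.begin(), mom->idx.end(), 0);   // row indices 0..num_rows-1
      mom->values.assign(num_rows * width, 0.0f);       // zero momentum values
    }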
@@ -305,10 +376,16 @@ inline void SGDMomUpdateEx(const nnvm::NodeAttrs& attrs,
   auto weight_stype = inputs[0].storage_type();
   auto grad_stype = inputs[1].storage_type();
   auto mom_stype = inputs[2].storage_type();
-
   if (weight_stype == kDefaultStorage && grad_stype == kRowSparseStorage &&
       mom_stype == kDefaultStorage) {
-    SGDMomUpdateDnsRspDnsImpl<xpu>(param, ctx, inputs, req, outputs);
+    TBlob out = outputs[0].data();
+    SGDMomUpdateDnsRspDnsImpl<xpu>(param, ctx, inputs[0].data(), inputs[1],
+                                   inputs[2].data(), req[0], &out);
+  } else if (weight_stype == kRowSparseStorage && grad_stype == kRowSparseStorage &&
+             mom_stype == kRowSparseStorage) {
+    NDArray out = outputs[0];
+    SGDMomUpdateRspRspRspImpl<xpu>(param, ctx, inputs[0], inputs[1],
+                                   inputs[2], req[0], &out);
   } else if (weight_stype == kDefaultStorage && grad_stype == kDefaultStorage &&
              mom_stype == kDefaultStorage) {
     FCompExFallback<xpu>(attrs, ctx, inputs, req, outputs,