diff --git a/docs/tutorials/sparse/row_sparse.md b/docs/tutorials/sparse/row_sparse.md
index 85d76f498d71..c4cab75df543 100644
--- a/docs/tutorials/sparse/row_sparse.md
+++ b/docs/tutorials/sparse/row_sparse.md
@@ -469,7 +469,7 @@ state = momentum * state + rescaled_grad
 weight = weight - state
 ```

-Meanwhile, the sparse update rule for SGD optimizer is:
+However, when the gradient is sparse, the SGD optimizer uses the following lazy update by default:

 ```
 for row in grad.indices:
@@ -478,6 +478,9 @@ for row in grad.indices:
     weight[row] = weight[row] - state[row]
 ```

+This means that the lazy update leads to different optimization results from the standard update if `weight_decay` or `momentum` is non-zero.
+To disable lazy updates, set `lazy_update` to `False` when creating the optimizer.
+
 ```python
 # Create weight
@@ -531,8 +534,8 @@ sgd.update(0, weight, grad, momentum)



-Note that both [mxnet.optimizer.SGD](https://mxnet.incubator.apache.org/api/python/optimization/optimization.html#mxnet.optimizer.SGD)
-and [mxnet.optimizer.Adam](https://mxnet.incubator.apache.org/api/python/optimization/optimization.html#mxnet.optimizer.Adam) support sparse updates in MXNet.
+Note that only [mxnet.optimizer.SGD](https://mxnet.incubator.apache.org/api/python/optimization/optimization.html#mxnet.optimizer.SGD), [mxnet.optimizer.Adam](https://mxnet.incubator.apache.org/api/python/optimization/optimization.html#mxnet.optimizer.Adam), and
+[mxnet.optimizer.AdaGrad](https://mxnet.incubator.apache.org/api/python/optimization/optimization.html#mxnet.optimizer.AdaGrad) support sparse updates in MXNet.

 ## Advanced Topics

@@ -541,7 +544,7 @@ and [mxnet.optimizer.Adam](https://mxnet.incubator.apache.org/api/python/optimiz
 By default, RowSparseNDArray operators are executed on CPU. In MXNet, GPU support for RowSparseNDArray is limited to a few sparse operators such as
 [sgd_update](https://mxnet.incubator.apache.org/api/python/ndarray/sparse.html#mxnet.ndarray.sparse.sgd_update), [dot](https://mxnet.incubator.apache.org/api/python/ndarray/sparse.html#mxnet.ndarray.sparse.dot) and
-[SparseEmbedding](https://mxnet.incubator.apache.org/api/python/ndarray/contrib.html#mxnet.ndarray.contrib.SparseEmbedding).
+[Embedding](https://mxnet.incubator.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.Embedding).

 To create a RowSparseNDArray on gpu, we need to explicitly specify the context:

diff --git a/docs/tutorials/sparse/train.md b/docs/tutorials/sparse/train.md
index ce7020553c2d..7472fcd14ca3 100644
--- a/docs/tutorials/sparse/train.md
+++ b/docs/tutorials/sparse/train.md
@@ -190,7 +190,7 @@ fallback_log = fallback_exec.outputs[1]



-### Inspecting Storage Types of the Symbol Graph (Work in Progress)
+### Inspecting Storage Types of the Symbol Graph

 When the environment variable `MXNET_INFER_STORAGE_TYPE_VERBOSE_LOGGING` is set to
 `1`, MXNet will log the storage type information of operators' inputs and outputs in
 the computation graph. For example, we can inspect the storage types of
@@ -312,8 +312,10 @@ assert metric.get()[1] < 1, "Achieved MSE (%f) is larger than expected (1.0)" %



-### Training the model with multiple machines
+### Training the model with multiple machines or multiple devices

-To train a sparse model with multiple machines, please refer to the example in [mxnet/example/sparse/](https://github.com/apache/incubator-mxnet/tree/master/example/sparse)
+To train a sparse model with multiple machines or multiple devices, you need to call `prepare` before calling `forward` or `save_checkpoint`.
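+For example, a minimal sketch of one training iteration (assuming a module `mod` bound with
+row_sparse weights, a `data_batch`, an `epoch` counter, and the `all_row_ids` helper defined in the
+example below, which returns all row ids of the sparse weight):
+
+```python
+# pull the up-to-date rows of the sparse weights from the kvstore before the forward pass
+mod.prepare(data_batch, sparse_row_id_fn=all_row_ids)
+mod.forward_backward(data_batch)
+mod.update()
+
+# pull all rows again before saving, so the checkpoint contains the full weights
+mod.prepare(None, sparse_row_id_fn=all_row_ids)
+mod.save_checkpoint("checkpoint", epoch)
+```
+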
+Please refer to the example in [mxnet/example/sparse/linear_classification](https://github.com/apache/incubator-mxnet/tree/master/example/sparse/linear_classification)
+for more details.
diff --git a/example/sparse/linear_classification/train.py b/example/sparse/linear_classification/train.py
index 4d60efbaf4f3..0a8acfd87bef 100644
--- a/example/sparse/linear_classification/train.py
+++ b/example/sparse/linear_classification/train.py
@@ -32,9 +32,9 @@
 parser.add_argument('--kvstore', type=str, default=None,
                     help='what kvstore to use',
                     choices=["dist_sync", "dist_async", "local"])
-parser.add_argument('--optimizer', type=str, default='ftrl',
+parser.add_argument('--optimizer', type=str, default='sgd',
                     help='what optimizer to use',
-                    choices=["ftrl", "sgd", "adam"])
+                    choices=["adagrad", "sgd", "adam"])

 AVAZU = {
     'train': 'avazu-app',
@@ -129,6 +129,9 @@ def all_row_ids(data_batch):
         # evaluate metric on validation dataset
         score = mod.score(eval_data, ['nll_loss'])
         logging.info('epoch %d, eval nll = %s ' % (epoch, score[0][1]))
+
+        # prepare the module weight with all row ids before making a checkpoint
+        mod.prepare(None, all_row_ids)
         mod.save_checkpoint("checkpoint", epoch)
         # reset the iterator for next pass of data
         train_data.reset()
diff --git a/python/mxnet/gluon/contrib/nn/basic_layers.py b/python/mxnet/gluon/contrib/nn/basic_layers.py
index 1edef1476ee3..843a02286594 100644
--- a/python/mxnet/gluon/contrib/nn/basic_layers.py
+++ b/python/mxnet/gluon/contrib/nn/basic_layers.py
@@ -118,6 +118,12 @@ class SparseEmbedding(Block):
     This SparseBlock is designed for distributed training with extremely large
     input dimension. Both weight and gradient w.r.t. weight are `RowSparseNDArray`.

+    Note: the gradient w.r.t. weight is sparse. Only a subset of optimizers support
+    sparse gradients, including SGD, AdaGrad and Adam. By default, lazy updates are
+    turned on, which may perform differently from standard updates. For more details,
+    please check the Optimization API at:
+    https://mxnet.incubator.apache.org/api/python/optimization/optimization.html
+
     Parameters
     ----------
     input_dim : int
diff --git a/python/mxnet/gluon/nn/basic_layers.py b/python/mxnet/gluon/nn/basic_layers.py
index abde51b433af..ad69d4e9dd90 100644
--- a/python/mxnet/gluon/nn/basic_layers.py
+++ b/python/mxnet/gluon/nn/basic_layers.py
@@ -370,6 +370,11 @@ class Embedding(HybridBlock):
     r"""Turns non-negative integers (indexes/tokens) into dense vectors of fixed size.
     eg. [4, 20] -> [[0.25, 0.1], [0.6, -0.2]]

+    Note: if `sparse_grad` is set to True, the gradient w.r.t. weight will be sparse.
+    Only a subset of optimizers support sparse gradients, including SGD, AdaGrad
+    and Adam. By default, lazy updates are turned on, which may perform differently
+    from standard updates. For more details, please check the Optimization API at:
+    https://mxnet.incubator.apache.org/api/python/optimization/optimization.html

     Parameters
     ----------
diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py
index e73a45f74b04..f758af5f982c 100644
--- a/python/mxnet/optimizer.py
+++ b/python/mxnet/optimizer.py
@@ -550,7 +550,7 @@ def update_multi_precision(self, index, weight, grad, state):
 class Signum(Optimizer):
     """The Signum optimizer that takes the sign of gradient or momentum.

-    The optimizer updates the weight by:
+    The optimizer updates the weight by::

        rescaled_grad = rescale_grad * clip(grad, clip_gradient) + wd * weight
        state = momentum * state + (1-momentum)*rescaled_grad
diff --git a/src/operator/tensor/dot.cc b/src/operator/tensor/dot.cc
index 556fd1fea56d..d45551d383b8 100644
--- a/src/operator/tensor/dot.cc
+++ b/src/operator/tensor/dot.cc
@@ -66,6 +66,14 @@ forward_stype option for output storage type. Implemented sparse operations incl
 If the combination of input storage types and forward_stype does not match any of the
 above patterns, ``dot`` will fallback and generate output with default storage.

+.. Note::
+
+    If the storage type of the lhs is "csr", the storage type of the gradient w.r.t. the rhs will be
+    "row_sparse". Only a subset of optimizers support sparse gradients, including SGD, AdaGrad
+    and Adam. By default, lazy updates are turned on, which may perform differently
+    from standard updates. For more details, please check the Optimization API at:
+    https://mxnet.incubator.apache.org/api/python/optimization/optimization.html
+
 )doc" ADD_FILELINE)
 .set_num_inputs(2)
 .set_num_outputs(1)
diff --git a/src/operator/tensor/indexing_op.cc b/src/operator/tensor/indexing_op.cc
index e5ba058fb25e..64c5d86cbd1c 100644
--- a/src/operator/tensor/indexing_op.cc
+++ b/src/operator/tensor/indexing_op.cc
@@ -226,8 +226,15 @@ Examples::

       [ 10.,  11.,  12.,  13.,  14.]]]

-The storage type of weight can be either row_sparse or default, while
-the storage type of weight's grad depends on the value of "sparse_grad".
+The storage type of weight can be either row_sparse or default.
+
+.. Note::
+
+    If "sparse_grad" is set to True, the storage type of the gradient w.r.t. weight will be
+    "row_sparse". Only a subset of optimizers support sparse gradients, including SGD, AdaGrad
+    and Adam. By default, lazy updates are turned on, which may perform differently
+    from standard updates. For more details, please check the Optimization API at:
+    https://mxnet.incubator.apache.org/api/python/optimization/optimization.html

 )code" ADD_FILELINE)
 .set_num_inputs(2)
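
As a quick illustration of the docstring notes above, here is a minimal Gluon sketch (the dimensions, hyperparameters, and token ids are arbitrary placeholders; it assumes a build where `Embedding` accepts `sparse_grad` and `SGD` accepts `lazy_update`, as described in this patch):

```python
import mxnet as mx
from mxnet import autograd, gluon

# An Embedding layer with sparse_grad=True produces a row_sparse gradient w.r.t. its weight.
embedding = gluon.nn.Embedding(input_dim=1000, output_dim=4, sparse_grad=True)
embedding.initialize()

# SGD is one of the optimizers that support sparse gradients; lazy_update defaults
# to True, so pass lazy_update=False to force standard (dense) updates.
trainer = gluon.Trainer(embedding.collect_params(), 'sgd',
                        {'learning_rate': 0.1, 'momentum': 0.9, 'lazy_update': False})

tokens = mx.nd.array([1, 5, 7])
with autograd.record():
    loss = embedding(tokens).sum()
loss.backward()

print(embedding.weight.grad().stype)  # expected: 'row_sparse'
trainer.step(batch_size=1)
```

Lazy updates only touch the rows that appear in the gradient, which is what makes training on very sparse data fast; passing `lazy_update=False` restores the standard update when `weight_decay` or `momentum` is non-zero and exact equivalence with dense updates is required.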