From b005306064369528d1295687326eb41ecc481251 Mon Sep 17 00:00:00 2001 From: Eric Junyuan Xie Date: Sat, 13 May 2017 06:14:42 -0700 Subject: [PATCH] Pre nn patch (#6201) * pre-nn patch * ifx * fix * fix --- cpp-package/example/inception_bn.cpp | 2 +- cpp-package/example/resnet.cpp | 6 +- include/mxnet/ndarray.h | 34 +- python/mxnet/_ctypes/ndarray.py | 5 + python/mxnet/callback.py | 6 +- python/mxnet/context.py | 13 +- python/mxnet/contrib/autograd.py | 20 +- python/mxnet/initializer.py | 97 ++--- python/mxnet/metric.py | 483 +++++++++++++++++------ python/mxnet/module/executor_group.py | 41 +- python/mxnet/ndarray.py | 28 +- python/mxnet/optimizer.py | 51 +-- python/mxnet/registry.py | 141 +++++++ python/mxnet/symbol.py | 29 +- src/c_api/c_api.cc | 26 +- src/c_api/c_api_ndarray.cc | 4 +- src/ndarray/autograd.cc | 52 ++- src/ndarray/autograd.h | 4 +- src/ndarray/ndarray.cc | 142 ++++--- src/operator/batch_norm.cc | 2 + src/operator/convolution-inl.h | 44 ++- src/operator/pooling.cu | 23 +- src/operator/tensor/elemwise_unary_op.cc | 4 + tests/python/unittest/test_autograd.py | 8 +- tests/python/unittest/test_metric.py | 22 ++ tests/python/unittest/test_ndarray.py | 27 +- 26 files changed, 896 insertions(+), 418 deletions(-) create mode 100644 python/mxnet/registry.py create mode 100644 tests/python/unittest/test_metric.py diff --git a/cpp-package/example/inception_bn.cpp b/cpp-package/example/inception_bn.cpp index b65611215b7a..a4ed75a0a855 100644 --- a/cpp-package/example/inception_bn.cpp +++ b/cpp-package/example/inception_bn.cpp @@ -23,7 +23,7 @@ Symbol ConvFactoryBN(Symbol data, int num_filter, Symbol conv = Convolution("conv_" + name + suffix, data, conv_w, conv_b, kernel, num_filter, stride, Shape(1, 1), pad); - Symbol bn = BatchNorm("bn_" + name + suffix, conv, BN_GAMMA, BN_BETA); + Symbol bn = BatchNorm("bn_" + name + suffix, conv, Symbol(), Symbol(), Symbol(), Symbol()); return Activation("relu_" + name + suffix, bn, "relu"); } diff --git a/cpp-package/example/resnet.cpp b/cpp-package/example/resnet.cpp index 5d3131223ef3..c09b2c2fa485 100644 --- a/cpp-package/example/resnet.cpp +++ b/cpp-package/example/resnet.cpp @@ -48,7 +48,8 @@ Symbol getConv(const std::string & name, Symbol data, kernel, num_filter, stride, Shape(1, 1), pad, 1, 512); - Symbol bn = BatchNorm(name + "_bn", conv, BN_GAMMA, BN_BETA, 2e-5, bn_momentum, false); + Symbol bn = BatchNorm(name + "_bn", conv, Symbol(), Symbol(), Symbol(), + Symbol(), 2e-5, bn_momentum, false); if (with_relu) { return Activation(name + "_relu", bn, "relu"); @@ -108,7 +109,8 @@ Symbol ResNetSymbol(int num_class, int num_level = 3, int num_block = 9, Symbol data = Symbol::Variable("data"); Symbol data_label = Symbol::Variable("data_label"); - Symbol zscore = BatchNorm("zscore", data, BN_GAMMA, BN_BETA, 0.001, bn_momentum); + Symbol zscore = BatchNorm("zscore", data, Symbol(), Symbol(), Symbol(), + Symbol(), 0.001, bn_momentum); Symbol conv = getConv("conv0", zscore, num_filter, Shape(3, 3), Shape(1, 1), Shape(1, 1), diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index 31318329c0e1..ea38909d07f1 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -283,33 +283,13 @@ class NDArray { * \param end end index in first dim * \return sliced NDArray */ - inline NDArray Slice(index_t begin, index_t end) const { - NDArray ret = *this; - CHECK(!is_none()) << "NDArray is not initialized"; - CHECK_GE(shape_[0], end) << "Slice end index out of range"; - size_t length = shape_.ProdShape(1, shape_.ndim()); - ret.offset_ += 
begin * length; - ret.shape_[0] = end - begin; - return ret; - } + NDArray Slice(index_t begin, index_t end) const; /*! * \brief Index a NDArray * \param idx the index * \return idx-th sub array NDArray */ - inline NDArray At(index_t idx) const { - NDArray ret = *this; - CHECK(!is_none()) << "NDArray is not initialized"; - CHECK_GT(shape_[0], idx) << "index out of range"; - size_t length = shape_.ProdShape(1, shape_.ndim()); - ret.offset_ += idx * length; - if (shape_.ndim() > 1) { - ret.shape_ = TShape(shape_.data()+1, shape_.data()+shape_.ndim()); - } else { - ret.shape_ = mshadow::Shape1(1); - } - return ret; - } + NDArray At(index_t idx) const; /*! * \brief Create a NDArray that shares memory with current one * The new array must have smaller memory size than the current array. @@ -337,13 +317,7 @@ class NDArray { * \param shape new shape * \return NDArray in new shape */ - inline NDArray Reshape(const TShape &shape) const { - CHECK_GE(shape_.Size(), shape.Size()) - << "NDArray.Reshape: target shape size is different from current shape"; - NDArray ret = *this; - ret.shape_ = shape; - return ret; - } + NDArray Reshape(const TShape &shape) const; /*! * \brief Allocate the space if it is delayed allocated. * This is an internal function used by system that normal user should not use @@ -566,7 +540,7 @@ void SamplePoisson(real_t lambda, NDArray *out); /*! * \brief Sample negative binomial distribution for each elements of out. * \param k failure limit - * \param p success probability + * \param p success probability * \param out output NDArray. */ void SampleNegBinomial(int32_t k, real_t p, NDArray *out); diff --git a/python/mxnet/_ctypes/ndarray.py b/python/mxnet/_ctypes/ndarray.py index c81945184780..786b134befa6 100644 --- a/python/mxnet/_ctypes/ndarray.py +++ b/python/mxnet/_ctypes/ndarray.py @@ -100,6 +100,7 @@ def _make_ndarray_function(handle, name): kwarg_names.append(name) #signature.append('is_train=False') signature.append('out=None') + signature.append('name=None') signature.append('**kwargs') signature = ndsignature + signature @@ -120,6 +121,10 @@ def %s(*%s, **kwargs):"""%(func_name, arr_name)) kwargs['%s'] = np.dtype(kwargs['%s']).name"""%( dtype_name, dtype_name, dtype_name)) code.append(""" + try: + kwargs.pop('name') + except: + pass out = kwargs.pop('out', None) keys = list(kwargs.keys()) vals = [str(i) for i in kwargs.values()]""") diff --git a/python/mxnet/callback.py b/python/mxnet/callback.py index 28300805b5be..b585ce82b525 100644 --- a/python/mxnet/callback.py +++ b/python/mxnet/callback.py @@ -145,9 +145,9 @@ def __call__(self, param): name_value = param.eval_metric.get_name_value() if self.auto_reset: param.eval_metric.reset() - for name, value in name_value: - logging.info('Epoch[%d] Batch [%d]\tSpeed: %.2f samples/sec\tTrain-%s=%f', - param.epoch, count, speed, name, value) + msg = 'Epoch[%d] Batch [%d]\tSpeed: %.2f samples/sec' + msg += '\t%s=%f'*len(name_value) + logging.info(msg, param.epoch, count, speed, *sum(name_value, ())) else: logging.info("Iter[%d] Batch [%d]\tSpeed: %.2f samples/sec", param.epoch, count, speed) diff --git a/python/mxnet/context.py b/python/mxnet/context.py index 3580edae3d11..9822a6d86708 100644 --- a/python/mxnet/context.py +++ b/python/mxnet/context.py @@ -73,16 +73,17 @@ def device_type(self): """ return Context.devtype2str[self.device_typeid] + def __hash__(self): + """Compute hash value of context for dictionary lookup""" + return hash((self.device_typeid, self.device_id)) + def __eq__(self, other): """Compares two contexts. 
Two contexts are equal if they have the same device type and device id. """ - if not isinstance(other, Context): - return False - if self.device_typeid == other.device_typeid and \ - self.device_id == other.device_id: - return True - return False + return isinstance(other, Context) and \ + self.device_typeid == other.device_typeid and \ + self.device_id == other.device_id def __str__(self): return '%s(%d)' % (self.device_type, self.device_id) diff --git a/python/mxnet/contrib/autograd.py b/python/mxnet/contrib/autograd.py index a6d7f32e2fb1..40ab289c8f4c 100644 --- a/python/mxnet/contrib/autograd.py +++ b/python/mxnet/contrib/autograd.py @@ -51,25 +51,29 @@ def __exit__(self, ptype, value, trace): set_is_training(self._prev) -def train(): - """Returns a training TrainingStateScope +def train_section(): + """Returns a training scope context to be used in 'with' statement + and captures training code. Example:: - with autograd.train(): + with autograd.train_section(): y = model(x) compute_gradient([y]) + metric.update(...) + optim.step(...) """ return TrainingStateScope(True) -def test(): - """Returns a testing TrainingStateScope. +def test_section(): + """Returns a testing scope context to be used in 'with' statement + and captures testing code. Example:: - with autograd.train(): + with autograd.train_section(): y = model(x) compute_gradient([y]) - with autograd.test(): + with autograd.test_section(): # testing, IO, gradient updates... """ return TrainingStateScope(False) @@ -146,7 +150,7 @@ def wrapped(*args): assert isinstance(x, NDArray), "type of autograd input should NDArray." grads = [zeros_like(x) for x in variables] mark_variables(variables, grads) - with train(): + with train_section(): outputs = func(*args) compute_gradient([outputs] if isinstance(outputs, NDArray) else outputs) return grads, outputs diff --git a/python/mxnet/initializer.py b/python/mxnet/initializer.py index 13076ff44fb7..5cc2ede3f3ed 100755 --- a/python/mxnet/initializer.py +++ b/python/mxnet/initializer.py @@ -9,6 +9,7 @@ from .base import string_types from .ndarray import NDArray, load from . import random +from . import registry # inherit str for backward compatibility class InitDesc(str): @@ -29,54 +30,11 @@ def __new__(cls, name, attrs=None, global_init=None): ret.global_init = global_init return ret -_INITIALIZER_REGISTRY = {} - -def register(klass): - """Registers a custom initializer. - - Custom initializers can be created by extending `mx.init.Initializer` and implementing the - required functions like `_init_weight` and `_init_bias`. The created initializer must be - registered using `mx.init.register` before it can be used. - - Parameters - ---------- - klass : class - A subclass of `mx.init.Initializer` that needs to be registered as a custom initializer. - - Example - ------- - >>> # Create and register a custom initializer that - ... # initializes weights to 0.1 and biases to 1. - ... - >>> @mx.init.register - ... class CustomInit(mx.init.Initializer): - ... def __init__(self): - ... super(CustomInit, self).__init__() - ... def _init_weight(self, _, arr): - ... arr[:] = 0.1 - ... def _init_bias(self, _, arr): - ... arr[:] = 1 - ... - >>> # Module is an instance of 'mxnet.module.Module' - ... 
- >>> module.init_params(CustomInit()) - """ - assert issubclass(klass, Initializer), "Can only register subclass of Initializer" - name = klass.__name__.lower() - if name in _INITIALIZER_REGISTRY: - warnings.warn( - "\033[91mNew initializer %s.%s is overriding existing initializer %s.%s\033[0m"%( - klass.__module__, klass.__name__, - _INITIALIZER_REGISTRY[name].__module__, - _INITIALIZER_REGISTRY[name].__name__), - UserWarning, stacklevel=2) - _INITIALIZER_REGISTRY[name] = klass - return klass class Initializer(object): """The base class of an initializer.""" def __init__(self, **kwargs): - self.kwargs = kwargs + self._kwargs = kwargs def dumps(self): """Saves the initializer to string @@ -97,7 +55,7 @@ def dumps(self): >>> init.dumps() '["xavier", {"rnd_type": "uniform", "magnitude": 2.34, "factor_type": "in"}]' """ - return json.dumps([self.__class__.__name__.lower(), self.kwargs]) + return json.dumps([self.__class__.__name__.lower(), self._kwargs]) def __call__(self, desc, arr): """Initialize an array @@ -120,8 +78,7 @@ def __call__(self, desc, arr): if init: # when calling Variable initializer - klass, kwargs = json.loads(init) - _INITIALIZER_REGISTRY[klass.lower()](**kwargs)._init_weight(desc, arr) + create(init)._init_weight(desc, arr) else: # register nnvm::FSetInputVariableAttrs in the backend for new patterns # don't add new cases here. @@ -223,6 +180,48 @@ def _init_default(self, name, _): 'Please use mx.sym.Variable(init=mx.init.*) to set initialization pattern' % name) +# pylint: disable=invalid-name +_register = registry.get_register_func(Initializer, 'initializer') +alias = registry.get_alias_func(Initializer, 'initializer') +create = registry.get_create_func(Initializer, 'initializer') +# pylint: enable=invalid-name + +def register(klass): + """Registers a custom initializer. + + Custom initializers can be created by extending `mx.init.Initializer` and implementing the + required functions like `_init_weight` and `_init_bias`. The created initializer must be + registered using `mx.init.register` before it can be called by name. + + Parameters + ---------- + klass : class + A subclass of `mx.init.Initializer` that needs to be registered as a custom initializer. + + Example + ------- + >>> # Create and register a custom initializer that + ... # initializes weights to 0.1 and biases to 1. + ... + >>> @mx.init.register + ... @alias('myinit') + ... class CustomInit(mx.init.Initializer): + ... def __init__(self): + ... super(CustomInit, self).__init__() + ... def _init_weight(self, _, arr): + ... arr[:] = 0.1 + ... def _init_bias(self, _, arr): + ... arr[:] = 1 + ... + >>> # Module is an instance of 'mxnet.module.Module' + ... + >>> module.init_params("custominit") + >>> # module.init_params("myinit") + >>> # module.init_params(CustomInit()) + """ + return _register(klass) + + class Load(object): """Initializes variables by loading data from file or dict. @@ -312,6 +311,7 @@ def __call__(self, name, arr): 'add a ".*" pattern at the and with default Initializer.') @register +@alias("zeros") class Zero(Initializer): """Initializes weights to zero. @@ -336,6 +336,7 @@ def _init_weight(self, _, arr): arr[:] = 0 @register +@alias("ones") class One(Initializer): """Initializes weights to one. @@ -561,9 +562,9 @@ class MSRAPrelu(Xavier): initial slope of any PReLU (or similar) nonlinearities. """ def __init__(self, factor_type="avg", slope=0.25): - self.kwargs = {'factor_type': factor_type, 'slope': slope} magnitude = 2. 
/ (1 + slope ** 2) super(MSRAPrelu, self).__init__("gaussian", factor_type, magnitude) + self._kwargs = {'factor_type': factor_type, 'slope': slope} @register class Bilinear(Initializer): diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py index 1bc7d9ae423b..17a0b20d106a 100644 --- a/python/mxnet/metric.py +++ b/python/mxnet/metric.py @@ -1,11 +1,17 @@ # coding: utf-8 -# pylint: disable=no-member +# pylint: disable=no-member, too-many-lines """Online evaluation metric module.""" from __future__ import absolute_import import math +from collections import OrderedDict + import numpy + +from .base import numeric_types, string_types from . import ndarray +from . import registry + def check_label_shapes(labels, preds, shape=0): if shape == 0: @@ -17,6 +23,7 @@ def check_label_shapes(labels, preds, shape=0): raise ValueError("Shape of labels {} does not match shape of " "predictions {}".format(label_shape, pred_shape)) + class EvalMetric(object): """Base class for all evaluation metrics. @@ -25,13 +32,64 @@ class EvalMetric(object): This is a base class that provides common metric interfaces. One should not use this class directly, but instead create new metric classes that extend it. - """ - def __init__(self, name, num=None): - self.name = name - self.num = num + Parameters + ---------- + name : str + Name of this metric instance for display. + output_names : list of str, or None + Name of predictions that should be used when updating with update_dict. + By default include all predictions. + label_names : list of str, or None + Name of labels that should be used when updating with update_dict. + By default include all labels. + """ + def __init__(self, name, output_names=None, + label_names=None, **kwargs): + self.name = str(name) + self.output_names = output_names + self.label_names = label_names + self._kwargs = kwargs self.reset() + def __str__(self): + return "EvalMetric: {}".format(dict(self.get_name_value())) + + def get_config(self): + """Save configurations of metric. Can be recreated + from configs with metric.create(**config) + """ + config = self._kwargs.copy() + config.update({ + 'metric': self.__class__.__name__, + 'name': self.name, + 'output_names': self.output_names, + 'label_names': self.label_names}) + return config + + def update_dict(self, label, pred): + """Update the internal evaluation with named label and pred + + Parameters + ---------- + labels : OrderedDict of str -> NDArray + name to array mapping for labels. + + preds : list of NDArray + name to array mapping of predicted outputs. + """ + if self.output_names is not None: + pred = [pred[name] for name in self.output_names] + else: + pred = pred.values() + + if self.label_names is not None: + label = [label[name] for name in self.label_names] + else: + label = label.values() + + self.update(label, pred) + def update(self, labels, preds): """Updates the internal evaluation result. @@ -47,12 +105,8 @@ def update(self, labels, preds): def reset(self): """Resets the internal evaluation result to initial state.""" - if self.num is None: - self.num_inst = 0 - self.sum_metric = 0.0 - else: - self.num_inst = [0] * self.num - self.sum_metric = [0.0] * self.num + self.num_inst = 0 + self.sum_metric = 0.0 def get(self): """Gets the current evaluation result. @@ -64,16 +118,10 @@ def get(self): values : list of float Value of the evaluations. 
""" - if self.num is None: - if self.num_inst == 0: - return (self.name, float('nan')) - else: - return (self.name, self.sum_metric / self.num_inst) + if self.num_inst == 0: + return (self.name, float('nan')) else: - names = ['%s_%d'%(self.name, i) for i in range(self.num)] - values = [x / y if y != 0 else float('nan') \ - for x, y in zip(self.sum_metric, self.num_inst)] - return (names, values) + return (self.name, self.sum_metric / self.num_inst) def get_name_value(self): """Returns zipped name and value pairs. @@ -88,15 +136,75 @@ def get_name_value(self): name = [name] if not isinstance(value, list): value = [value] - return zip(name, value) + return list(zip(name, value)) - def __str__(self): - return "EvalMetric: {}".format(dict(self.get_name_value())) +# pylint: disable=invalid-name +register = registry.get_register_func(EvalMetric, 'metric') +alias = registry.get_alias_func(EvalMetric, 'metric') +_create = registry.get_create_func(EvalMetric, 'metric') +# pylint: enable=invalid-name + + +def create(metric, *args, **kwargs): + """Creates evaluation metric from metric names or instances of EvalMetric + or a custom metric function. + + Parameters + ---------- + metric : str or callable + Specifies the metric to create. + This argument must be one of the below: + - Name of a metric. + - An instance of `EvalMetric`. + - A list, each element of which is a metric or a metric name. + - An evaluation function that computes custom metric for a given batch of + labels and predictions. + *args : list + Additional arguments to metric constructor. + Only used when metric is str. + **kwargs : dict + Additional arguments to metric constructor. + Only used when metric is str + Examples + -------- + >>> def custom_metric(label, pred): + ... return np.mean(np.abs(label - pred)) + ... + >>> metric1 = mx.metric.create('acc') + >>> metric2 = mx.metric.create(custom_metric) + >>> metric3 = mx.metric.create([metric1, metric2, 'rmse']) + """ + if callable(metric): + return CustomMetric(metric, *args, **kwargs) + elif isinstance(metric, list): + composite_metric = CompositeEvalMetric() + for child_metric in metric: + composite_metric.add(create(child_metric, *args, **kwargs)) + return composite_metric + + return _create(metric, *args, **kwargs) + + +@register +@alias('composite') class CompositeEvalMetric(EvalMetric): """Manages multiple evaluation metrics. + Parameters + ---------- + metrics : list of EvalMetric + List of child metrics. + name : str + Name of this metric instance for display. + output_names : list of str, or None + Name of predictions that should be used when updating with update_dict. + By default include all predictions. + label_names : list of str, or None + Name of labels that should be used when updating with update_dict. + By default include all labels. + Examples -------- >>> predicts = [mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] @@ -111,12 +219,13 @@ class CompositeEvalMetric(EvalMetric): (['accuracy', 'f1'], [0.6666666666666666, 0.8]) """ - def __init__(self, **kwargs): - super(CompositeEvalMetric, self).__init__('composite') - try: - self.metrics = kwargs['metrics'] - except KeyError: - self.metrics = [] + def __init__(self, metrics=None, name='composite', + output_names=None, label_names=None): + super(CompositeEvalMetric, self).__init__( + 'composite', output_names=output_names, label_names=label_names) + if metrics is None: + metrics = [] + self.metrics = [create(i) for i in metrics] def add(self, metric): """Adds a child metric. 
@@ -126,7 +235,7 @@ def add(self, metric): metric A metric instance. """ - self.metrics.append(metric) + self.metrics.append(create(metric)) def get_metric(self, index): """Returns a child metric. @@ -142,6 +251,17 @@ def get_metric(self, index): return ValueError("Metric index {} is out of range 0 and {}".format( index, len(self.metrics))) + def update_dict(self, labels, preds): + if self.label_names is not None: + labels = OrderedDict([i for i in labels.items() + if i[0] in self.label_names]) + if self.output_names is not None: + preds = OrderedDict([i for i in preds.items() + if i[0] in self.output_names]) + + for metric in self.metrics: + metric.update_dict(labels, preds) + def update(self, labels, preds): """Updates the internal evaluation result. @@ -175,20 +295,46 @@ def get(self): Value of the evaluations. """ names = [] - results = [] + values = [] for metric in self.metrics: - result = metric.get() - names.append(result[0]) - results.append(result[1]) - return (names, results) + name, value = metric.get() + if isinstance(name, string_types): + name = [name] + if isinstance(value, numeric_types): + value = [value] + names.extend(name) + values.extend(value) + return (names, values) + + def get_config(self): + config = super(CompositeEvalMetric, self).get_config() + config.update({'metrics': [i.get_config() for i in self.metrics]}) + return config + ######################## # CLASSIFICATION METRICS ######################## + +@register +@alias('acc') class Accuracy(EvalMetric): """Computes accuracy classification score. + Parameters + ---------- + axis : int, default=1 + The axis that represents classes + name : str + Name of this metric instance for display. + output_names : list of str, or None + Name of predictions that should be used when updating with update_dict. + By default include all predictions. + label_names : list of str, or None + Name of labels that should be used when updating with update_dict. + By default include all labels. + Examples -------- >>> predicts = [mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] @@ -198,9 +344,12 @@ class Accuracy(EvalMetric): >>> print acc.get() ('accuracy', 0.6666666666666666) """ - - def __init__(self): - super(Accuracy, self).__init__('accuracy') + def __init__(self, axis=1, name='accuracy', + output_names=None, label_names=None): + super(Accuracy, self).__init__( + name, axis=axis, + output_names=output_names, label_names=label_names) + self.axis = axis def update(self, labels, preds): """Updates the internal evaluation result. @@ -217,7 +366,7 @@ def update(self, labels, preds): for label, pred_label in zip(labels, preds): if pred_label.shape != label.shape: - pred_label = ndarray.argmax_channel(pred_label) + pred_label = ndarray.argmax(pred_label, axis=self.axis) pred_label = pred_label.asnumpy().astype('int32') label = label.asnumpy().astype('int32') @@ -226,6 +375,9 @@ def update(self, labels, preds): self.sum_metric += (pred_label.flat == label.flat).sum() self.num_inst += len(pred_label.flat) + +@register +@alias('top_k_accuracy', 'top_k_acc') class TopKAccuracy(EvalMetric): """Computes top k predictions accuracy. @@ -239,6 +391,14 @@ class TopKAccuracy(EvalMetric): ---------- top_k : int Whether targets are in top k predictions. + name : str + Name of this metric instance for display. + output_names : list of str, or None + Name of predictions that should be used when updating with update_dict. + By default include all predictions. 
+ label_names : list of str, or None + Name of labels that should be used when updating with update_dict. + By default include all labels. Examples -------- @@ -252,12 +412,12 @@ class TopKAccuracy(EvalMetric): ('top_k_accuracy', 0.3) """ - def __init__(self, **kwargs): - super(TopKAccuracy, self).__init__('top_k_accuracy') - try: - self.top_k = kwargs['top_k'] - except KeyError: - self.top_k = 1 + def __init__(self, top_k=1, name='top_k_accuracy', + output_names=None, label_names=None): + super(TopKAccuracy, self).__init__( + name, top_k=top_k, + output_names=output_names, label_names=label_names) + self.top_k = top_k assert(self.top_k > 1), 'Please use Accuracy if top_k is no more than 1' self.name += '_%d' % self.top_k @@ -290,6 +450,8 @@ def update(self, labels, preds): self.sum_metric += (pred_label[:, num_classes - 1 - j].flat == label.flat).sum() self.num_inst += num_samples + +@register class F1(EvalMetric): """Computes the F1 score of a binary classification problem. @@ -307,6 +469,17 @@ class F1(EvalMetric): This F1 score only supports binary classification. + Parameters + ---------- + name : str + Name of this metric instance for display. + output_names : list of str, or None + Name of predictions that should be used when updating with update_dict. + By default include all predictions. + label_names : list of str, or None + Name of labels that should be used when updating with update_dict. + By default include all labels. + Examples -------- >>> predicts = [mx.nd.array([[0.3, 0.7], [0., 1.], [0.4, 0.6]])] @@ -317,8 +490,10 @@ class F1(EvalMetric): ('f1', 0.8) """ - def __init__(self): - super(F1, self).__init__('f1') + def __init__(self, name='f1', + output_names=None, label_names=None): + super(F1, self).__init__( + name, output_names=output_names, label_names=label_names) def update(self, labels, preds): """Updates the internal evaluation result. @@ -371,6 +546,7 @@ def update(self, labels, preds): self.num_inst += 1 +@register class Perplexity(EvalMetric): """Computes perplexity. @@ -406,6 +582,14 @@ class Perplexity(EvalMetric): The axis from prediction that was used to compute softmax. By default use the last axis. + name : str + Name of this metric instance for display. + output_names : list of str, or None + Name of predictions that should be used when updating with update_dict. + By default include all predictions. + label_names : list of str, or None + Name of labels that should be used when updating with update_dict. + By default include all labels. Examples -------- @@ -416,8 +600,11 @@ class Perplexity(EvalMetric): >>> print perp.get() ('Perplexity', 1.7710976285155853) """ - def __init__(self, ignore_label, axis=-1): - super(Perplexity, self).__init__('Perplexity') + def __init__(self, ignore_label, axis=-1, name='perplexity', + output_names=None, label_names=None): + super(Perplexity, self).__init__( + name, ignore_label=ignore_label, + output_names=output_names, label_names=label_names) self.ignore_label = ignore_label self.axis = axis @@ -463,6 +650,8 @@ def get(self): # REGRESSION METRICS #################### + +@register class MAE(EvalMetric): """Computes Mean Absolute Error (MAE) loss. @@ -471,6 +660,17 @@ class MAE(EvalMetric): .. math:: \\frac{\\sum_i^n |y_i - \\hat{y}_i|}{n} + Parameters + ---------- + name : str + Name of this metric instance for display. + output_names : list of str, or None + Name of predictions that should be used when updating with update_dict. + By default include all predictions. 
+ label_names : list of str, or None + Name of labels that should be used when updating with update_dict. + By default include all labels. + Examples -------- >>> predicts = [mx.nd.array(np.array([3, -0.5, 2, 7]).reshape(4,1))] @@ -481,8 +681,10 @@ class MAE(EvalMetric): ('mae', 0.5) """ - def __init__(self): - super(MAE, self).__init__('mae') + def __init__(self, name='mae', + output_names=None, label_names=None): + super(MAE, self).__init__( + name, output_names=output_names, label_names=label_names) def update(self, labels, preds): """Updates the internal evaluation result. @@ -508,6 +710,7 @@ def update(self, labels, preds): self.num_inst += 1 # numpy.prod(label.shape) +@register class MSE(EvalMetric): """Computes Mean Squared Error (MSE) loss. @@ -516,6 +719,17 @@ class MSE(EvalMetric): .. math:: \\frac{\\sum_i^n (y_i - \\hat{y}_i)^2}{n} + Parameters + ---------- + name : str + Name of this metric instance for display. + output_names : list of str, or None + Name of predictions that should be used when updating with update_dict. + By default include all predictions. + label_names : list of str, or None + Name of labels that should be used when updating with update_dict. + By default include all labels. + Examples -------- >>> predicts = [mx.nd.array(np.array([3, -0.5, 2, 7]).reshape(4,1))] @@ -525,8 +739,10 @@ class MSE(EvalMetric): >>> print mean_squared_error.get() ('mse', 0.375) """ - def __init__(self): - super(MSE, self).__init__('mse') + def __init__(self, name='mse', + output_names=None, label_names=None): + super(MSE, self).__init__( + name, output_names=output_names, label_names=label_names) def update(self, labels, preds): """Updates the internal evaluation result. @@ -551,6 +767,8 @@ def update(self, labels, preds): self.sum_metric += ((label - pred)**2.0).mean() self.num_inst += 1 # numpy.prod(label.shape) + +@register class RMSE(EvalMetric): """Computes Root Mean Squred Error (RMSE) loss. @@ -559,6 +777,17 @@ class RMSE(EvalMetric): .. math:: \\sqrt{\\frac{\\sum_i^n (y_i - \\hat{y}_i)^2}{n}} + Parameters + ---------- + name : str + Name of this metric instance for display. + output_names : list of str, or None + Name of predictions that should be used when updating with update_dict. + By default include all predictions. + label_names : list of str, or None + Name of labels that should be used when updating with update_dict. + By default include all labels. + Examples -------- >>> predicts = [mx.nd.array(np.array([3, -0.5, 2, 7]).reshape(4,1))] @@ -568,8 +797,10 @@ class RMSE(EvalMetric): >>> print root_mean_squared_error.get() ('rmse', 0.612372457981) """ - def __init__(self): - super(RMSE, self).__init__('rmse') + def __init__(self, name='rmse', + output_names=None, label_names=None): + super(RMSE, self).__init__( + name, output_names=output_names, label_names=label_names) def update(self, labels, preds): """Updates the internal evaluation result. @@ -594,6 +825,9 @@ def update(self, labels, preds): self.sum_metric += numpy.sqrt(((label - pred)**2.0).mean()) self.num_inst += 1 + +@register +@alias('ce') class CrossEntropy(EvalMetric): """Computes Cross Entropy loss. @@ -607,6 +841,14 @@ class CrossEntropy(EvalMetric): eps : float Cross Entropy loss is undefined for predicted value is 0 or 1, so predicted values are added with the small constant. + name : str + Name of this metric instance for display. + output_names : list of str, or None + Name of predictions that should be used when updating with update_dict. + By default include all predictions. 
+ label_names : list of str, or None + Name of labels that should be used when updating with update_dict. + By default include all labels. Examples -------- @@ -617,8 +859,11 @@ class CrossEntropy(EvalMetric): >>> print ce.get() ('cross-entropy', 0.57159948348999023) """ - def __init__(self, eps=1e-8): - super(CrossEntropy, self).__init__('cross-entropy') + def __init__(self, eps=1e-8, name='cross-entropy', + output_names=None, label_names=None): + super(CrossEntropy, self).__init__( + name, eps=eps, + output_names=output_names, label_names=label_names) self.eps = eps def update(self, labels, preds): @@ -645,21 +890,52 @@ def update(self, labels, preds): self.sum_metric += (-numpy.log(prob + self.eps)).sum() self.num_inst += label.shape[0] -class Torch(EvalMetric): - """Dummy metric for torch criterions.""" - def __init__(self, name='torch'): - super(Torch, self).__init__(name) + +@register +class Loss(EvalMetric): + """Dummy metric for directly printing loss. + + Parameters + ---------- + name : str + Name of this metric instance for display. + output_names : list of str, or None + Name of predictions that should be used when updating with update_dict. + By default include all predictions. + label_names : list of str, or None + Name of labels that should be used when updating with update_dict. + By default include all labels. + """ + def __init__(self, name='loss', + output_names=None, label_names=None): + super(Loss, self).__init__( + name, output_names=output_names, label_names=label_names) def update(self, _, preds): for pred in preds: - self.sum_metric += pred.asnumpy().mean() - self.num_inst += 1 + self.sum_metric += ndarray.sum(pred).asscalar() + self.num_inst += pred.size + + +@register +class Torch(Loss): + """Dummy metric for torch criterions.""" + def __init__(self, name='torch', + output_names=None, label_names=None): + super(Torch, self).__init__( + name, output_names=output_names, label_names=label_names) + + +@register +class Caffe(Loss): + """Dummy metric for caffe criterions.""" + def __init__(self, name='caffe', + output_names=None, label_names=None): + super(Caffe, self).__init__( + name, output_names=output_names, label_names=label_names) -class Caffe(Torch): - """Dummy metric for caffe criterions""" - def __init__(self): - super(Caffe, self).__init__('caffe') +@register class CustomMetric(EvalMetric): """Computes a customized evaluation metric. @@ -676,6 +952,14 @@ class CustomMetric(EvalMetric): If true, the prediction outputs can have extra outputs. This is useful in RNN, where the states are also produced in outputs for forwarding. (the default is False). + name : str + Name of this metric instance for display. + output_names : list of str, or None + Name of predictions that should be used when updating with update_dict. + By default include all predictions. + label_names : list of str, or None + Name of labels that should be used when updating with update_dict. + By default include all labels. 
Examples -------- @@ -687,12 +971,16 @@ class CustomMetric(EvalMetric): >>> print eval_metrics.get() ('custom()', 6.0) """ - def __init__(self, feval, name=None, allow_extra_outputs=False): + def __init__(self, feval, name=None, allow_extra_outputs=False, + output_names=None, label_names=None): if name is None: name = feval.__name__ if name.find('<') != -1: name = 'custom(%s)' % name - super(CustomMetric, self).__init__(name) + super(CustomMetric, self).__init__( + name, feval=feval, + allow_extra_outputs=allow_extra_outputs, + output_names=output_names, label_names=label_names) self._feval = feval self._allow_extra_outputs = allow_extra_outputs @@ -723,6 +1011,10 @@ def update(self, labels, preds): self.sum_metric += reval self.num_inst += 1 + def get_config(self): + raise NotImplementedError("CustomMetric cannot be serialized") + + # pylint: disable=invalid-name def np(numpy_feval, name=None, allow_extra_outputs=False): """Creates a custom evaluation metric that receives its inputs as numpy arrays. @@ -757,56 +1049,3 @@ def feval(label, pred): feval.__name__ = numpy_feval.__name__ return CustomMetric(feval, name, allow_extra_outputs) # pylint: enable=invalid-name - -def create(metric, **kwargs): - """Creates evaluation metric from metric names or instances of EvalMetric - or a custom metric function. - - Parameters - ---------- - metric : str or callable - Specifies the metric to create. - This argument must be one of the below: - - - Name of a metric. - - An instance of `EvalMetric`. - - A list, each element of which is a metric or a metric name. - - An evaluation function that computes custom metric for a given batch of - labels and predictions. - - Examples - -------- - >>> def custom_metric(label, pred): - ... return np.mean(np.abs(label - pred)) - ... - >>> metric1 = mx.metric.create('acc') - >>> metric2 = mx.metric.create(custom_metric) - >>> metric3 = mx.metric.create([metric1, metric2, 'rmse']) - """ - - if callable(metric): - return CustomMetric(metric) - elif isinstance(metric, EvalMetric): - return metric - elif isinstance(metric, list): - composite_metric = CompositeEvalMetric() - for child_metric in metric: - composite_metric.add(create(child_metric, **kwargs)) - return composite_metric - - metrics = { - 'acc': Accuracy, - 'accuracy': Accuracy, - 'ce': CrossEntropy, - 'f1': F1, - 'mae': MAE, - 'mse': MSE, - 'rmse': RMSE, - 'top_k_accuracy': TopKAccuracy - } - - try: - return metrics[metric.lower()](**kwargs) - except: - raise ValueError("Metric must be either callable or in {}".format( - metrics.keys())) diff --git a/python/mxnet/module/executor_group.py b/python/mxnet/module/executor_group.py index e8fe360e030d..2903399a4b96 100644 --- a/python/mxnet/module/executor_group.py +++ b/python/mxnet/module/executor_group.py @@ -3,6 +3,8 @@ """Executor group is a convenient tool for managing a group of executors.""" import logging +from collections import OrderedDict + import numpy as np from .. 
import context as ctx @@ -197,10 +199,14 @@ def __init__(self, symbol, contexts, workload, data_shapes, label_shapes, param_ self.data_shapes = None self.label_shapes = None + self.data_names = None + self.label_names = None self.data_layouts = None self.label_layouts = None + self.output_names = self.symbol.list_outputs() self.output_layouts = [DataDesc.get_batch_axis(self.symbol[name].attr('__layout__')) - for name in self.symbol.list_outputs()] + for name in self.output_names] + self.num_outputs = len(self.symbol.list_outputs()) self.bind_exec(data_shapes, label_shapes, shared_group) @@ -302,6 +308,9 @@ def bind_exec(self, data_shapes, label_shapes, shared_group=None, reshape=False) self.data_shapes = data_shapes self.label_shapes = label_shapes + self.data_names = [i.name for i in self.data_shapes] + if label_shapes is not None: + self.label_names = [i.name for i in self.label_shapes] self._collect_arrays() def reshape(self, data_shapes, label_shapes): @@ -370,10 +379,8 @@ def forward(self, data_batch, is_train=None): if is_train is None: is_train = self.for_training - if self.label_arrays is not None: - assert not is_train or data_batch.label - if data_batch.label: - _load_label(data_batch, self.label_arrays, self.label_layouts) + if self.label_arrays is not None and data_batch.label: + _load_label(data_batch, self.label_arrays, self.label_layouts) for exec_ in self.execs: exec_.forward(is_train=is_train) @@ -391,8 +398,10 @@ def get_output_shapes(self): concat_shapes.append((key, tuple(the_shape))) return concat_shapes - def get_outputs(self, merge_multi_context=True): + def get_outputs(self, merge_multi_context=True, begin=0, end=None): """Get outputs of the previous forward computation. + If begin or end is specified, return [begin, end)-th outputs, + otherwise return all outputs. Parameters ---------- @@ -401,6 +410,10 @@ def get_outputs(self, merge_multi_context=True): will be collected from multiple devices. A `True` value indicate that we should merge the collected results so that they look like from a single executor. + begin : int + starting index of returned outputs in all outputs + end : int or None + ending index (excluded) of returned outputs. Returns ------- @@ -408,8 +421,10 @@ def get_outputs(self, merge_multi_context=True): is like ``[[out1_dev1, out1_dev2], [out2_dev1, out2_dev2]]``. All the output elements are `NDArray`. """ + if end is None: + end = self.num_outputs outputs = [[exec_.outputs[i] for exec_ in self.execs] - for i in range(len(self.execs[0].outputs))] + for i in range(begin, end)] if merge_multi_context: outputs = _merge_multi_context(outputs, self.output_layouts) return outputs @@ -508,7 +523,9 @@ def backward(self, out_grads=None): exec_.backward(out_grads=out_grads_slice) def update_metric(self, eval_metric, labels): - """Accumulate the performance according to `eval_metric` on all devices. + """Accumulate the performance according to `eval_metric` on all devices + by comparing outputs from [begin, end) to labels. By default use all + outputs. Parameters ---------- @@ -516,6 +533,10 @@ def update_metric(self, eval_metric, labels): The metric used for evaluation. labels : list of NDArray Typically comes from `label` of a `DataBatch`. + begin : int + Starting index of used outputs. + end : int or None + Ending index of used outputs. 
""" for texec, islice in zip(self.execs, self.slices): labels_slice = [] @@ -532,7 +553,9 @@ def update_metric(self, eval_metric, labels): else: labels_slice.append(label) - eval_metric.update(labels_slice, texec.outputs) + labels = OrderedDict(zip(self.label_names, labels_slice)) + preds = OrderedDict(zip(self.output_names, texec.outputs)) + eval_metric.update_dict(labels, preds) def _bind_ith_exec(self, i, data_shapes, label_shapes, shared_group): """Internal utility function to bind the i-th executor. diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray.py index 067126f8221f..53afb0639abf 100644 --- a/python/mxnet/ndarray.py +++ b/python/mxnet/ndarray.py @@ -506,9 +506,13 @@ def reshape(self, shape): shape : tuple of int The new shape should not change the array size, namely ``np.prod(new_shape)`` should be equal to ``np.prod(self.shape)``. - One shape dimension can be -1. In this case, the value is inferred + + One dimension can be -1. In this case, the value is inferred from the length of the array and remaining dimensions. + 0 Dimensions in shape will be copied from original shape, i.e. + if x.shape == (3, 4, 5), x.reshape((0, 20)).shape will be (3, 20). + Returns ------- @@ -538,22 +542,6 @@ def reshape(self, shape): """ handle = NDArrayHandle() - # Infer the correct size for dim == -1 - shape = list(shape) - for index, element in enumerate(shape): - if element == -1: - remainder = list(self.shape) - for i, e in enumerate(shape): # pylint: disable=invalid-name - if i != index and e == -1: - raise ValueError('Only one dimension can be inferred.') - try: - remainder.remove(e) - except ValueError: - pass - shape[index] = np.product(remainder) - # We have already gone through the whole shape, break - break - # Actual reshape check_call(_LIB.MXNDArrayReshape(self.handle, len(shape), @@ -959,7 +947,7 @@ def empty(shape, ctx=None, dtype=mx_real_t): ctx = Context.default_ctx return NDArray(handle=_new_alloc_handle(shape, ctx, False, dtype)) -def zeros(shape, ctx=None, dtype=mx_real_t): +def zeros(shape, ctx=None, dtype=mx_real_t, **kwargs): """Returns a new array filled with all zeros, with the given shape and type. Parameters @@ -985,13 +973,14 @@ def zeros(shape, ctx=None, dtype=mx_real_t): >>> mx.nd.zeros((1,2), mx.gpu(0), 'float16').asnumpy() array([[ 0., 0.]], dtype=float16) """ + # pylint: disable= unused-argument if ctx is None: ctx = Context.default_ctx # pylint: disable= no-member, protected-access return _internal._zeros(shape=shape, ctx=ctx, dtype=dtype) # pylint: enable= no-member, protected-access -def ones(shape, ctx=None, dtype=mx_real_t): +def ones(shape, ctx=None, dtype=mx_real_t, **kwargs): """Returns a new array filled with all ones, with the given shape and type. 
Parameters @@ -1018,6 +1007,7 @@ def ones(shape, ctx=None, dtype=mx_real_t): >>> mx.nd.ones((1,2), dtype='float16').asnumpy() array([[ 1., 1.]], dtype=float16) """ + # pylint: disable= unused-argument if ctx is None: ctx = Context.default_ctx # pylint: disable= no-member, protected-access diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py index df30fb071b5c..d2d394076e89 100644 --- a/python/mxnet/optimizer.py +++ b/python/mxnet/optimizer.py @@ -328,11 +328,6 @@ class SGD(Optimizer): def __init__(self, momentum=0.0, **kwargs): super(SGD, self).__init__(**kwargs) self.momentum = momentum - self.kwargs = {'rescale_grad': self.rescale_grad} - if self.momentum > 0: - self.kwargs['momentum'] = self.momentum - if self.clip_gradient: - self.kwargs['clip_gradient'] = self.clip_gradient def create_state(self, index, weight): if self.momentum == 0.0: @@ -347,12 +342,18 @@ def update(self, index, weight, grad, state): wd = self._get_wd(index) self._update_count(index) + kwargs = {'rescale_grad': self.rescale_grad} + if self.momentum > 0: + kwargs['momentum'] = self.momentum + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + if state is not None: sgd_mom_update(weight, grad, state, out=weight, - lr=lr, wd=wd, **self.kwargs) + lr=lr, wd=wd, **kwargs) else: sgd_update(weight, grad, out=weight, - lr=lr, wd=wd, **self.kwargs) + lr=lr, wd=wd, **kwargs) @register class DCASGD(Optimizer): @@ -506,10 +507,7 @@ def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, super(Adam, self).__init__(learning_rate=learning_rate, **kwargs) self.beta1 = beta1 self.beta2 = beta2 - self.kwargs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon, - 'rescale_grad': self.rescale_grad} - if self.clip_gradient: - self.kwargs['clip_gradient'] = self.clip_gradient + self.epsilon = epsilon def create_state(self, index, weight): return (zeros(weight.shape, weight.context, dtype=weight.dtype), # mean @@ -526,9 +524,15 @@ def update(self, index, weight, grad, state): coef1 = 1. - self.beta1**t coef2 = 1. 
- self.beta2**t lr *= math.sqrt(coef2)/coef1 + + kwargs = {'beta1': self.beta1, 'beta2': self.beta2, 'epsilon': self.epsilon, + 'rescale_grad': self.rescale_grad} + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + mean, var = state adam_update(weight, grad, mean, var, out=weight, - lr=lr, wd=wd, **self.kwargs) + lr=lr, wd=wd, **kwargs) @register class AdaGrad(Optimizer): @@ -606,15 +610,8 @@ def __init__(self, learning_rate=0.001, gamma1=0.9, gamma2=0.9, self.gamma1 = gamma1 self.gamma2 = gamma2 self.centered = centered + self.epsilon = epsilon self.clip_weights = clip_weights - self.kwargs = {'gamma1': gamma1, 'epsilon': epsilon, - 'rescale_grad': self.rescale_grad} - if self.centered: - self.kwargs['gamma2'] = gamma2 - if self.clip_gradient: - self.kwargs['clip_gradient'] = self.clip_gradient - if self.clip_weights: - self.kwargs['clip_weights'] = self.clip_weights def create_state(self, index, weight): if self.centered: @@ -631,14 +628,24 @@ def update(self, index, weight, grad, state): lr = self._get_lr(index) wd = self._get_wd(index) self._update_count(index) + + kwargs = {'gamma1': self.gamma1, 'epsilon': self.epsilon, + 'rescale_grad': self.rescale_grad} + if self.centered: + kwargs['gamma2'] = self.gamma2 + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + if self.clip_weights: + kwargs['clip_weights'] = self.clip_weights + if not self.centered: (n, ) = state rmsprop_update( - weight, grad, n, out=weight, lr=lr, wd=wd, **self.kwargs) + weight, grad, n, out=weight, lr=lr, wd=wd, **kwargs) else: n, g, delta = state rmspropalex_update(weight, grad, n, g, delta, out=weight, - lr=lr, wd=wd, **self.kwargs) + lr=lr, wd=wd, **kwargs) @register class AdaDelta(Optimizer): diff --git a/python/mxnet/registry.py b/python/mxnet/registry.py new file mode 100644 index 000000000000..fdd095e1ebb5 --- /dev/null +++ b/python/mxnet/registry.py @@ -0,0 +1,141 @@ +# coding: utf-8 +# pylint: disable=no-member + +"""Registry for serializable objects.""" +from __future__ import absolute_import + +import json +import warnings + +from .base import string_types + +_REGISTRY = {} + + +def get_register_func(base_class, nickname): + """Get registrator function. + + Parameters + ---------- + base_class : type + base class for classes that will be reigstered + nickname : str + nickname of base_class for logging + + Returns + ------- + a registrator function + """ + if base_class not in _REGISTRY: + _REGISTRY[base_class] = {} + registry = _REGISTRY[base_class] + + def register(klass, name=None): + """Register functions""" + assert issubclass(klass, base_class), \ + "Can only register subclass of %s"%base_class.__name__ + if name is None: + name = klass.__name__.lower() + if name in registry: + warnings.warn( + "\033[91mNew %s %s.%s registered with name %s is" + "overriding existing %s %s.%s\033[0m"%( + nickname, klass.__module__, klass.__name__, name, + nickname, registry[name].__module__, registry[name].__name__), + UserWarning, stacklevel=2) + registry[name] = klass + return klass + + register.__doc__ = "Register %s to the %s factory"%(nickname, nickname) + return register + + +def get_alias_func(base_class, nickname): + """Get registrator function that allow aliases. 
+ + Parameters + ---------- + base_class : type + base class for classes that will be reigstered + nickname : str + nickname of base_class for logging + + Returns + ------- + a registrator function + """ + register = get_register_func(base_class, nickname) + + def alias(*aliases): + """alias registrator""" + def reg(klass): + """registrator function""" + for name in aliases: + register(klass, name) + return klass + return reg + return alias + + +def get_create_func(base_class, nickname): + """Get creator function + + Parameters + ---------- + base_class : type + base class for classes that will be reigstered + nickname : str + nickname of base_class for logging + + Returns + ------- + a creator function + """ + if base_class not in _REGISTRY: + _REGISTRY[base_class] = {} + registry = _REGISTRY[base_class] + + def create(*args, **kwargs): + """Create instance from config""" + if len(args): + name = args[0] + args = args[1:] + else: + name = kwargs.pop(nickname) + + if isinstance(name, base_class): + assert len(args) == 0 and len(kwargs) == 0, \ + "%s is already an instance. Additional arguments are invalid"%(nickname) + return name + + if isinstance(name, dict): + return create(**name) + + assert isinstance(name, string_types), "%s must be of string type"%nickname + + if name.startswith('['): + assert not args and not kwargs + name, kwargs = json.loads(name) + return create(name, **kwargs) + elif name.startswith('{'): + assert not args and not kwargs + kwargs = json.loads(name) + return create(**kwargs) + + name = name.lower() + assert name in registry, \ + "%s is not registered. Please register with %s.register first"%( + str(name), nickname) + return registry[name](*args, **kwargs) + + create.__doc__ = """Create a %s instance from config. + +Parameters +---------- +%s : str or %s instance + class name of desired instance. If is a instance, + it will be returned directly. +**kwargs : dict + arguments to be passed to constructor"""%(nickname, nickname, base_class.__name__) + + return create diff --git a/python/mxnet/symbol.py b/python/mxnet/symbol.py index b2fdf595fed9..13cfd9da183e 100644 --- a/python/mxnet/symbol.py +++ b/python/mxnet/symbol.py @@ -47,8 +47,11 @@ class Symbol(SymbolBase): def __repr__(self): """Get a string representation of the symbol.""" name = self.name - return '<%s %s>' % (self.__class__.__name__, - 'Grouped' if name is None else name) + if name is None: + name = ', '.join([i.name for i in self]) + return '<%s group [%s]>' % (self.__class__.__name__, name) + else: + return '<%s %s>' % (self.__class__.__name__, name) def __iter__(self): """Returns a generator object of symbol. @@ -1432,6 +1435,24 @@ def eval(self, ctx=cpu(), **kwargs): """ return self.bind(ctx, kwargs).forward() + def reshape(self, shape): + """Shorthand for mxnet.sym.reshape. + + Parameters + ---------- + shape : tuple of int + The new shape should not change the array size, namely + ``np.prod(new_shape)`` should be equal to ``np.prod(self.shape)``. + One shape dimension can be -1. In this case, the value is inferred + from the length of the array and remaining dimensions. + + + Returns + ------- + Symbol + A reshaped symbol. 
+ """ + return reshape(self, shape=shape) def var(name, attr=None, shape=None, lr_mult=None, wd_mult=None, dtype=None, init=None, **kwargs): @@ -1485,7 +1506,9 @@ def var(name, attr=None, shape=None, lr_mult=None, wd_mult=None, dtype=None, ini if dtype is not None: attr['__dtype__'] = str(_DTYPE_NP_TO_MX[_numpy.dtype(dtype).type]) if init is not None: - attr['__init__'] = init.dumps() + if not isinstance(init, string_types): + init = init.dumps() + attr['__init__'] = init for k, v in kwargs.items(): if k.startswith('__') and k.endswith('__'): attr[k] = str(v) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 154eb1bc7969..7e5194620b39 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -296,8 +296,32 @@ MXNET_DLL int MXNDArrayReshape(NDArrayHandle handle, NDArrayHandle *out) { NDArray *ptr = new NDArray(); API_BEGIN(); + NDArray *arr = static_cast(handle); TShape new_shape(dims, dims+ndim); - *ptr = static_cast(handle)->Reshape(new_shape); + int size = 1; + int pos = -1; + for (int i = 0; i < ndim; ++i) { + int dim = dims[i]; + if (dim == -1) { + CHECK_EQ(pos, -1) + << "Invalid new shape " << new_shape + << ": more than one dimensions are -1"; + pos = i; + } else { + if (dim == 0) { + CHECK_LT(i, arr->shape().ndim()) + << "Invalid new shape " << new_shape + << ": 0 dimension exceeds original shape " << arr->shape(); + dim = arr->shape()[i]; + } + size *= dim; + new_shape[i] = dim; + } + } + if (pos >= 0) { + new_shape[pos] = arr->shape().Size() / size; + } + *ptr = arr->Reshape(new_shape); *out = ptr; API_END_HANDLE_ERROR(delete ptr); } diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc index a51c7a84805c..c633e8609cd4 100644 --- a/src/c_api/c_api_ndarray.cc +++ b/src/c_api/c_api_ndarray.cc @@ -86,6 +86,8 @@ void SetNDInputsOutputs(const nnvm::Op* op, *num_outputs = num_visible_outputs; ndoutputs.resize(infered_num_outputs); } else { + CHECK(!AutogradRuntime::Get()->IsTraining()) + << "Cannot assign to NDArray or specify 'out' when training with autograd"; CHECK(*num_outputs == infered_num_outputs || *num_outputs == num_visible_outputs) << "Expecting " << infered_num_outputs << " (all) or " << num_visible_outputs << " (visible only) outputs, got " @@ -372,7 +374,7 @@ int MXImperativeInvoke(AtomicSymbolCreator creator, if (fn) { if (AutogradRuntime::Get()->IsTraining()) { - AutogradRuntime::Get()->RecordImperativeFCompute(fn, op, + AutogradRuntime::Get()->RecordImperativeFCompute(op, attrs, &ndinputs, &ndoutputs); } PushFCompute(fn, op, attrs, ctx, read_vars, write_vars, diff --git a/src/ndarray/autograd.cc b/src/ndarray/autograd.cc index 69514297584d..ff7049a10c6e 100644 --- a/src/ndarray/autograd.cc +++ b/src/ndarray/autograd.cc @@ -67,8 +67,7 @@ void AutogradRuntime::MarkVariables( } } -void AutogradRuntime::RecordImperativeFCompute(FCompute fn, - const nnvm::Op* op, +void AutogradRuntime::RecordImperativeFCompute(const nnvm::Op* op, const nnvm::NodeAttrs& attrs, std::vector *p_inputs, std::vector *p_outputs) { @@ -109,9 +108,16 @@ AGNodePtr AutogradRuntime::RecordOp(const nnvm::Op* op, ag_node->opr = opr; for (uint32_t i = 0; i < outputs.size(); ++i) { - outputs[i].entry_.clear(); - ag_node->outputs.push_back(outputs[i]); - outputs[i].entry_ = AGNodeEntry{ag_node, i, 0}; + if (outputs[i].entry_.ag_node == nullptr || + !outputs[i].entry_.ag_node->out_grads.size()) { + outputs[i].entry_.clear(); + ag_node->outputs.push_back(outputs[i]); + outputs[i].entry_ = AGNodeEntry{ag_node, i, 0}; + } else { + NDArray copy = outputs[i]; + copy.entry_.clear(); + 
ag_node->outputs.push_back(copy); + } } for (size_t i = 0; i < inputs.size(); ++i) { @@ -130,6 +136,7 @@ AGNodePtr AutogradRuntime::RecordOp(const nnvm::Op* op, } void AutogradRuntime::ComputeGradient(const std::vector& outputs) { + static auto& fmutate_inputs = nnvm::Op::GetAttr("FMutateInputs"); std::vector heads; Symbol sym; NodeEntryMap feed_dict; @@ -139,29 +146,44 @@ void AutogradRuntime::ComputeGradient(const std::vector& outputs) { << "computation history. Did you forget to set is_training?"; heads.emplace_back(i.entry_); sym.outputs.emplace_back(i.entry_.nn_entry()); - feed_dict.insert({i.entry_.nn_entry(), i}); } + std::unordered_set mutable_set; + std::vector vlist; std::vector args, args_grad; + std::vector aux_states; std::vector grad_reqs; std::unordered_map> saved_opr; AGDFSVisit(heads, [&](const AGNodePtr& n) { - if (n->opr != nullptr) { - saved_opr.insert({n->nn_node.get(), n->opr}); - } else if (n->nn_node->is_variable()) { - args.push_back(n->outputs[0]); - args_grad.push_back(n->out_grads[0]); - grad_reqs.push_back(n->grad_req); + if (n->nn_node->is_variable()) { + vlist.push_back(n); + } else { + if (n->opr != nullptr) { + saved_opr.insert({n->nn_node.get(), n->opr}); + } + if (fmutate_inputs.count(n->nn_node->op())) { + for (uint32_t i : fmutate_inputs[n->nn_node->op()](n->nn_node->attrs)) { + mutable_set.insert(n->inputs[i].ag_node.get()); + } + } } - for (const auto& i : n->inputs) { - feed_dict.insert({i.nn_entry(), i.ag_node->outputs[i.index]}); + for (uint32_t i = 0; i < n->outputs.size(); ++i) { + feed_dict.insert({NodeEntry{n->nn_node, i, 0}, n->outputs[i]}); } }); + for (const auto& n : vlist) { + if (mutable_set.count(n.get())) { + aux_states.push_back(n->outputs[0]); + } else { + args.push_back(n->outputs[0]); + args_grad.push_back(n->out_grads[0]); + grad_reqs.push_back(n->grad_req); + } + } if (args.size()) { std::map ctx_map; - std::vector aux_states; auto exec = new exec::GraphExecutor(); // (TODO) too hack here exec->saved_opr_ = saved_opr; diff --git a/src/ndarray/autograd.h b/src/ndarray/autograd.h index c4ad0c99bc1c..6a18851de9e3 100644 --- a/src/ndarray/autograd.h +++ b/src/ndarray/autograd.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -65,8 +66,7 @@ class AutogradRuntime { const std::vector& grad_reqs, const std::vector& gradients); /*! \brief record imperative operator which is executed by fcompute. 
*/ - void RecordImperativeFCompute(FCompute fn, - const nnvm::Op* op, + void RecordImperativeFCompute(const nnvm::Op* op, const nnvm::NodeAttrs& attrs, std::vector* p_inputs, std::vector* p_outputs); diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 3b1eed9940a4..c19a82b164c4 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -11,6 +11,7 @@ #include #include #include "./ndarray_function.h" +#include "./autograd.h" #if MXNET_USE_OPENCV #include @@ -22,6 +23,73 @@ DMLC_REGISTRY_ENABLE(::mxnet::NDArrayFunctionReg); namespace mxnet { +NDArray NDArray::Reshape(const TShape &shape) const { + using namespace autograd; + CHECK_GE(shape_.Size(), shape.Size()) + << "NDArray.Reshape: target shape size is different from current shape"; + NDArray ret = *this; + ret.shape_ = shape; + if (AutogradRuntime::Get()->IsTraining()) { + // fake a Reshape op + ret.entry_.clear(); + const nnvm::Op* op = nnvm::Op::Get("Reshape"); + nnvm::NodeAttrs attrs; + attrs.op = op; + std::ostringstream os; + os << shape; + attrs.dict.insert({"shape", os.str()}); + op->attr_parser(&attrs); + std::vector inputs, outputs; + inputs.emplace_back(*this); + outputs.emplace_back(std::move(ret)); + AutogradRuntime::Get()->RecordImperativeFCompute( + op, attrs, &inputs, &outputs); + return outputs[0]; + } else { + return ret; + } +} + + +NDArray NDArray::Slice(index_t begin, index_t end) const { + using namespace autograd; + NDArray ret = *this; + CHECK(!is_none()) << "NDArray is not initialized"; + CHECK_GE(shape_[0], end) << "Slice end index out of range"; + size_t length = shape_.ProdShape(1, shape_.ndim()); + ret.offset_ += begin * length; + ret.shape_[0] = end - begin; + if (AutogradRuntime::Get()->IsTraining()) { + // fake a slice_axis op + ret.entry_.clear(); + const nnvm::Op* op = nnvm::Op::Get("slice_axis"); + nnvm::NodeAttrs attrs; + attrs.op = op; + attrs.dict.insert({"axis", "0"}); + attrs.dict.insert({"begin", std::to_string(begin)}); + attrs.dict.insert({"end", std::to_string(end)}); + op->attr_parser(&attrs); + std::vector inputs, outputs; + inputs.emplace_back(*this); + outputs.emplace_back(std::move(ret)); + AutogradRuntime::Get()->RecordImperativeFCompute( + op, attrs, &inputs, &outputs); + return outputs[0]; + } else { + return ret; + } +} + + +NDArray NDArray::At(index_t idx) const { + NDArray ret = this->Slice(idx, idx+1); + if (shape_.ndim() > 1) { + return ret.Reshape(TShape(shape_.data()+1, shape_.data()+shape_.ndim())); + } else { + return ret; + } +} + /*! * \brief run a ternary operation * \param lhs left operand @@ -545,63 +613,6 @@ NDArray &NDArray::operator/=(const real_t &src) { return ScalarOpApply(this, src); } -/*! 
- * \brief Get a broadcasted NDArray - * \param src the source ndarray - * \param dim dimension to broadcast - * \param size size after broadcasting - */ -void Broadcast(const NDArray& src, int dim, int size, NDArray *out) { - CHECK(0 <= dim && dim < static_cast(src.shape().ndim())) - << "Broadcast dimension out of bound."; - CHECK(src.shape()[dim] == 1) << "Cannot broadcast a dimension that is not 1."; - TShape new_shape = src.shape(); - new_shape[dim] = size; - if (out->is_none()) { - *out = NDArray(new_shape, src.ctx(), true, src.dtype()); - } else { - CHECK(out->ctx() == src.ctx()) << "target context mismatch"; - CHECK(out->shape() == new_shape) - << "invalid target shape: " << out->shape() << " should be: " << new_shape; - } - std::vector const_vars; - const_vars.push_back(src.var()); - size_t before = src.shape().ProdShape(0, dim); - size_t after = src.shape().ProdShape(dim + 1, src.shape().ndim()); - - // important: callback must always capture by value - NDArray ret = *out; - switch (src.ctx().dev_mask()) { - case cpu::kDevMask: { - Engine::Get()->PushSync([src, ret, before, size, after](RunContext ctx) { - ret.CheckAndAlloc(); - NDArray inter_in = src.Reshape(mshadow::Shape2(before, after)); - NDArray inter_out = ret.Reshape(mshadow::Shape3(before, size, after)); - TBlob tmp = inter_out.data(); - ndarray::EvalBroadcast(inter_in.data(), &tmp, size, ctx); - }, src.ctx(), const_vars, {ret.var()}, - FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); - break; - } -#if MXNET_USE_CUDA - case gpu::kDevMask: { - Engine::Get()->PushSync([src, ret, before, size, after](RunContext ctx) { - ret.CheckAndAlloc(); - NDArray inter_in = src.Reshape(mshadow::Shape2(before, after)); - NDArray inter_out = ret.Reshape(mshadow::Shape3(before, size, after)); - TBlob tmp = inter_out.data(); - ndarray::EvalBroadcast(inter_in.data(), &tmp, size, ctx); - // Wait GPU kernel to complete - ctx.get_stream()->Wait(); - }, src.ctx(), const_vars, {ret.var()}, - FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); - break; - } -#endif - default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; - } -} - void NDArray::Save(dmlc::Stream *strm) const { // save shape shape_.Save(strm); @@ -857,23 +868,6 @@ void Imdecode(NDArray *ret, NDArray mean, size_t index, #endif // MXNET_USE_OPENCV } -MXNET_REGISTER_NDARRAY_FUN(_broadcast) -.set_type_mask(kAcceptEmptyMutateTarget | kNDArrayArgBeforeScalar) -.set_body([](NDArray **u, real_t *s, NDArray **out, - int num_params, char **param_keys, char **param_vals) { - Broadcast(*u[0], - static_cast(s[0]), - static_cast(s[1]), - out[0]); - }) -.set_num_use_vars(1) -.set_num_scalars(2) -.set_num_mutate_vars(1) -.describe("Broadcast array in the given axis to the given size") -.add_argument("src", "NDArray-or-Symbol", "source ndarray") -.add_argument("axis", "int", "axis to broadcast") -.add_argument("size", "int", "size of broadcast"); - MXNET_REGISTER_NDARRAY_FUN(_imdecode) .set_type_mask(kAcceptEmptyMutateTarget | kNDArrayArgBeforeScalar) .set_body([](NDArray **u, real_t *s, NDArray **out, diff --git a/src/operator/batch_norm.cc b/src/operator/batch_norm.cc index 74f43b60b217..92457e41002e 100644 --- a/src/operator/batch_norm.cc +++ b/src/operator/batch_norm.cc @@ -80,6 +80,8 @@ then set ``gamma`` to 1 and its gradient to 0. 
.add_argument("data", "NDArray-or-Symbol", "Input data to batch normalization") .add_argument("gamma", "NDArray-or-Symbol", "gamma array") .add_argument("beta", "NDArray-or-Symbol", "beta array") +.add_argument("moving_mean", "NDArray-or-Symbol", "running mean of input") +.add_argument("moving_var", "NDArray-or-Symbol", "running variance of input") .add_arguments(BatchNormParam::__FIELDS__()); NNVM_REGISTER_OP(BatchNorm) diff --git a/src/operator/convolution-inl.h b/src/operator/convolution-inl.h index 8bd2ff5c9d6e..0036befcdb6a 100644 --- a/src/operator/convolution-inl.h +++ b/src/operator/convolution-inl.h @@ -368,13 +368,26 @@ class ConvolutionProp : public OperatorProperty { << "incorrect stride size: " << param_.stride; CHECK_GT(param_.dilate.Size(), 0U) \ << "incorrect dilate size: " << param_.dilate; - CHECK(dilated_ksize_x <= AddPad(dshape[2], param_.pad[0])) - << "kernel size exceed input"; Shape<3> oshape; oshape[0] = dshape[0]; oshape[1] = param_.num_filter; - oshape[2] = (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_x) / param_.stride[0] + 1; + oshape[2] = dshape[2] ? + (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_x) / param_.stride[0] + 1 : 0; SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value())); + // Perform incomplete shape inference. Fill in the missing values in data shape. + // 1) We can always fill in the batch_size. + // 2) We can back-calculate the input height/width if the corresponding stride is 1. + oshape = ConvertLayout((*out_shape)[0].get<3>(), param_.layout.value(), kNCW); + dshape[0] = oshape[0]; + if (oshape[2] && param_.stride[0] == 1) { + dshape[2] = oshape[2] + dilated_ksize_x - 1 - 2 * param_.pad[0]; + } + SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, + ConvertLayout(dshape, kNCW, param_.layout.value())); + // Check whether the kernel sizes are valid + if (dshape[2] != 0) { + CHECK_LE(dilated_ksize_x, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; + } return true; } else if (param_.kernel.ndim() == 2) { // 2d conv @@ -406,18 +419,20 @@ class ConvolutionProp : public OperatorProperty { Shape<4> oshape; oshape[0] = dshape[0]; oshape[1] = param_.num_filter; - oshape[2] = (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_y) / param_.stride[0] + 1; - oshape[3] = (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_x) / param_.stride[1] + 1; + oshape[2] = dshape[2] ? + (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_y) / param_.stride[0] + 1 : 0; + oshape[3] = dshape[3] ? + (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_x) / param_.stride[1] + 1 : 0; SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value())); // Perform incomplete shape inference. Fill in the missing values in data shape. // 1) We can always fill in the batch_size. // 2) We can back-calculate the input height/width if the corresponding stride is 1. 
       oshape = ConvertLayout((*out_shape)[0].get<4>(), param_.layout.value(), kNCHW);
       dshape[0] = oshape[0];
-      if (param_.stride[0] == 1) {
+      if (oshape[2] && param_.stride[0] == 1) {
         dshape[2] = oshape[2] + dilated_ksize_y - 1 - 2 * param_.pad[0];
       }
-      if (param_.stride[1] == 1) {
+      if (oshape[3] && param_.stride[1] == 1) {
         dshape[3] = oshape[3] + dilated_ksize_x - 1 - 2 * param_.pad[1];
       }
       SHAPE_ASSIGN_CHECK(*in_shape, conv::kData,
@@ -464,22 +479,25 @@ class ConvolutionProp : public OperatorProperty {
       Shape<5> oshape;
       oshape[0] = dshape[0];
       oshape[1] = param_.num_filter;
-      oshape[2] = (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_d) / param_.stride[0] + 1;
-      oshape[3] = (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_y) / param_.stride[1] + 1;
-      oshape[4] = (AddPad(dshape[4], param_.pad[2]) - dilated_ksize_x) / param_.stride[2] + 1;
+      oshape[2] = dshape[2] ?
+          (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_d) / param_.stride[0] + 1 : 0;
+      oshape[3] = dshape[3] ?
+          (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_y) / param_.stride[1] + 1 : 0;
+      oshape[4] = dshape[4] ?
+          (AddPad(dshape[4], param_.pad[2]) - dilated_ksize_x) / param_.stride[2] + 1 : 0;
       SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value()));
       // Perform incomplete shape inference. Fill in the missing values in data shape.
       // 1) We can always fill in the batch_size.
       // 2) We can back-calculate the input depth/height/width if the corresponding stride is 1.
       oshape = ConvertLayout((*out_shape)[0].get<5>(), param_.layout.value(), kNCDHW);
       dshape[0] = oshape[0];
-      if (param_.stride[0] == 1) {
+      if (oshape[2] && param_.stride[0] == 1) {
        dshape[2] = oshape[2] + dilated_ksize_d - 1 - 2 * param_.pad[0];
       }
-      if (param_.stride[1] == 1) {
+      if (oshape[3] && param_.stride[1] == 1) {
         dshape[3] = oshape[3] + dilated_ksize_y - 1 - 2 * param_.pad[1];
       }
-      if (param_.stride[2] == 1) {
+      if (oshape[4] && param_.stride[2] == 1) {
         dshape[4] = oshape[4] + dilated_ksize_x - 1 - 2 * param_.pad[2];
       }
       SHAPE_ASSIGN_CHECK(*in_shape, conv::kData,
diff --git a/src/operator/pooling.cu b/src/operator/pooling.cu
index c420852b1c8d..bc7716b946af 100644
--- a/src/operator/pooling.cu
+++ b/src/operator/pooling.cu
@@ -15,25 +15,25 @@ namespace op {

 template<>
 Operator *CreateOp<gpu>(PoolingParam param, int dtype) {
+  Operator *op = NULL;
 #if MXNET_USE_CUDNN == 1
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    if (!param.cudnn_off) {
+  if (!param.cudnn_off && param.kernel.ndim() > 1) {
+    MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
       switch (param.pool_type) {
         case pool_enum::kMaxPooling:
-          return new CuDNNPoolingOp<DType>(param);
+          op = new CuDNNPoolingOp<DType>(param);
+          break;
         case pool_enum::kAvgPooling:
-          return new CuDNNPoolingOp<DType>(param);
+          op = new CuDNNPoolingOp<DType>(param);
+          break;
         case pool_enum::kSumPooling:
           LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied.";
-          return new PoolingOp<gpu, DType>(param);
-        default:
-          LOG(FATAL) << "unknown pooling type";
-          return NULL;
+          break;
       }
-    }
-  });
+    });
+  }
+  if (op) return op;
 #endif  // MXNET_USE_CUDNN
-  Operator *op = NULL;
   MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
     if (pool_enum::kMaxPooling == param.pool_type
         || pool_enum::kAvgPooling == param.pool_type
@@ -48,4 +48,3 @@ Operator *CreateOp<gpu>(PoolingParam param, int dtype) {

 }  // namespace op
 }  // namespace mxnet
-
diff --git a/src/operator/tensor/elemwise_unary_op.cc b/src/operator/tensor/elemwise_unary_op.cc
index e5a8ed019768..ce29a2fdb308 100644
--- a/src/operator/tensor/elemwise_unary_op.cc
+++ b/src/operator/tensor/elemwise_unary_op.cc
@@ -96,6 +96,10 @@ MXNET_OPERATOR_REGISTER_UNARY(make_loss)
 .describe(R"code(Stops gradient computation.
 .. note:: ``make_loss`` is deprecated, use ``MakeLoss``.
 )code" ADD_FILELINE)
+.set_attr<nnvm::FListOutputNames>("FListOutputNames",
+  [](const NodeAttrs& attrs) {
+    return std::vector<std::string>{"loss"};
+  })
 .set_attr<FCompute>("FCompute<cpu>", IdentityCompute<cpu>)
 .set_attr<nnvm::FGradient>("FGradient",
   [](const nnvm::NodePtr& n, const std::vector<nnvm::NodeEntry>& ograds) {
diff --git a/tests/python/unittest/test_autograd.py b/tests/python/unittest/test_autograd.py
index 1abff65c5064..c84438d72363 100644
--- a/tests/python/unittest/test_autograd.py
+++ b/tests/python/unittest/test_autograd.py
@@ -1,5 +1,5 @@
 import mxnet.ndarray as nd
-from mxnet.contrib.autograd import grad, grad_and_loss, train, test
+from mxnet.contrib.autograd import *
 from mxnet.test_utils import *

 def autograd_assert(*args, **kwargs):
@@ -73,16 +73,18 @@ def f_with_mode(a, b, mode):

     autograd_assert(a, b, False,
         argnum=[0, 1], func=f_with_mode, grad_func=f_mul_grad)

+
 def test_training():
     x = nd.ones((10, 10))
-    with train():
+    with train_section():
         y = nd.Dropout(x, p=0.5)
         assert not (y.asnumpy() == x.asnumpy()).all()
-    with test():
+    with test_section():
         y = nd.Dropout(x, p=0.5)
         assert (y.asnumpy() == x.asnumpy()).all()

+
 if __name__ == "__main__":
     test_training()
     test_unary_func()
diff --git a/tests/python/unittest/test_metric.py b/tests/python/unittest/test_metric.py
new file mode 100644
index 000000000000..98740b05ee32
--- /dev/null
+++ b/tests/python/unittest/test_metric.py
@@ -0,0 +1,22 @@
+import mxnet as mx
+import json
+
+def check_metric(metric, *args, **kwargs):
+    metric = mx.metric.create(metric, *args, **kwargs)
+    str_metric = json.dumps(metric.get_config())
+    metric2 = mx.metric.create(str_metric)
+
+    assert metric.get_config() == metric2.get_config()
+
+
+def test_metrics():
+    check_metric('acc', axis=0)
+    check_metric('f1')
+    check_metric('perplexity', -1)
+    composite = mx.metric.create(['acc', 'f1'])
+    check_metric(composite)
+
+
+if __name__ == '__main__':
+    import nose
+    nose.runmodule()
diff --git a/tests/python/unittest/test_ndarray.py b/tests/python/unittest/test_ndarray.py
index 8673673cfdf7..7f0a1d2b6301 100644
--- a/tests/python/unittest/test_ndarray.py
+++ b/tests/python/unittest/test_ndarray.py
@@ -124,6 +124,7 @@ def test_ndarray_reshape():
     true_res = mx.nd.array([[1, 2, 3, 4],
                             [5, 6, 7, 8]])
     assert same(tensor.reshape((2, -1)).asnumpy(), true_res.asnumpy())
+    assert same(tensor.reshape((0, -1)).asnumpy(), true_res.asnumpy())
     true_res = mx.nd.array([[1, 2],
                             [3, 4],
                             [5, 6],
@@ -618,27 +619,5 @@ def test_iter():


 if __name__ == '__main__':
-    test_broadcast_binary()
-    test_ndarray_setitem()
-    test_ndarray_crop()
-    test_ndarray_concatenate()
-    test_broadcast()
-    test_ndarray_elementwise()
-    test_ndarray_elementwisesum()
-    test_ndarray_slice()
-    test_ndarray_pickle()
-    test_ndarray_saveload()
-    test_ndarray_copy()
-    test_ndarray_negate()
-    test_ndarray_scalar()
-    test_clip()
-    test_dot()
-    test_ndarray_choose()
-    test_ndarray_onehot()
-    test_ndarray_fill()
-    test_reduce()
-    test_arange()
-    test_order()
-    test_ndarray_equal()
-    test_take()
-    test_iter()
+    import nose
+    nose.runmodule()
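
Usage note for the MXNDArrayReshape change above: a 0 in the target shape now keeps the corresponding dimension of the source array, and a single -1 is inferred from the remaining elements. A minimal Python sketch of the front-end behaviour, mirroring the new assertion in test_ndarray_reshape; only the plain NDArray.reshape call is assumed, nothing beyond this patch:

import mxnet as mx

a = mx.nd.array([[1, 2, 3, 4], [5, 6, 7, 8]])   # shape (2, 4)
b = a.reshape((0, -1))   # 0 keeps dim 0 of a, -1 is inferred -> shape (2, 4)
c = a.reshape((-1, 2))   # -1 is inferred from the total size -> shape (4, 2)
print(b.shape, c.shape)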
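The convolution-inl.h hunks switch InferShape to incomplete inference: an unknown (0) spatial dimension no longer trips the kernel-size check, it simply propagates as 0, and with stride 1 the input height/width can be back-calculated from a known output. A sketch of the visible effect through the symbolic API; infer_shape_partial and the exact returned shapes are assumptions, not part of this patch:

import mxnet as mx

data = mx.sym.Variable('data')
conv = mx.sym.Convolution(data=data, kernel=(3, 3), num_filter=8, stride=(1, 1), pad=(1, 1))
# Leave height/width unknown (0): batch size and channels still propagate.
arg_shapes, out_shapes, _ = conv.infer_shape_partial(data=(4, 3, 0, 0))
print(out_shapes)   # expected to be [(4, 8, 0, 0)] -- unknown dims stay 0 instead of raising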
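With the ndarray.cc changes, NDArray::Reshape and NDArray::Slice are recorded as fake Reshape/slice_axis nodes whenever a training section is active, so gradients can flow through reshaping and basic indexing done imperatively. A rough sketch using the contrib autograd front end already exercised in test_autograd.py; the head-gradient convention of grad_and_loss and the zero-padded slice gradient are assumptions here:

import mxnet.ndarray as nd
from mxnet.contrib import autograd

def f(x):
    # x[1:3] goes through NDArray::Slice, which is now recorded as slice_axis,
    # so only the selected rows should receive gradient.
    return x[1:3] * 2

grad_func = autograd.grad_and_loss(f)
grad_vals, loss = grad_func(nd.ones((4, 4)))
print(grad_vals[0].asnumpy())   # expect 2s in rows 1 and 2, 0s elsewhere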