From f07d35e897b5afd27e133074ea92bfba8f8cd78f Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 15 Apr 2020 15:02:45 +0000 Subject: [PATCH 01/24] finish 5 changes --- python/mxnet/metric.py | 446 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 418 insertions(+), 28 deletions(-) diff --git a/python/mxnet/metric.py b/python/mxnet/metric.py index eb8f99a66d48..f790c2ded617 100644 --- a/python/mxnet/metric.py +++ b/python/mxnet/metric.py @@ -576,7 +576,9 @@ def update(self, labels, preds): num_samples = pred_label.shape[0] num_dims = len(pred_label.shape) if num_dims == 1: - self.sum_metric += (pred_label.flat == label.flat).sum() + num_correct = (pred_label.flat == label.flat).sum() + self.sum_metric += num_correct + self.global_sum_metric += num_correct elif num_dims == 2: num_classes = pred_label.shape[1] top_k = min(num_classes, self.top_k) @@ -594,9 +596,19 @@ class _BinaryClassificationMetrics(object): True/false positive and true/false negative counts are sufficient statistics for various classification metrics. This class provides the machinery to track those statistics across mini-batches of (label, prediction) pairs. + + Parameters + ---------- + beta : float, default 1 + weight of precision in harmonic mean. + threshold : float, default 0.5 + threshold for deciding whether the predictions are positive or negative. + """ - def __init__(self): + def __init__(self, threshold=0.5, beta=1): + self.threshold = threshold + self.beta = beta self.true_positives = 0 self.false_negatives = 0 self.false_positives = 0 @@ -619,9 +631,19 @@ def update_binary_stats(self, label, pred): """ pred = pred.asnumpy() label = label.asnumpy().astype('int32') - pred_label = numpy.argmax(pred, axis=1) - - check_label_shapes(label, pred) + if len(pred.shape) == 1: # assume each value refers to confidence(positive) + pass + elif pred.shape[-1] > 2: + raise ValueError("%s currently only supports binary classification." + % self.__class__.__name__) + elif pred.shape[-1] == 1: # classify positive when confidence(positive) > threshold + pred = pred.flat + else: + pred = pred.reshape(-1, 2)[:, 1] + pred_label = pred > self.threshold + label = label.flat + + check_label_shapes(label, pred_label) if len(numpy.unique(label)) > 2: raise ValueError("%s currently only supports binary classification." % self.__class__.__name__) @@ -674,14 +696,14 @@ def global_recall(self): @property def fscore(self): if self.precision + self.recall > 0: - return 2 * self.precision * self.recall / (self.precision + self.recall) + return (1 + self.beta ** 2) * self.precision * self.recall / (self.beta ** 2 * self.precision + self.recall) else: return 0. @property def global_fscore(self): if self.global_precision + self.global_recall > 0: - return 2 * self.global_precision * self.global_recall / (self.global_precision + self.global_recall) + return (1 + self.beta ** 2) * self.global_precision * self.global_recall / (self.beta ** 2 * self.global_precision + self.global_recall) else: return 0. @@ -723,6 +745,20 @@ def global_total_examples(self): return self.global_false_negatives + self.global_false_positives + \ self.global_true_negatives + self.global_true_positives + @property + def accuracy(self): + if self.total_examples > 0: + return float(self.true_positives + self.true_negatives) / self.total_examples + else: + return 0. + + @property + def global_accuracy(self): + if self.global_total_examples > 0: + return float(self.global_true_positives + self.global_true_negatives) / self.global_total_examples + else: + return 0. 
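# Illustrative sketch: a plain-numpy walk-through of the threshold/beta behaviour that
# _BinaryClassificationMetrics gains above. The arrays `probs` and `labels` are made-up
# values; the formulas mirror the fscore/accuracy properties in the diff.
import numpy as np

probs = np.array([0.2, 0.8, 0.4, 0.6])       # confidence(positive) per sample
labels = np.array([0, 1, 1, 1])
threshold, beta = 0.5, 2.0

pred = (probs > threshold).astype(int)       # classify positive when confidence > threshold
tp = int(((pred == 1) & (labels == 1)).sum())
fp = int(((pred == 1) & (labels == 0)).sum())
fn = int(((pred == 0) & (labels == 1)).sum())

precision = tp / (tp + fp) if tp + fp else 0.0
recall = tp / (tp + fn) if tp + fn else 0.0
f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
fbeta = ((1 + beta ** 2) * precision * recall /
         (beta ** 2 * precision + recall)) if precision + recall else 0.0
print(precision, recall, f1, fbeta)          # 1.0 0.667 0.8 0.714 -- beta > 1 favours recall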
+ def local_reset_stats(self): self.false_positives = 0 self.false_negatives = 0 @@ -768,6 +804,8 @@ class F1(EvalMetric): label_names : list of str, or None Name of labels that should be used when updating with update_dict. By default include all labels. + threshold : float, default 0.5 + threshold for postive confidence value. average : str, default 'macro' Strategy to be used for aggregating across mini-batches. "macro": average the F1 scores for each batch. @@ -784,9 +822,106 @@ class F1(EvalMetric): """ def __init__(self, name='f1', - output_names=None, label_names=None, average="macro"): + output_names=None, label_names=None, threshold=0.5, average="macro"): + self.average = average + self.metrics = _BinaryClassificationMetrics(threshold=threshold) + EvalMetric.__init__(self, name=name, + output_names=output_names, label_names=label_names, + has_global_stats=True) + + def update(self, labels, preds): + """Updates the internal evaluation result. + + Parameters + ---------- + labels : list of `NDArray` + The labels of the data. + + preds : list of `NDArray` + Predicted values. + """ + labels, preds = check_label_shapes(labels, preds, True) + + for label, pred in zip(labels, preds): + self.metrics.update_binary_stats(label, pred) + + if self.average == "macro": + self.sum_metric += self.metrics.fscore + self.global_sum_metric += self.metrics.global_fscore + self.num_inst += 1 + self.global_num_inst += 1 + self.metrics.reset_stats() + else: + self.sum_metric = self.metrics.fscore * self.metrics.total_examples + self.global_sum_metric = self.metrics.global_fscore * self.metrics.global_total_examples + self.num_inst = self.metrics.total_examples + self.global_num_inst = self.metrics.global_total_examples + + def reset(self): + """Resets the internal evaluation result to initial state.""" + self.sum_metric = 0. + self.num_inst = 0 + self.global_num_inst = 0 + self.global_sum_metric = 0.0 + self.metrics.reset_stats() + + def reset_local(self): + """Resets the internal evaluation result to initial state.""" + self.sum_metric = 0. + self.num_inst = 0 + self.metrics.local_reset_stats() + +@register +class Fbeta(EvalMetric): + """Computes the Fbeta score of a binary classification problem. + + The Fbeta score is equivalent to harmonic mean of the precision and recall, + where the best value is 1.0 and the worst value is 0.0. The formula for Fbeta score is:: + + Fbeta = (1 + beta ** 2) * (precision * recall) / (beta ** 2 * precision + recall) + + The formula for precision and recall is:: + + precision = true_positives / (true_positives + false_positives) + recall = true_positives / (true_positives + false_negatives) + + .. note:: + + This Fbeta score only supports binary classification. + + Parameters + ---------- + name : str + Name of this metric instance for display. + output_names : list of str, or None + Name of predictions that should be used when updating with update_dict. + By default include all predictions. + label_names : list of str, or None + Name of labels that should be used when updating with update_dict. + By default include all labels. + beta : float, default 1 + weight of precision in harmonic mean. + threshold : float, default 0.5 + threshold for deciding whether the predictions are positive or negative. + average : str, default 'macro' + Strategy to be used for aggregating across mini-batches. + "macro": average the F1 scores for each batch. + "micro": compute a single F1 score across all batches. 
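# Illustrative sketch: how the `average` option aggregates F1/Fbeta across mini-batches.
# "macro" averages one score per batch, "micro" pools the confusion counts before scoring.
# The two hypothetical batches below are chosen only to show that the results can differ.
import numpy as np

def f1_from_counts(tp, fp, fn):
    p = tp / (tp + fp) if tp + fp else 0.0
    r = tp / (tp + fn) if tp + fn else 0.0
    return 2 * p * r / (p + r) if p + r else 0.0

batches = [dict(tp=1, fp=0, fn=3), dict(tp=5, fp=1, fn=0)]

macro = np.mean([f1_from_counts(**b) for b in batches])          # (0.4 + 0.909) / 2 ~= 0.655
micro = f1_from_counts(sum(b['tp'] for b in batches),
                       sum(b['fp'] for b in batches),
                       sum(b['fn'] for b in batches))            # ~= 0.75
print(macro, micro)   # macro weights each batch equally, micro weights each sample equally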
+ + Examples + -------- + >>> predicts = [mx.nd.array([[0.3, 0.7], [0., 1.], [0.4, 0.6]])] + >>> labels = [mx.nd.array([0., 1., 1.])] + >>> fbeta = mx.metric.Fbeta(beta=2) + >>> fbeta.update(preds = predicts, labels = labels) + >>> print fbeta.get() + ('fbeta', 0.9090909090909091) + """ + + def __init__(self, name='fbeta', + output_names=None, label_names=None, beta=1, threshold=0.5, average="macro"): self.average = average - self.metrics = _BinaryClassificationMetrics() + self.metrics = _BinaryClassificationMetrics(threshold=threshold, beta=beta) EvalMetric.__init__(self, name=name, output_names=output_names, label_names=label_names, has_global_stats=True) @@ -834,6 +969,76 @@ def reset_local(self): self.metrics.local_reset_stats() +@register +class BinaryAccuracy(EvalMetric): + """Computes the accuracy of a binary classification problem. + + Parameters + ---------- + name : str + Name of this metric instance for display. + output_names : list of str, or None + Name of predictions that should be used when updating with update_dict. + By default include all predictions. + label_names : list of str, or None + Name of labels that should be used when updating with update_dict. + By default include all labels. + threshold : float, default 0.5 + threshold for deciding whether the predictions are positive or negative. + + Examples + -------- + >>> predicts = [mx.nd.array([0.7, 1, 0.55])] + >>> labels = [mx.nd.array([0., 1., 0.])] + >>> bacc = mx.metric.BinaryAccuracy(threshold=0.6) + >>> bacc.update(preds = predicts, labels = labels) + >>> print bacc.get() + ('binary_accuracy', 0.6666666666666666) + """ + + def __init__(self, name='binary_accuracy', + output_names=None, label_names=None, threshold=0.5): + self.metrics = _BinaryClassificationMetrics(threshold=threshold) + EvalMetric.__init__(self, name=name, + output_names=output_names, label_names=label_names, + has_global_stats=True) + + def update(self, labels, preds): + """Updates the internal evaluation result. + + Parameters + ---------- + labels : list of `NDArray` + The labels of the data. + + preds : list of `NDArray` + Predicted values. + """ + labels, preds = check_label_shapes(labels, preds, True) + + for label, pred in zip(labels, preds): + self.metrics.update_binary_stats(label, pred) + + self.sum_metric = self.metrics.accuracy * self.metrics.total_examples + self.global_sum_metric = self.metrics.global_accuracy * self.metrics.global_total_examples + self.num_inst = self.metrics.total_examples + self.global_num_inst = self.metrics.global_total_examples + + def reset(self): + """Resets the internal evaluation result to initial state.""" + self.sum_metric = 0. + self.num_inst = 0 + self.global_num_inst = 0 + self.global_sum_metric = 0.0 + self.metrics.reset_stats() + + def reset_local(self): + """Resets the internal evaluation result to initial state.""" + self.sum_metric = 0. + self.num_inst = 0 + self.metrics.local_reset_stats() + + @register class MCC(EvalMetric): """Computes the Matthews Correlation Coefficient of a binary classification problem. @@ -1092,7 +1297,10 @@ class MAE(EvalMetric): label_names : list of str, or None Name of labels that should be used when updating with update_dict. By default include all labels. - + average : str, default 'macro' + Strategy to be used for aggregating across mini-batches. + "macro": average MAE results for each batch. + "micro": compute a single MAE result across all batches. 
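# Illustrative sketch: the hand computation behind the BinaryAccuracy doctest above.
# With threshold=0.6 the scores [0.7, 1, 0.55] binarize to [1, 1, 0]; against labels
# [0, 1, 0] that is one true positive and one true negative out of three samples.
import numpy as np

scores = np.array([0.7, 1.0, 0.55])
labels = np.array([0, 1, 0])
pred = (scores > 0.6).astype(int)
print((pred == labels).mean())    # 0.666..., i.e. (TP + TN) / total_examples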
Examples -------- >>> predicts = [mx.nd.array(np.array([3, -0.5, 2, 7]).reshape(4,1))] @@ -1104,11 +1312,12 @@ class MAE(EvalMetric): """ def __init__(self, name='mae', - output_names=None, label_names=None): + output_names=None, label_names=None, average='macro'): super(MAE, self).__init__( name, output_names=output_names, label_names=label_names, has_global_stats=True) - + self.average = average + def update(self, labels, preds): """Updates the internal evaluation result. @@ -1130,12 +1339,18 @@ def update(self, labels, preds): label = label.reshape(label.shape[0], 1) if len(pred.shape) == 1: pred = pred.reshape(pred.shape[0], 1) - - mae = numpy.abs(label - pred).mean() + + if self.average == "macro": + mae = numpy.abs(label - pred).mean() + num_inst = 1 + else: + num_inst = label.shape[0] + mae = numpy.abs(label - pred).reshape(num_inst, -1).mean(axis=-1).sum() + self.sum_metric += mae self.global_sum_metric += mae - self.num_inst += 1 # numpy.prod(label.shape) - self.global_num_inst += 1 # numpy.prod(label.shape) + self.num_inst += num_inst + self.global_num_inst += num_inst @register @@ -1157,7 +1372,10 @@ class MSE(EvalMetric): label_names : list of str, or None Name of labels that should be used when updating with update_dict. By default include all labels. - + average : str, default 'macro' + Strategy to be used for aggregating across mini-batches. + "macro": average MSE results for each batch. + "micro": compute a single MSE result across all batches. Examples -------- >>> predicts = [mx.nd.array(np.array([3, -0.5, 2, 7]).reshape(4,1))] @@ -1168,11 +1386,12 @@ class MSE(EvalMetric): ('mse', 0.375) """ def __init__(self, name='mse', - output_names=None, label_names=None): + output_names=None, label_names=None, average="macro"): super(MSE, self).__init__( name, output_names=output_names, label_names=label_names, has_global_stats=True) - + self.average = average + def update(self, labels, preds): """Updates the internal evaluation result. @@ -1195,11 +1414,16 @@ def update(self, labels, preds): if len(pred.shape) == 1: pred = pred.reshape(pred.shape[0], 1) - mse = ((label - pred)**2.0).mean() + if self.average == "macro": + mse = ((label - pred)**2.0).mean() + num_inst = 1 + else: + num_inst = label.shape[0] + mse = ((label - pred)**2.0).reshape(num_inst, -1).mean(axis=-1).sum() self.sum_metric += mse self.global_sum_metric += mse - self.num_inst += 1 # numpy.prod(label.shape) - self.global_num_inst += 1 # numpy.prod(label.shape) + self.num_inst += num_inst + self.global_num_inst += num_inst @register @@ -1221,7 +1445,10 @@ class RMSE(EvalMetric): label_names : list of str, or None Name of labels that should be used when updating with update_dict. By default include all labels. - + average : str, default 'macro' + Strategy to be used for aggregating across mini-batches. + "macro": average RMSE results for each batch. + "micro": compute a single RSME result across all batches. Examples -------- >>> predicts = [mx.nd.array(np.array([3, -0.5, 2, 7]).reshape(4,1))] @@ -1232,11 +1459,12 @@ class RMSE(EvalMetric): ('rmse', 0.612372457981) """ def __init__(self, name='rmse', - output_names=None, label_names=None): + output_names=None, label_names=None, average="macro"): super(RMSE, self).__init__( name, output_names=output_names, label_names=label_names, has_global_stats=True) - + self.average = average + def update(self, labels, preds): """Updates the internal evaluation result. 
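# Illustrative sketch: why the new `average` flag matters for MAE/MSE/RMSE. With equally
# sized batches the two modes agree, but with unequal batch sizes "macro" (mean of
# per-batch means) and "micro" (mean over all samples) diverge. Errors below are made up.
import numpy as np

batch_abs_errors = [np.array([1.0, 3.0]), np.array([2.0, 2.0, 2.0, 10.0])]

macro_mae = np.mean([b.mean() for b in batch_abs_errors])    # (2.0 + 4.0) / 2 = 3.0
micro_mae = np.concatenate(batch_abs_errors).mean()          # 20.0 / 6 ~= 3.33
print(macro_mae, micro_mae)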
@@ -1259,13 +1487,175 @@ def update(self, labels, preds): if len(pred.shape) == 1: pred = pred.reshape(pred.shape[0], 1) - rmse = numpy.sqrt(((label - pred)**2.0).mean()) + if self.average == "macro": + rmse = numpy.sqrt(((label - pred)**2.0).mean()) + num_inst = 1 + else: + num_inst = label.shape[0] + rmse = numpy.sqrt(((label - pred)**2.0).reshape(num_inst, -1).mean(axis=1)).sum() self.sum_metric += rmse self.global_sum_metric += rmse - self.num_inst += 1 - self.global_num_inst += 1 + self.num_inst += num_inst + self.global_num_inst += num_inst + + +@register +class MeanPairwiseDistance(EvalMetric): + """Computes Mean Pairwise Distance. + + The mean pairwise distance is given by + .. math:: + \\sqrt{\\frac{(\\sum_i^n (y_i - \\hat{y}_i)^p)^\\frac{1}{p}}{n}} + Parameters + ---------- + name : str + Name of this metric instance for display. + output_names : list of str, or None + Name of predictions that should be used when updating with update_dict. + By default include all predictions. + label_names : list of str, or None + Name of labels that should be used when updating with update_dict. + By default include all labels. + p : float, default 2 + calculating distance using the p-norm + average : str, default 'macro' + Strategy to be used for aggregating across mini-batches. + "macro": average MPD results for each batch. + "micro": compute a single MPD result across all batches. + Examples + -------- + >>> predicts = [mx.nd.array([[1., 2.], [3., 4.]])] + >>> labels = [mx.nd.array([[1., 0.], [4., 2.]])] + >>> mpd = mx.metric.MeanPairwiseDistance() + >>> mpd.update(labels = labels, preds = predicts) + >>> print mpd.get() + ('mpd', 2.1180338859558105) + """ + def __init__(self, name='mpd', + output_names=None, label_names=None, p=2, average="micro"): + super(MeanPairwiseDistance, self).__init__( + name, output_names=output_names, label_names=label_names, + has_global_stats=True) + self.average = average + self.p = p + + def update(self, labels, preds): + """Updates the internal evaluation result. + + Parameters + ---------- + labels : list of `NDArray` + The labels of the data. + + preds : list of `NDArray` + Predicted values. + """ + labels, preds = check_label_shapes(labels, preds, True) + + for label, pred in zip(labels, preds): + label = label.asnumpy() + pred = pred.asnumpy() + + label = label.reshape(label.shape[0], -1) + pred = pred.reshape(pred.shape[0], -1) + + pd = (((label - pred) ** self.p).sum(axis=-1)) ** (1./self.p) + if self.average == "macro": + pd = pd.mean() + num_inst = 1 + else: + pd = pd.sum() + num_inst = label.shape[0] + + self.sum_metric += pd + self.global_sum_metric += pd + self.num_inst += num_inst + self.global_num_inst += num_inst + + +@register +class MeanCosineSimilarity(EvalMetric): + """Computes Mean Cosine Similarity. + + The mean cosine similarity is given by + + .. math:: + cos\_sim(label, pred) = \frac{{label}.{pred}}{max(||label||.||pred||, eps)} + (calculating on the last dimension of label and pred.) + + Parameters + ---------- + name : str + Name of this metric instance for display. + output_names : list of str, or None + Name of predictions that should be used when updating with update_dict. + By default include all predictions. + label_names : list of str, or None + Name of labels that should be used when updating with update_dict. + By default include all labels. + eps : float, default 1e-8 + small vale to avoid division by zero. + average : str, default 'micro' + Strategy to be used for aggregating across mini-batches. 
+ "macro": average RMSE results for each batch. + "micro": compute a single RSME result across all batches. + Examples + -------- + >>> predicts = [mx.nd.array([[1., 0.], [1., 1.]])] + >>> labels = [mx.nd.array([[3., 4.], [2., 2.]])] + >>> mcs = mx.metric.MeanCosineSimilarity() + >>> mcs.update(labels = labels, preds = predicts) + >>> print mcs.get() + ('cos_sim', 0.8) + """ + def __init__(self, name='cos_sim', + output_names=None, label_names=None, eps=1e-8, average="micro"): + super(MeanCosineSimilarity, self).__init__( + name, output_names=output_names, label_names=label_names, + has_global_stats=True) + self.average = average + self.eps = eps + + def update(self, labels, preds): + """Updates the internal evaluation result. + + Parameters + ---------- + labels : list of `NDArray` + The labels of the data. + + preds : list of `NDArray` + Predicted values. + """ + labels, preds = check_label_shapes(labels, preds, True) + + for label, pred in zip(labels, preds): + label = label.asnumpy() + pred = pred.asnumpy() + + if len(label.shape) == 1: + label = label.reshape(1, label.shape[0]) + if len(pred.shape) == 1: + pred = pred.reshape(1, pred.shape[0]) + + sim = (label * pred).sum(axis=-1) + n_p = numpy.linalg.norm(pred, axis=-1) + n_l = numpy.linalg.norm(label, axis=-1) + sim = sim / numpy.maximum(n_l * n_p, self.eps) + if self.average == "macro": + sim = sim.mean() + num_inst = 1 + else: + sim = sim.sum() + num_inst = numpy.prod(label.shape[:-1]) + self.sum_metric += sim + self.global_sum_metric += sim + self.num_inst += num_inst + self.global_num_inst += num_inst + + @register @alias('ce') class CrossEntropy(EvalMetric): From 575f23b49dd0d5c7504b6dfd05314d10ada8b314 Mon Sep 17 00:00:00 2001 From: acphile Date: Thu, 16 Apr 2020 14:53:18 +0800 Subject: [PATCH 02/24] move metric.py to gluon, replace mx.metric with mx.gluon.metric in python/mxnet/ --- python/mxnet/__init__.py | 1 - .../contrib/svrg_optimization/svrg_module.py | 4 +- .../gluon/contrib/estimator/estimator.py | 2 +- .../gluon/contrib/estimator/event_handler.py | 4 +- python/mxnet/gluon/contrib/estimator/utils.py | 4 +- python/mxnet/{ => gluon}/metric.py | 58 +++++++++---------- python/mxnet/model.py | 2 +- python/mxnet/module/base_module.py | 4 +- 8 files changed, 39 insertions(+), 40 deletions(-) rename python/mxnet/{ => gluon}/metric.py (98%) diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py index 49f10aace531..284788fa2276 100644 --- a/python/mxnet/__init__.py +++ b/python/mxnet/__init__.py @@ -51,7 +51,6 @@ from . import random from . import optimizer from . import model -from . import metric from . import notebook from . 
import initializer # use mx.init as short for mx.initializer diff --git a/python/mxnet/contrib/svrg_optimization/svrg_module.py b/python/mxnet/contrib/svrg_optimization/svrg_module.py index eecb87cf25bb..fc5a6c224809 100644 --- a/python/mxnet/contrib/svrg_optimization/svrg_module.py +++ b/python/mxnet/contrib/svrg_optimization/svrg_module.py @@ -478,8 +478,8 @@ def fit(self, train_data, eval_data=None, eval_metric='acc', if validation_metric is None: validation_metric = eval_metric - if not isinstance(eval_metric, mx.metric.EvalMetric): - eval_metric = mx.metric.create(eval_metric) + if not isinstance(eval_metric, mx.gluon.metric.EvalMetric): + eval_metric = mx.gluon.metric.create(eval_metric) ################################################################################ # training loop diff --git a/python/mxnet/gluon/contrib/estimator/estimator.py b/python/mxnet/gluon/contrib/estimator/estimator.py index ed8a53d7c3a6..c47e02b7213f 100644 --- a/python/mxnet/gluon/contrib/estimator/estimator.py +++ b/python/mxnet/gluon/contrib/estimator/estimator.py @@ -33,7 +33,7 @@ from ...trainer import Trainer from ...utils import split_and_load from ....context import Context, cpu, gpu, num_gpus -from ....metric import Loss as metric_loss +from ...metric import Loss as metric_loss from .batch_processor import BatchProcessor __all__ = ['Estimator'] diff --git a/python/mxnet/gluon/contrib/estimator/event_handler.py b/python/mxnet/gluon/contrib/estimator/event_handler.py index 338c7f00e05e..5709a803a610 100644 --- a/python/mxnet/gluon/contrib/estimator/event_handler.py +++ b/python/mxnet/gluon/contrib/estimator/event_handler.py @@ -25,8 +25,8 @@ import numpy as np -from ....metric import CompositeEvalMetric, EvalMetric -from ....metric import Loss as metric_loss +from ...metric import CompositeEvalMetric, EvalMetric +from ...metric import Loss as metric_loss from .utils import _check_metrics __all__ = ['TrainBegin', 'TrainEnd', 'EpochBegin', 'EpochEnd', 'BatchBegin', 'BatchEnd', diff --git a/python/mxnet/gluon/contrib/estimator/utils.py b/python/mxnet/gluon/contrib/estimator/utils.py index d9126a2f6763..dc0c4bf8f081 100644 --- a/python/mxnet/gluon/contrib/estimator/utils.py +++ b/python/mxnet/gluon/contrib/estimator/utils.py @@ -20,7 +20,7 @@ """Gluon Estimator Utility Functions""" from ...loss import SoftmaxCrossEntropyLoss -from ....metric import Accuracy, EvalMetric, CompositeEvalMetric +from ...metric import Accuracy, EvalMetric, CompositeEvalMetric def _check_metrics(metrics): if isinstance(metrics, CompositeEvalMetric): @@ -31,7 +31,7 @@ def _check_metrics(metrics): metrics = metrics or [] if not all([isinstance(metric, EvalMetric) for metric in metrics]): raise ValueError("metrics must be a Metric or a list of Metric, " - "refer to mxnet.metric.EvalMetric: {}".format(metrics)) + "refer to mxnet.gluon.metric.EvalMetric: {}".format(metrics)) return metrics def _check_handler_metric_ref(handler, known_metrics): diff --git a/python/mxnet/metric.py b/python/mxnet/gluon/metric.py similarity index 98% rename from python/mxnet/metric.py rename to python/mxnet/gluon/metric.py index f790c2ded617..1c9073096adb 100644 --- a/python/mxnet/metric.py +++ b/python/mxnet/gluon/metric.py @@ -24,9 +24,9 @@ import numpy -from .base import numeric_types, string_types -from . import ndarray -from . import registry +from ..base import numeric_types, string_types +from .. import ndarray +from .. 
import registry def check_label_shapes(labels, preds, wrap=False, shape=False): @@ -256,9 +256,9 @@ def create(metric, *args, **kwargs): >>> def custom_metric(label, pred): ... return np.mean(np.abs(label - pred)) ... - >>> metric1 = mx.metric.create('acc') - >>> metric2 = mx.metric.create(custom_metric) - >>> metric3 = mx.metric.create([metric1, metric2, 'rmse']) + >>> metric1 = mx.gluon.metric.create('acc') + >>> metric2 = mx.gluon.metric.create(custom_metric) + >>> metric3 = mx.gluon.metric.create([metric1, metric2, 'rmse']) """ if callable(metric): return CustomMetric(metric, *args, **kwargs) @@ -293,9 +293,9 @@ class CompositeEvalMetric(EvalMetric): -------- >>> predicts = [mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] >>> labels = [mx.nd.array([0, 1, 1])] - >>> eval_metrics_1 = mx.metric.Accuracy() - >>> eval_metrics_2 = mx.metric.F1() - >>> eval_metrics = mx.metric.CompositeEvalMetric() + >>> eval_metrics_1 = mx.gluon.metric.Accuracy() + >>> eval_metrics_2 = mx.gluon.metric.F1() + >>> eval_metrics = mx.gluon.metric.CompositeEvalMetric() >>> for child_metric in [eval_metrics_1, eval_metrics_2]: >>> eval_metrics.add(child_metric) >>> eval_metrics.update(labels = labels, preds = predicts) @@ -460,7 +460,7 @@ class Accuracy(EvalMetric): -------- >>> predicts = [mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] >>> labels = [mx.nd.array([0, 1, 1])] - >>> acc = mx.metric.Accuracy() + >>> acc = mx.gluon.metric.Accuracy() >>> acc.update(preds = predicts, labels = labels) >>> print acc.get() ('accuracy', 0.6666666666666666) @@ -535,7 +535,7 @@ class TopKAccuracy(EvalMetric): >>> top_k = 3 >>> labels = [mx.nd.array([2, 6, 9, 2, 3, 4, 7, 8, 9, 6])] >>> predicts = [mx.nd.array(np.random.rand(10, 10))] - >>> acc = mx.metric.TopKAccuracy(top_k=top_k) + >>> acc = mx.gluon.metric.TopKAccuracy(top_k=top_k) >>> acc.update(labels, predicts) >>> print acc.get() ('top_k_accuracy', 0.3) @@ -815,7 +815,7 @@ class F1(EvalMetric): -------- >>> predicts = [mx.nd.array([[0.3, 0.7], [0., 1.], [0.4, 0.6]])] >>> labels = [mx.nd.array([0., 1., 1.])] - >>> f1 = mx.metric.F1() + >>> f1 = mx.gluon.metric.F1() >>> f1.update(preds = predicts, labels = labels) >>> print f1.get() ('f1', 0.8) @@ -912,7 +912,7 @@ class Fbeta(EvalMetric): -------- >>> predicts = [mx.nd.array([[0.3, 0.7], [0., 1.], [0.4, 0.6]])] >>> labels = [mx.nd.array([0., 1., 1.])] - >>> fbeta = mx.metric.Fbeta(beta=2) + >>> fbeta = mx.gluon.metric.Fbeta(beta=2) >>> fbeta.update(preds = predicts, labels = labels) >>> print fbeta.get() ('fbeta', 0.9090909090909091) @@ -990,7 +990,7 @@ class BinaryAccuracy(EvalMetric): -------- >>> predicts = [mx.nd.array([0.7, 1, 0.55])] >>> labels = [mx.nd.array([0., 1., 0.])] - >>> bacc = mx.metric.BinaryAccuracy(threshold=0.6) + >>> bacc = mx.gluon.metric.BinaryAccuracy(threshold=0.6) >>> bacc.update(preds = predicts, labels = labels) >>> print bacc.get() ('binary_accuracy', 0.6666666666666666) @@ -1092,9 +1092,9 @@ class MCC(EvalMetric): [0.]*(false_positives + true_negatives) + [1.]*(false_negatives + true_positives) )] - >>> f1 = mx.metric.F1() + >>> f1 = mx.gluon.metric.F1() >>> f1.update(preds = predicts, labels = labels) - >>> mcc = mx.metric.MCC() + >>> mcc = mx.gluon.metric.MCC() >>> mcc.update(preds = predicts, labels = labels) >>> print f1.get() ('f1', 0.95233560306652054) @@ -1203,7 +1203,7 @@ class Perplexity(EvalMetric): -------- >>> predicts = [mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] >>> labels = [mx.nd.array([0, 1, 1])] - >>> perp = mx.metric.Perplexity(ignore_label=None) + >>> perp = 
mx.gluon.metric.Perplexity(ignore_label=None) >>> perp.update(labels, predicts) >>> print perp.get() ('Perplexity', 1.7710976285155853) @@ -1305,7 +1305,7 @@ class MAE(EvalMetric): -------- >>> predicts = [mx.nd.array(np.array([3, -0.5, 2, 7]).reshape(4,1))] >>> labels = [mx.nd.array(np.array([2.5, 0.0, 2, 8]).reshape(4,1))] - >>> mean_absolute_error = mx.metric.MAE() + >>> mean_absolute_error = mx.gluon.metric.MAE() >>> mean_absolute_error.update(labels = labels, preds = predicts) >>> print mean_absolute_error.get() ('mae', 0.5) @@ -1380,7 +1380,7 @@ class MSE(EvalMetric): -------- >>> predicts = [mx.nd.array(np.array([3, -0.5, 2, 7]).reshape(4,1))] >>> labels = [mx.nd.array(np.array([2.5, 0.0, 2, 8]).reshape(4,1))] - >>> mean_squared_error = mx.metric.MSE() + >>> mean_squared_error = mx.gluon.metric.MSE() >>> mean_squared_error.update(labels = labels, preds = predicts) >>> print mean_squared_error.get() ('mse', 0.375) @@ -1453,7 +1453,7 @@ class RMSE(EvalMetric): -------- >>> predicts = [mx.nd.array(np.array([3, -0.5, 2, 7]).reshape(4,1))] >>> labels = [mx.nd.array(np.array([2.5, 0.0, 2, 8]).reshape(4,1))] - >>> root_mean_squared_error = mx.metric.RMSE() + >>> root_mean_squared_error = mx.gluon.metric.RMSE() >>> root_mean_squared_error.update(labels = labels, preds = predicts) >>> print root_mean_squared_error.get() ('rmse', 0.612372457981) @@ -1528,7 +1528,7 @@ class MeanPairwiseDistance(EvalMetric): -------- >>> predicts = [mx.nd.array([[1., 2.], [3., 4.]])] >>> labels = [mx.nd.array([[1., 0.], [4., 2.]])] - >>> mpd = mx.metric.MeanPairwiseDistance() + >>> mpd = mx.gluon.metric.MeanPairwiseDistance() >>> mpd.update(labels = labels, preds = predicts) >>> print mpd.get() ('mpd', 2.1180338859558105) @@ -1605,7 +1605,7 @@ class MeanCosineSimilarity(EvalMetric): -------- >>> predicts = [mx.nd.array([[1., 0.], [1., 1.]])] >>> labels = [mx.nd.array([[3., 4.], [2., 2.]])] - >>> mcs = mx.metric.MeanCosineSimilarity() + >>> mcs = mx.gluon.metric.MeanCosineSimilarity() >>> mcs.update(labels = labels, preds = predicts) >>> print mcs.get() ('cos_sim', 0.8) @@ -1688,7 +1688,7 @@ class :math:`k`. 
-------- >>> predicts = [mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] >>> labels = [mx.nd.array([0, 1, 1])] - >>> ce = mx.metric.CrossEntropy() + >>> ce = mx.gluon.metric.CrossEntropy() >>> ce.update(labels, predicts) >>> print ce.get() ('cross-entropy', 0.57159948348999023) @@ -1760,7 +1760,7 @@ class NegativeLogLikelihood(EvalMetric): -------- >>> predicts = [mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] >>> labels = [mx.nd.array([0, 1, 1])] - >>> nll_loss = mx.metric.NegativeLogLikelihood() + >>> nll_loss = mx.gluon.metric.NegativeLogLikelihood() >>> nll_loss.update(labels, predicts) >>> print nll_loss.get() ('nll-loss', 0.57159948348999023) @@ -1829,7 +1829,7 @@ class PearsonCorrelation(EvalMetric): -------- >>> predicts = [mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] >>> labels = [mx.nd.array([[1, 0], [0, 1], [0, 1]])] - >>> pr = mx.metric.PearsonCorrelation() + >>> pr = mx.gluon.metric.PearsonCorrelation() >>> pr.update(labels, predicts) >>> print pr.get() ('pearsonr', 0.42163704544016178) @@ -1957,9 +1957,9 @@ class PCC(EvalMetric): [0]*(false_positives + true_negatives) + [1]*(false_negatives + true_positives) )] - >>> f1 = mx.metric.F1() + >>> f1 = mx.gluon.metric.F1() >>> f1.update(preds = predicts, labels = labels) - >>> pcc = mx.metric.PCC() + >>> pcc = mx.gluon.metric.PCC() >>> pcc.update(preds = predicts, labels = labels) >>> print f1.get() ('f1', 0.95233560306652054) @@ -2129,7 +2129,7 @@ class CustomMetric(EvalMetric): >>> predicts = [mx.nd.array(np.array([3, -0.5, 2, 7]).reshape(4,1))] >>> labels = [mx.nd.array(np.array([2.5, 0.0, 2, 8]).reshape(4,1))] >>> feval = lambda x, y : (x + y).mean() - >>> eval_metrics = mx.metric.CustomMetric(feval=feval) + >>> eval_metrics = mx.gluon.metric.CustomMetric(feval=feval) >>> eval_metrics.update(labels, predicts) >>> print eval_metrics.get() ('custom()', 6.0) @@ -2209,7 +2209,7 @@ def np(numpy_feval, name=None, allow_extra_outputs=False): >>> def custom_metric(label, pred): ... return np.mean(np.abs(label-pred)) ... - >>> metric = mx.metric.np(custom_metric) + >>> metric = mx.gluon.metric.np(custom_metric) """ def feval(label, pred): """Internal eval function.""" diff --git a/python/mxnet/model.py b/python/mxnet/model.py index fa247624975d..bd80ec01738b 100644 --- a/python/mxnet/model.py +++ b/python/mxnet/model.py @@ -30,7 +30,7 @@ from . import ndarray as nd from . import symbol as sym from . import optimizer as opt -from . import metric +from .gluon import metric from . import kvstore as kvs from .context import Context, cpu from .initializer import Uniform diff --git a/python/mxnet/module/base_module.py b/python/mxnet/module/base_module.py index 053a00b3abba..9154aebb4b25 100644 --- a/python/mxnet/module/base_module.py +++ b/python/mxnet/module/base_module.py @@ -24,7 +24,7 @@ import warnings import numpy as np -from .. import metric +from ..gluon import metric from .. import ndarray from ..context import cpu @@ -231,7 +231,7 @@ def score(self, eval_data, eval_metric, num_batch=None, batch_end_callback=None, -------- >>> # An example of using score for prediction. 
>>> # Evaluate accuracy on val_dataiter - >>> metric = mx.metric.Accuracy() + >>> metric = mx.gluon.metric.Accuracy() >>> mod.score(val_dataiter, metric) >>> mod.score(val_dataiter, ['mse', 'acc']) """ From 89929959d5ac8ea9e0a781344b39e32358dfa883 Mon Sep 17 00:00:00 2001 From: acphile Date: Thu, 16 Apr 2020 17:04:09 +0800 Subject: [PATCH 03/24] fix importError --- python/mxnet/gluon/__init__.py | 2 ++ python/mxnet/gluon/block.py | 3 ++- python/mxnet/gluon/contrib/data/text.py | 2 +- python/mxnet/gluon/contrib/nn/basic_layers.py | 2 +- python/mxnet/gluon/data/dataloader.py | 2 +- python/mxnet/gluon/data/vision/datasets.py | 2 +- python/mxnet/gluon/nn/basic_layers.py | 2 +- 7 files changed, 9 insertions(+), 6 deletions(-) diff --git a/python/mxnet/gluon/__init__.py b/python/mxnet/gluon/__init__.py index 288937cf4a03..514087049edb 100644 --- a/python/mxnet/gluon/__init__.py +++ b/python/mxnet/gluon/__init__.py @@ -19,6 +19,8 @@ # pylint: disable=wildcard-import """Neural network module.""" +from . import metric + from .parameter import * from .block import * diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 10c11b85ba97..864db34420ee 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -28,7 +28,8 @@ import numpy as np from ..base import mx_real_t, MXNetError -from .. import symbol, ndarray, initializer, np_symbol, autograd, _deferred_compute as dc +from .. import symbol, ndarray, initializer, autograd, _deferred_compute as dc +from ..symbol.numpy import _symbol as np_symbol from ..symbol import Symbol from ..ndarray import NDArray from .. import name as _name diff --git a/python/mxnet/gluon/contrib/data/text.py b/python/mxnet/gluon/contrib/data/text.py index 0536ac585484..916b41880d45 100644 --- a/python/mxnet/gluon/contrib/data/text.py +++ b/python/mxnet/gluon/contrib/data/text.py @@ -29,7 +29,7 @@ from ...data import dataset from ...utils import download, check_sha1, _get_repo_file_url from ....contrib import text -from .... import nd, base +from .... import ndarray as nd, base class _LanguageModelDataset(dataset._DownloadedDataset): # pylint: disable=abstract-method def __init__(self, root, namespace, vocabulary): diff --git a/python/mxnet/gluon/contrib/nn/basic_layers.py b/python/mxnet/gluon/contrib/nn/basic_layers.py index bc7c3ce19e09..5df1a1e83660 100644 --- a/python/mxnet/gluon/contrib/nn/basic_layers.py +++ b/python/mxnet/gluon/contrib/nn/basic_layers.py @@ -24,7 +24,7 @@ 'PixelShuffle3D'] import warnings -from .... import nd, context +from .... import ndarray as nd, context from ...block import HybridBlock, Block from ...nn import Sequential, HybridSequential, BatchNorm diff --git a/python/mxnet/gluon/data/dataloader.py b/python/mxnet/gluon/data/dataloader.py index d34148417355..e07a3a673a75 100644 --- a/python/mxnet/gluon/data/dataloader.py +++ b/python/mxnet/gluon/data/dataloader.py @@ -37,7 +37,7 @@ pass from . import sampler as _sampler -from ... import nd, context +from ... import ndarray as nd, context from ...util import is_np_shape, is_np_array, set_np from ... import numpy as _mx_np # pylint: disable=reimported diff --git a/python/mxnet/gluon/data/vision/datasets.py b/python/mxnet/gluon/data/vision/datasets.py index bdcaff52a042..90990a8436d8 100644 --- a/python/mxnet/gluon/data/vision/datasets.py +++ b/python/mxnet/gluon/data/vision/datasets.py @@ -30,7 +30,7 @@ from .. import dataset from ...utils import download, check_sha1, _get_repo_file_url -from .... import nd, image, recordio, base +from .... 
import ndarray as nd, image, recordio, base from .... import numpy as _mx_np # pylint: disable=reimported from ....util import is_np_array diff --git a/python/mxnet/gluon/nn/basic_layers.py b/python/mxnet/gluon/nn/basic_layers.py index 70b0a71841f1..c417b7752096 100644 --- a/python/mxnet/gluon/nn/basic_layers.py +++ b/python/mxnet/gluon/nn/basic_layers.py @@ -27,7 +27,7 @@ from .activations import Activation from ..block import Block, HybridBlock from ..utils import _indent -from ... import nd, sym +from ... import ndarray as nd, symbol as sym from ...util import is_np_array From 1b8f521d302bc5f1cd23d677e2c97f5d450ea012 Mon Sep 17 00:00:00 2001 From: acphile Date: Thu, 16 Apr 2020 17:11:45 +0800 Subject: [PATCH 04/24] replace mx.metric with mx.gluon.metric in tests/python --- tests/python/gpu/test_contrib_amp.py | 4 +- tests/python/tensorrt/lenet5_train.py | 2 +- tests/python/train/test_autograd.py | 4 +- tests/python/train/test_bucketing.py | 4 +- tests/python/train/test_mlp.py | 2 +- tests/python/train/test_sparse_fm.py | 2 +- .../unittest/test_contrib_svrg_module.py | 4 +- .../unittest/test_gluon_batch_processor.py | 4 +- tests/python/unittest/test_gluon_estimator.py | 24 ++++---- .../unittest/test_gluon_event_handler.py | 18 +++--- tests/python/unittest/test_loss.py | 60 +++++++++---------- tests/python/unittest/test_metric.py | 44 +++++++------- tests/python/unittest/test_metric_perf.py | 4 +- tests/python/unittest/test_module.py | 4 +- 14 files changed, 90 insertions(+), 90 deletions(-) diff --git a/tests/python/gpu/test_contrib_amp.py b/tests/python/gpu/test_contrib_amp.py index 527f8534969c..f3742629b804 100644 --- a/tests/python/gpu/test_contrib_amp.py +++ b/tests/python/gpu/test_contrib_amp.py @@ -325,7 +325,7 @@ def check_amp_convert_bucketing_module(): data_val = mx.rnn.BucketSentenceIter(val_sent, batch_size, buckets=buckets, invalid_label=invalid_label) result_model.bind(data_val.provide_data, data_val.provide_label, for_training=False) - result_model.score(data_val, mx.metric.Perplexity(invalid_label), + result_model.score(data_val, mx.gluon.metric.Perplexity(invalid_label), batch_end_callback=mx.callback.Speedometer(batch_size, 1)) # AMP conversion with cast_optional_params set to true @@ -333,7 +333,7 @@ def check_amp_convert_bucketing_module(): ''' result_model = amp.convert_bucketing_module(model, cast_optional_params=True) result_model.bind(data_val.provide_data, data_val.provide_label, for_training=False) - result_model.score(data_val, mx.metric.Perplexity(invalid_label), + result_model.score(data_val, mx.gluon.metric.Perplexity(invalid_label), batch_end_callback=mx.callback.Speedometer(batch_size, 1)) ''' diff --git a/tests/python/tensorrt/lenet5_train.py b/tests/python/tensorrt/lenet5_train.py index a0ea447de5a0..b04b3484de46 100755 --- a/tests/python/tensorrt/lenet5_train.py +++ b/tests/python/tensorrt/lenet5_train.py @@ -74,7 +74,7 @@ def train_lenet5(num_epochs, batch_size, train_iter, val_iter, test_iter): num_epoch=num_epochs) # predict accuracy for lenet - acc = mx.metric.Accuracy() + acc = mx.gluon.metric.Accuracy() lenet_model.score(test_iter, acc) accuracy = acc.get()[1] assert accuracy > 0.95, "LeNet-5 training accuracy on MNIST was too low" diff --git a/tests/python/train/test_autograd.py b/tests/python/train/test_autograd.py index 712672cd0a9f..f8dbf3610a68 100644 --- a/tests/python/train/test_autograd.py +++ b/tests/python/train/test_autograd.py @@ -53,7 +53,7 @@ def get_net(): batch_size=batch_size, shuffle=True, flat=True, silent=False) def score(net, 
ctx_list): - metric = mx.metric.Accuracy() + metric = mx.gluon.metric.Accuracy() val_data.reset() for batch in val_data: datas = gluon.utils.split_and_load(batch.data[0], ctx_list, batch_axis=0) @@ -67,7 +67,7 @@ def score(net, ctx_list): def train(net, epoch, ctx_list): net.collect_params().initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx_list) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.5}) - metric = mx.metric.Accuracy() + metric = mx.gluon.metric.Accuracy() loss = gluon.loss.SoftmaxCrossEntropyLoss() for i in range(epoch): diff --git a/tests/python/train/test_bucketing.py b/tests/python/train/test_bucketing.py index a233e46e0992..f4b8f417a2cc 100644 --- a/tests/python/train/test_bucketing.py +++ b/tests/python/train/test_bucketing.py @@ -98,7 +98,7 @@ def sym_gen(seq_len): model.fit( train_data=data_train, eval_data=data_val, - eval_metric=mx.metric.Perplexity(invalid_label), # Use Perplexity for multiclass classification. + eval_metric=mx.gluon.metric.Perplexity(invalid_label), # Use Perplexity for multiclass classification. kvstore='device', optimizer='sgd', optimizer_params={'learning_rate': 0.01, @@ -114,7 +114,7 @@ def sym_gen(seq_len): def test_bucket_module(): # This test forecasts random sequence of words to check bucketing. # We cannot guarantee the accuracy of such an impossible task, and comments out the following line. - # assert model.score(data_val, mx.metric.MSE())[0][1] < 350, "High mean square error." + # assert model.score(data_val, mx.gluon.metric.MSE())[0][1] < 350, "High mean square error." model = train_model() diff --git a/tests/python/train/test_mlp.py b/tests/python/train/test_mlp.py index 1b8e06f53027..166fd8de28d8 100644 --- a/tests/python/train/test_mlp.py +++ b/tests/python/train/test_mlp.py @@ -64,7 +64,7 @@ def test_mlp(): softmax, X=train_dataiter, eval_data=val_dataiter, - eval_metric=mx.metric.np(accuracy), + eval_metric=mx.gluon.metric.np(accuracy), epoch_end_callback=mx.callback.do_checkpoint(prefix), ctx=[mx.cpu(i) for i in range(2)], num_epoch=num_epoch, diff --git a/tests/python/train/test_sparse_fm.py b/tests/python/train/test_sparse_fm.py index 99a22f54cbbd..d967e2954775 100644 --- a/tests/python/train/test_sparse_fm.py +++ b/tests/python/train/test_sparse_fm.py @@ -102,7 +102,7 @@ def fm(factor_size, feature_dim, init): else: raise AssertionError("Unsupported optimizer type '" + optimizer + "' specified") # use accuracy as the metric - metric = mx.metric.create('MSE') + metric = mx.gluon.metric.create('MSE') # train 'num_epochs' epoch for epoch in range(num_epochs): train_iter.reset() diff --git a/tests/python/unittest/test_contrib_svrg_module.py b/tests/python/unittest/test_contrib_svrg_module.py index 79407d15fd7f..6c973952ba18 100644 --- a/tests/python/unittest/test_contrib_svrg_module.py +++ b/tests/python/unittest/test_contrib_svrg_module.py @@ -242,7 +242,7 @@ def create_module_with_sgd(): num_epoch = 10 # Use metric MSE - metrics = mx.metric.create("mse") + metrics = mx.gluon.metric.create("mse") # Train with SVRGModule for e in range(num_epoch): @@ -299,7 +299,7 @@ def test_accumulate_kvstore(): def test_fit(): di, mod = setup() num_epoch = 100 - metric = mx.metric.create("mse") + metric = mx.gluon.metric.create("mse") mod.fit(di, eval_metric=metric, optimizer='sgd', optimizer_params=(('learning_rate', 0.025),), num_epoch=num_epoch, kvstore='local') diff --git a/tests/python/unittest/test_gluon_batch_processor.py b/tests/python/unittest/test_gluon_batch_processor.py index 8604713fc129..336d75237820 100644 
--- a/tests/python/unittest/test_gluon_batch_processor.py +++ b/tests/python/unittest/test_gluon_batch_processor.py @@ -52,7 +52,7 @@ def test_batch_processor_fit(): num_epochs = 1 ctx = mx.cpu() loss = gluon.loss.L2Loss() - acc = mx.metric.Accuracy() + acc = mx.gluon.metric.Accuracy() net.initialize(ctx=ctx) processor = BatchProcessor() trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) @@ -83,7 +83,7 @@ def test_batch_processor_validation(): num_epochs = 1 ctx = mx.cpu() loss = gluon.loss.L2Loss() - acc = mx.metric.Accuracy() + acc = mx.gluon.metric.Accuracy() val_loss = gluon.loss.L1Loss() net.initialize(ctx=ctx) processor = BatchProcessor() diff --git a/tests/python/unittest/test_gluon_estimator.py b/tests/python/unittest/test_gluon_estimator.py index 2c00b1609112..66b1e94335c6 100644 --- a/tests/python/unittest/test_gluon_estimator.py +++ b/tests/python/unittest/test_gluon_estimator.py @@ -58,7 +58,7 @@ def test_fit(): num_epochs = 1 ctx = mx.cpu() loss = gluon.loss.L2Loss() - acc = mx.metric.Accuracy() + acc = mx.gluon.metric.Accuracy() net.initialize(ctx=ctx) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) est = Estimator(net=net, @@ -87,7 +87,7 @@ def test_validation(): num_epochs = 1 ctx = mx.cpu() loss = gluon.loss.L2Loss() - acc = mx.metric.Accuracy() + acc = mx.gluon.metric.Accuracy() val_loss = gluon.loss.L1Loss() net.initialize(ctx=ctx) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) @@ -126,7 +126,7 @@ def test_initializer(): ctx = mx.cpu() loss = gluon.loss.L2Loss() - acc = mx.metric.Accuracy() + acc = mx.gluon.metric.Accuracy() # no initializer est = Estimator(net=net, loss=loss, @@ -166,7 +166,7 @@ def test_trainer(): ctx = mx.cpu() loss = gluon.loss.L2Loss() - acc = mx.metric.Accuracy() + acc = mx.gluon.metric.Accuracy() net.initialize(ctx=ctx) # input no trainer with warnings.catch_warnings(record=True) as w: @@ -206,7 +206,7 @@ def test_metric(): est.fit(train_data=train_data, epochs=num_epochs) # input list of metrics - metrics = [mx.metric.Accuracy(), mx.metric.Accuracy()] + metrics = [mx.gluon.metric.Accuracy(), mx.gluon.metric.Accuracy()] est = Estimator(net=net, loss=loss, train_metrics=metrics, @@ -227,14 +227,14 @@ def test_metric(): loss=loss, trainer=trainer, context=ctx) - assert isinstance(est.train_metrics[0], mx.metric.Accuracy) + assert isinstance(est.train_metrics[0], mx.gluon.metric.Accuracy) def test_loss(): ''' test with invalid loss ''' net = _get_test_network() ctx = mx.cpu() - acc = mx.metric.Accuracy() + acc = mx.gluon.metric.Accuracy() net.initialize(ctx=ctx) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) # input invalid loss @@ -250,7 +250,7 @@ def test_context(): ''' test with no context, list of context, invalid context ''' net = _get_test_network() loss = gluon.loss.L2Loss() - metrics = mx.metric.Accuracy() + metrics = mx.gluon.metric.Accuracy() # input no context est = Estimator(net=net, loss=loss, @@ -332,7 +332,7 @@ def test_default_handlers(): net.initialize(ctx=ctx) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) - train_acc = mx.metric.RMSE() + train_acc = mx.gluon.metric.RMSE() loss = gluon.loss.L2Loss() est = Estimator(net=net, @@ -359,7 +359,7 @@ def test_default_handlers(): # handler with mixed metrics, some handler use metrics prepared by estimator # some handler use metrics user prepared - logging = LoggingHandler(metrics=[mx.metric.RMSE("val acc")]) + logging = 
LoggingHandler(metrics=[mx.gluon.metric.RMSE("val acc")]) with assert_raises(ValueError): est.fit(train_data=train_data, epochs=num_epochs, event_handlers=[logging]) @@ -383,7 +383,7 @@ def test_val_net(): ctx = mx.cpu() loss = gluon.loss.L2Loss() val_loss = gluon.loss.L2Loss() - acc = mx.metric.Accuracy() + acc = mx.gluon.metric.Accuracy() net.initialize(ctx=ctx) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) est = Estimator(net=net, @@ -448,7 +448,7 @@ def test_val_handlers(): net.initialize(ctx=ctx) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) - train_acc = mx.metric.RMSE() + train_acc = mx.gluon.metric.RMSE() loss = gluon.loss.L2Loss() est = Estimator(net=net, diff --git a/tests/python/unittest/test_gluon_event_handler.py b/tests/python/unittest/test_gluon_event_handler.py index c81d29157e7f..a18895be34d2 100644 --- a/tests/python/unittest/test_gluon_event_handler.py +++ b/tests/python/unittest/test_gluon_event_handler.py @@ -84,7 +84,7 @@ def test_checkpoint_handler(): net = _get_test_network() ce_loss = loss.SoftmaxCrossEntropyLoss() - acc = mx.metric.Accuracy() + acc = mx.gluon.metric.Accuracy() est = estimator.Estimator(net, loss=ce_loss, train_metrics=acc) checkpoint_handler = event_handler.CheckpointHandler(model_dir=tmpdir, model_prefix=model_prefix, @@ -130,7 +130,7 @@ def test_resume_checkpoint(): net = _get_test_network() ce_loss = loss.SoftmaxCrossEntropyLoss() - acc = mx.metric.Accuracy() + acc = mx.gluon.metric.Accuracy() est = estimator.Estimator(net, loss=ce_loss, train_metrics=acc) checkpoint_handler = event_handler.CheckpointHandler(model_dir=tmpdir, model_prefix=model_prefix, @@ -155,7 +155,7 @@ def test_early_stopping(): net = _get_test_network() ce_loss = loss.SoftmaxCrossEntropyLoss() - acc = mx.metric.Accuracy() + acc = mx.gluon.metric.Accuracy() est = estimator.Estimator(net, loss=ce_loss, train_metrics=acc) early_stopping = event_handler.EarlyStoppingHandler(monitor=acc, patience=0, @@ -179,7 +179,7 @@ def test_logging(): net = _get_test_network() ce_loss = loss.SoftmaxCrossEntropyLoss() - acc = mx.metric.Accuracy() + acc = mx.gluon.metric.Accuracy() est = estimator.Estimator(net, loss=ce_loss, train_metrics=acc) est.logger.addHandler(logging.FileHandler(output_dir)) @@ -226,7 +226,7 @@ def epoch_end(self, estimator, *args, **kwargs): test_data = _get_test_data() net = _get_test_network() ce_loss = loss.SoftmaxCrossEntropyLoss() - acc = mx.metric.Accuracy() + acc = mx.gluon.metric.Accuracy() est = estimator.Estimator(net, loss=ce_loss, train_metrics=acc) custom_handler = CustomStopHandler(3, 2) est.fit(test_data, event_handlers=[custom_handler], epochs=3) @@ -249,7 +249,7 @@ def test_logging_interval(): dataloader = _get_test_data(in_size=data_size) num_epochs = 1 ce_loss = loss.SoftmaxCrossEntropyLoss() - acc = mx.metric.Accuracy() + acc = mx.gluon.metric.Accuracy() logging = LoggingHandler(metrics=[acc], log_interval=log_interval) est = estimator.Estimator(net=net, loss=ce_loss, @@ -273,7 +273,7 @@ def test_logging_interval(): ''' test case #2: log interval is 5 ''' old_stdout = sys.stdout sys.stdout = mystdout = StringIO() - acc = mx.metric.Accuracy() + acc = mx.gluon.metric.Accuracy() log_interval = 5 logging = LoggingHandler(metrics=[acc], log_interval=log_interval) est = estimator.Estimator(net=net, @@ -299,7 +299,7 @@ def test_validation_handler_batch_axis(): test_data = _get_test_data() net = _get_test_network() ce_loss = loss.SoftmaxCrossEntropyLoss() - acc = mx.metric.Accuracy() + acc = 
mx.gluon.metric.Accuracy() est = estimator.Estimator(net, loss=ce_loss, train_metrics=acc) est.fit(test_data, epochs=3) @@ -315,7 +315,7 @@ def test_validation_handler(): net = _get_test_network() ce_loss = loss.SoftmaxCrossEntropyLoss() - acc = mx.metric.Accuracy() + acc = mx.gluon.metric.Accuracy() est = estimator.Estimator(net, loss=ce_loss, train_metrics=acc) val_handler = ValidationHandler(val_data=test_data, eval_fn=est.evaluate, diff --git a/tests/python/unittest/test_loss.py b/tests/python/unittest/test_loss.py index a1a49c97d7f4..7f3df178ece4 100644 --- a/tests/python/unittest/test_loss.py +++ b/tests/python/unittest/test_loss.py @@ -79,9 +79,9 @@ def test_ce_loss(): loss = mx.sym.make_loss(loss) mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, - eval_metric=mx.metric.Loss(), optimizer='adam', + eval_metric=mx.gluon.metric.Loss(), optimizer='adam', initializer=mx.init.Xavier(magnitude=2)) - assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.05 + assert mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] < 0.05 # tracked at: https://github.com/apache/incubator-mxnet/issues/11691 @with_seed() @@ -97,9 +97,9 @@ def test_bce_loss(): loss = mx.sym.make_loss(loss) mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, - eval_metric=mx.metric.Loss(), optimizer='adam', + eval_metric=mx.gluon.metric.Loss(), optimizer='adam', initializer=mx.init.Xavier(magnitude=2)) - assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.01 + assert mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] < 0.01 # Test against npy data = mx.random.uniform(-5, 5, shape=(10,)) label = mx.random.uniform(0, 1, shape=(10,)) @@ -142,8 +142,8 @@ def test_kl_loss(): loss = mx.sym.make_loss(loss) mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, - eval_metric=mx.metric.Loss(), optimizer='adam') - assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.05 + eval_metric=mx.gluon.metric.Loss(), optimizer='adam') + assert mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] < 0.05 @with_seed() @@ -159,9 +159,9 @@ def test_l2_loss(): loss = mx.sym.make_loss(loss) mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, - initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.metric.Loss(), + initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.gluon.metric.Loss(), optimizer='adam') - assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.05 + assert mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] < 0.05 @with_seed() @@ -177,9 +177,9 @@ def test_l1_loss(): loss = mx.sym.make_loss(loss) mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, - initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.metric.Loss(), + initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.gluon.metric.Loss(), optimizer='adam') - assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.1 + assert mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] < 0.1 @with_seed() @@ -222,9 +222,9 @@ def test_ctc_loss_train(): loss = mx.sym.make_loss(loss) mod = 
mx.mod.Module(loss, data_names=('data',), label_names=('label',)) mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, - initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.metric.Loss(), + initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.gluon.metric.Loss(), optimizer='adam') - assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 10 + assert mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] < 10 @with_seed() @@ -243,12 +243,12 @@ def test_sample_weight_loss(): loss = mx.sym.make_loss(loss) mod = mx.mod.Module(loss, data_names=('data',), label_names=('label', 'w')) mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, - eval_metric=mx.metric.Loss(), optimizer='adam') + eval_metric=mx.gluon.metric.Loss(), optimizer='adam') data_iter = mx.io.NDArrayIter(data[10:], {'label': label, 'w': weight}, batch_size=10) - score = mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] + score = mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] assert score > 1 data_iter = mx.io.NDArrayIter(data[:10], {'label': label, 'w': weight}, batch_size=10) - score = mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] + score = mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] assert score < 0.05 @@ -266,13 +266,13 @@ def test_saveload(): loss = mx.sym.make_loss(loss) mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) mod.fit(data_iter, num_epoch=100, optimizer_params={'learning_rate': 1.}, - eval_metric=mx.metric.Loss()) + eval_metric=mx.gluon.metric.Loss()) mod.save_checkpoint('test', 100, save_optimizer_states=True) mod = mx.mod.Module.load('test', 100, load_optimizer_states=True, data_names=('data',), label_names=('label',)) mod.fit(data_iter, num_epoch=100, optimizer_params={'learning_rate': 1.}, - eval_metric=mx.metric.Loss()) - assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.05 + eval_metric=mx.gluon.metric.Loss()) + assert mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] < 0.05 @with_seed() def test_huber_loss(): @@ -287,9 +287,9 @@ def test_huber_loss(): loss = mx.sym.make_loss(loss) mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, - initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.metric.Loss(), + initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.gluon.metric.Loss(), optimizer='adam') - assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.05 + assert mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] < 0.05 @with_seed() @@ -305,9 +305,9 @@ def test_hinge_loss(): loss = mx.sym.make_loss(loss) mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, - initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.metric.Loss(), + initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.gluon.metric.Loss(), optimizer='adam') - assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.06 + assert mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] < 0.06 @with_seed() @@ -323,9 +323,9 @@ def test_squared_hinge_loss(): loss = mx.sym.make_loss(loss) mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, - initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.metric.Loss(), + 
initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.gluon.metric.Loss(), optimizer='adam') - assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.05 + assert mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] < 0.05 @with_seed() @@ -344,9 +344,9 @@ def test_triplet_loss(): loss = mx.sym.make_loss(loss) mod = mx.mod.Module(loss, data_names=('data',), label_names=('pos','neg')) mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, - initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.metric.Loss(), + initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.gluon.metric.Loss(), optimizer='adam') - assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.05 + assert mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] < 0.05 @with_seed() def test_sdml_loss(): @@ -453,9 +453,9 @@ def test_poisson_nllloss_mod(): loss = mx.sym.make_loss(loss) mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) mod.fit(data_iter, num_epoch=20, optimizer_params={'learning_rate': 0.01}, - initializer=mx.init.Normal(sigma=0.1), eval_metric=mx.metric.Loss(), + initializer=mx.init.Normal(sigma=0.1), eval_metric=mx.gluon.metric.Loss(), optimizer='adam') - assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.05 + assert mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] < 0.05 @with_seed() def test_bce_loss_with_pos_weight(): @@ -474,9 +474,9 @@ def test_bce_loss_with_pos_weight(): loss = mx.sym.make_loss(loss) mod = mx.mod.Module(loss, data_names=('data',), label_names=('label', 'pos_w')) mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, - eval_metric=mx.metric.Loss(), optimizer='adam', + eval_metric=mx.gluon.metric.Loss(), optimizer='adam', initializer=mx.init.Xavier(magnitude=2)) - assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.01 + assert mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] < 0.01 # Test against npy data = mx.nd.random.uniform(-5, 5, shape=(N, 5)) label = mx.nd.array(np.random.randint(2, size=(N, 5)), dtype='float32') diff --git a/tests/python/unittest/test_metric.py b/tests/python/unittest/test_metric.py index e7273fba35d5..3408dd503d59 100644 --- a/tests/python/unittest/test_metric.py +++ b/tests/python/unittest/test_metric.py @@ -25,9 +25,9 @@ from copy import deepcopy def check_metric(metric, *args, **kwargs): - metric = mx.metric.create(metric, *args, **kwargs) + metric = mx.gluon.metric.create(metric, *args, **kwargs) str_metric = json.dumps(metric.get_config()) - metric2 = mx.metric.create(str_metric) + metric2 = mx.gluon.metric.create(str_metric) assert metric.get_config() == metric2.get_config() @@ -40,7 +40,7 @@ def test_metrics(): check_metric('pcc') check_metric('nll_loss') check_metric('loss') - composite = mx.metric.create(['acc', 'f1']) + composite = mx.gluon.metric.create(['acc', 'f1']) check_metric(composite) def _check_global_metric(metric, *args, **kwargs): @@ -76,7 +76,7 @@ def _compare_metric_result(m1, m2): shape = kwargs.pop('shape', (10,10)) use_same_shape = kwargs.pop('use_same_shape', False) - m1 = mx.metric.create(metric, *args, **kwargs) + m1 = mx.gluon.metric.create(metric, *args, **kwargs) m2 = deepcopy(m1) # check that global stats are not reset when calling # reset_local() @@ -121,7 +121,7 @@ def custom_metric(label, pred): _check_global_metric(['acc', 'f1'], shape=(10,2)) def test_nll_loss(): - metric = mx.metric.create('nll_loss') + metric = mx.gluon.metric.create('nll_loss') pred = 
mx.nd.array([[0.2, 0.3, 0.5], [0.6, 0.1, 0.3]]) label = mx.nd.array([2, 1]) metric.update([label], [pred]) @@ -132,7 +132,7 @@ def test_nll_loss(): def test_acc(): pred = mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]]) label = mx.nd.array([0, 1, 1]) - metric = mx.metric.create('acc') + metric = mx.gluon.metric.create('acc') metric.update([label], [pred]) _, acc = metric.get() expected_acc = (np.argmax(pred, axis=1) == label).sum().asscalar() / label.size @@ -142,7 +142,7 @@ def test_acc_2d_label(): # label maybe provided in 2d arrays in custom data iterator pred = mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6], [0.8, 0.2], [0.3, 0.5], [0.6, 0.4]]) label = mx.nd.array([[0, 1, 1], [1, 0, 1]]) - metric = mx.metric.create('acc') + metric = mx.gluon.metric.create('acc') metric.update([label], [pred]) _, acc = metric.get() expected_acc = (np.argmax(pred, axis=1).asnumpy() == label.asnumpy().ravel()).sum() / \ @@ -151,8 +151,8 @@ def test_acc_2d_label(): def test_loss_update(): pred = mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]]) - metric1 = mx.metric.create('loss') - metric2 = mx.metric.create('loss') + metric1 = mx.gluon.metric.create('loss') + metric2 = mx.gluon.metric.create('loss') metric1.update(None, [pred]) metric2.update(None, pred) _, acc1 = metric1.get() @@ -160,8 +160,8 @@ def test_loss_update(): assert acc1 == acc2 def test_f1(): - microF1 = mx.metric.create("f1", average="micro") - macroF1 = mx.metric.F1(average="macro") + microF1 = mx.gluon.metric.create("f1", average="micro") + macroF1 = mx.gluon.metric.F1(average="macro") assert np.isnan(macroF1.get()[1]) assert np.isnan(microF1.get()[1]) @@ -207,8 +207,8 @@ def test_f1(): np.testing.assert_almost_equal(macroF1.get()[1], (fscore1 + fscore2) / 2.) def test_mcc(): - microMCC = mx.metric.create("mcc", average="micro") - macroMCC = mx.metric.MCC(average="macro") + microMCC = mx.gluon.metric.create("mcc", average="micro") + macroMCC = mx.gluon.metric.MCC(average="macro") assert np.isnan(microMCC.get()[1]) assert np.isnan(macroMCC.get()[1]) @@ -259,7 +259,7 @@ def test_perplexity(): label = mx.nd.array([0, 1, 1]) p = pred.asnumpy()[np.arange(label.size), label.asnumpy().astype('int32')] perplexity_expected = np.exp(-np.log(p).sum()/label.size) - metric = mx.metric.create('perplexity', -1) + metric = mx.gluon.metric.create('perplexity', -1) metric.update([label], [pred]) _, perplexity = metric.get() assert perplexity == perplexity_expected @@ -269,8 +269,8 @@ def test_pearsonr(): label1 = mx.nd.array([[1, 0], [0, 1], [0, 1]]) pearsonr_expected_np = np.corrcoef(pred1.asnumpy().ravel(), label1.asnumpy().ravel())[0, 1] pearsonr_expected_scipy, _ = pearsonr(pred1.asnumpy().ravel(), label1.asnumpy().ravel()) - macro_pr = mx.metric.create('pearsonr', average='macro') - micro_pr = mx.metric.create('pearsonr', average='micro') + macro_pr = mx.gluon.metric.create('pearsonr', average='macro') + micro_pr = mx.gluon.metric.create('pearsonr', average='micro') assert np.isnan(macro_pr.get()[1]) assert np.isnan(micro_pr.get()[1]) @@ -317,18 +317,18 @@ def test_pcc(): [ 7, 3 ], [ 2, 5 ], ]) - met_pcc = mx.metric.create('pcc') + met_pcc = mx.gluon.metric.create('pcc') met_pcc.update(labels, preds) _, pcc = met_pcc.get() # pcc should agree with mcc for binary classification - met_mcc = mx.metric.create('mcc') + met_mcc = mx.gluon.metric.create('mcc') met_mcc.update(labels, preds) _, mcc = met_mcc.get() np.testing.assert_almost_equal(pcc, mcc) # pcc should agree with Pearson for binary classification - met_pear = mx.metric.create('pearsonr') + met_pear = 
mx.gluon.metric.create('pearsonr') met_pear.update(labels, [p.argmax(axis=1) for p in preds]) _, pear = met_pear.get() np.testing.assert_almost_equal(pcc, pear) @@ -391,18 +391,18 @@ def test_single_array_input(): pred = mx.nd.array([[1,2,3,4]]) label = pred + 0.1 - mse = mx.metric.create('mse') + mse = mx.gluon.metric.create('mse') mse.update(label, pred) _, mse_res = mse.get() np.testing.assert_almost_equal(mse_res, 0.01) - mae = mx.metric.create('mae') + mae = mx.gluon.metric.create('mae') mae.update(label, pred) mae.get() _, mae_res = mae.get() np.testing.assert_almost_equal(mae_res, 0.1) - rmse = mx.metric.create('rmse') + rmse = mx.gluon.metric.create('rmse') rmse.update(label, pred) rmse.get() _, rmse_res = rmse.get() diff --git a/tests/python/unittest/test_metric_perf.py b/tests/python/unittest/test_metric_perf.py index 36cbc685797c..058d4cb8217f 100644 --- a/tests/python/unittest/test_metric_perf.py +++ b/tests/python/unittest/test_metric_perf.py @@ -66,7 +66,7 @@ def data(self): def run_metric(name, data_gen_cls, i, n, c, pred_ctx, label_ctx, **kwargs): """ Helper function for running one metric benchmark """ - metric = mx.metric.create(name, **kwargs) + metric = mx.gluon.metric.create(name, **kwargs) data_gen = data_gen_cls(n, c, pred_ctx, label_ctx) try: label, pred = data_gen.data() @@ -105,7 +105,7 @@ def test_metric_performance(): output_dims = [128, 1024, 8192] ctxs = [mx.cpu(), mx.gpu()] - print("\nmx.metric benchmarks", file=sys.stderr) + print("\nmx.gluon.metric benchmarks", file=sys.stderr) print( "{:15}{:10}{:12}{:12}{:15}{:15}{}".format( 'Metric', 'Data-Ctx', 'Label-Ctx', 'Data Size', 'Batch Size', 'Output Dim', 'Elapsed Time'), diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py index b82933126d67..1b17b839a298 100644 --- a/tests/python/unittest/test_module.py +++ b/tests/python/unittest/test_module.py @@ -275,7 +275,7 @@ def sym_gen(seq_len): mod2.fit( train_data=data_train, eval_data=data_val, - eval_metric=mx.metric.Perplexity(invalid_label), # Use Perplexity for multiclass classification. + eval_metric=mx.gluon.metric.Perplexity(invalid_label), # Use Perplexity for multiclass classification. 
kvstore='device', optimizer='sgd', optimizer_params={'learning_rate': 0.01, @@ -711,7 +711,7 @@ def fm(factor_size, feature_dim, init): expected_accuracy = 0.02 # use accuracy as the metric - metric = mx.metric.create('MSE') + metric = mx.gluon.metric.create('MSE') # train 'num_epochs' epoch for epoch in range(num_epochs): train_iter.reset() From 2ff2e38a969277a06aa97b599fb228d3ebf1bdce Mon Sep 17 00:00:00 2001 From: acphile Date: Mon, 20 Apr 2020 04:10:35 +0000 Subject: [PATCH 05/24] remove global support --- python/mxnet/gluon/metric.py | 325 ++++------------------------------- 1 file changed, 33 insertions(+), 292 deletions(-) diff --git a/python/mxnet/gluon/metric.py b/python/mxnet/gluon/metric.py index 1c9073096adb..953a57894827 100644 --- a/python/mxnet/gluon/metric.py +++ b/python/mxnet/gluon/metric.py @@ -89,7 +89,6 @@ def __init__(self, name, output_names=None, self.name = str(name) self.output_names = output_names self.label_names = label_names - self._has_global_stats = kwargs.pop("has_global_stats", False) self._kwargs = kwargs self.reset() @@ -148,13 +147,6 @@ def reset(self): """Resets the internal evaluation result to initial state.""" self.num_inst = 0 self.sum_metric = 0.0 - self.global_num_inst = 0 - self.global_sum_metric = 0.0 - - def reset_local(self): - """Resets the local portion of the internal evaluation results to initial state.""" - self.num_inst = 0 - self.sum_metric = 0.0 def get(self): """Gets the current evaluation result. @@ -171,24 +163,6 @@ def get(self): else: return (self.name, self.sum_metric / self.num_inst) - def get_global(self): - """Gets the current global evaluation result. - - Returns - ------- - names : list of str - Name of the metrics. - values : list of float - Value of the evaluations. - """ - if self._has_global_stats: - if self.global_num_inst == 0: - return (self.name, float('nan')) - else: - return (self.name, self.global_sum_metric / self.global_num_inst) - else: - return self.get() - def get_name_value(self): """Returns zipped name and value pairs. @@ -204,24 +178,6 @@ def get_name_value(self): value = [value] return list(zip(name, value)) - def get_global_name_value(self): - """Returns zipped name and value pairs for global results. - - Returns - ------- - list of tuples - A (name, value) tuple list. - """ - if self._has_global_stats: - name, value = self.get_global() - if not isinstance(name, list): - name = [name] - if not isinstance(value, list): - value = [value] - return list(zip(name, value)) - else: - return self.get_name_value() - # pylint: disable=invalid-name register = registry.get_register_func(EvalMetric, 'metric') alias = registry.get_alias_func(EvalMetric, 'metric') @@ -306,8 +262,7 @@ class CompositeEvalMetric(EvalMetric): def __init__(self, metrics=None, name='composite', output_names=None, label_names=None): super(CompositeEvalMetric, self).__init__( - name, output_names=output_names, label_names=label_names, - has_global_stats=True) + name, output_names=output_names, label_names=label_names) if metrics is None: metrics = [] self.metrics = [create(i) for i in metrics] @@ -369,14 +324,6 @@ def reset(self): except AttributeError: pass - def reset_local(self): - """Resets the local portion of the internal evaluation results to initial state.""" - try: - for metric in self.metrics: - metric.reset_local() - except AttributeError: - pass - def get(self): """Returns the current evaluation result. 
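Editor's note: with the separate global counters and their accessors (`has_global_stats`, `reset_local`, `get_global`) removed above, epoch-level numbers come from simply not resetting a metric between batches. A minimal usage sketch, assuming an MXNet build that includes this patch series (where the API lives under `mx.gluon.metric`):

```python
import mxnet as mx

# Keep one metric running for the whole epoch and a second one that is reset
# after every batch; together they replace the old local/global split.
epoch_acc = mx.gluon.metric.Accuracy()
batch_acc = mx.gluon.metric.Accuracy()
batches = [(mx.nd.array([0, 1, 1]), mx.nd.array([[0.8, 0.2], [0.3, 0.7], [0.6, 0.4]])),
           (mx.nd.array([1, 0]),    mx.nd.array([[0.1, 0.9], [0.7, 0.3]]))]
for label, pred in batches:
    epoch_acc.update([label], [pred])
    batch_acc.update([label], [pred])
    print('batch :', batch_acc.get())
    batch_acc.reset()
print('epoch :', epoch_acc.get())
```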
@@ -399,28 +346,6 @@ def get(self): values.extend(value) return (names, values) - def get_global(self): - """Returns the current evaluation result. - - Returns - ------- - names : list of str - Name of the metrics. - values : list of float - Value of the evaluations. - """ - names = [] - values = [] - for metric in self.metrics: - name, value = metric.get_global() - if isinstance(name, string_types): - name = [name] - if isinstance(value, numeric_types): - value = [value] - names.extend(name) - values.extend(value) - return (names, values) - def get_config(self): config = super(CompositeEvalMetric, self).get_config() config.update({'metrics': [i.get_config() for i in self.metrics]}) @@ -469,8 +394,7 @@ def __init__(self, axis=1, name='accuracy', output_names=None, label_names=None): super(Accuracy, self).__init__( name, axis=axis, - output_names=output_names, label_names=label_names, - has_global_stats=True) + output_names=output_names, label_names=label_names) self.axis = axis def update(self, labels, preds): @@ -500,9 +424,7 @@ def update(self, labels, preds): num_correct = (pred_label == label).sum() self.sum_metric += num_correct - self.global_sum_metric += num_correct self.num_inst += len(pred_label) - self.global_num_inst += len(pred_label) @register @@ -545,8 +467,7 @@ def __init__(self, top_k=1, name='top_k_accuracy', output_names=None, label_names=None): super(TopKAccuracy, self).__init__( name, top_k=top_k, - output_names=output_names, label_names=label_names, - has_global_stats=True) + output_names=output_names, label_names=label_names) self.top_k = top_k assert(self.top_k > 1), 'Please use Accuracy if top_k is no more than 1' self.name += '_%d' % self.top_k @@ -578,16 +499,13 @@ def update(self, labels, preds): if num_dims == 1: num_correct = (pred_label.flat == label.flat).sum() self.sum_metric += num_correct - self.global_sum_metric += num_correct elif num_dims == 2: num_classes = pred_label.shape[1] top_k = min(num_classes, self.top_k) for j in range(top_k): num_correct = (pred_label[:, num_classes - 1 - j].flat == label.flat).sum() self.sum_metric += num_correct - self.global_sum_metric += num_correct self.num_inst += num_samples - self.global_num_inst += num_samples class _BinaryClassificationMetrics(object): @@ -613,10 +531,6 @@ def __init__(self, threshold=0.5, beta=1): self.false_negatives = 0 self.false_positives = 0 self.true_negatives = 0 - self.global_true_positives = 0 - self.global_false_negatives = 0 - self.global_false_positives = 0 - self.global_true_negatives = 0 def update_binary_stats(self, label, pred): """Update various binary classification counts for a single (label, pred) pair. @@ -657,13 +571,9 @@ def update_binary_stats(self, label, pred): false_neg = (pred_false * label_true).sum() true_neg = (pred_false * label_false).sum() self.true_positives += true_pos - self.global_true_positives += true_pos self.false_positives += false_pos - self.global_false_positives += false_pos self.false_negatives += false_neg - self.global_false_negatives += false_neg self.true_negatives += true_neg - self.global_true_negatives += true_neg @property def precision(self): @@ -672,13 +582,6 @@ def precision(self): else: return 0. - @property - def global_precision(self): - if self.global_true_positives + self.global_false_positives > 0: - return float(self.global_true_positives) / (self.global_true_positives + self.global_false_positives) - else: - return 0. 
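Editor's note: as a quick reference for the bookkeeping in this helper class, a self-contained sketch of what the `precision` property above and the `recall`/`fscore` properties that follow derive from the accumulated counts (the counts below are made up for illustration):

```python
# Illustrative only: statistics exposed by the helper class, computed from
# hypothetical accumulated counts.
tp, fp, fn = 8.0, 2.0, 4.0
precision = tp / (tp + fp) if tp + fp > 0 else 0.0       # 0.8
recall    = tp / (tp + fn) if tp + fn > 0 else 0.0       # 0.666...
f1        = (2 * precision * recall / (precision + recall)
             if precision + recall > 0 else 0.0)         # 0.727...
print(precision, recall, f1)
```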
- @property def recall(self): if self.true_positives + self.false_negatives > 0: @@ -686,13 +589,6 @@ def recall(self): else: return 0. - @property - def global_recall(self): - if self.global_true_positives + self.global_false_negatives > 0: - return float(self.global_true_positives) / (self.global_true_positives + self.global_false_negatives) - else: - return 0. - @property def fscore(self): if self.precision + self.recall > 0: @@ -700,31 +596,15 @@ def fscore(self): else: return 0. - @property - def global_fscore(self): - if self.global_precision + self.global_recall > 0: - return (1 + self.beta ** 2) * self.global_precision * self.global_recall / (self.beta ** 2 * self.global_precision + self.global_recall) - else: - return 0. - - def matthewscc(self, use_global=False): + def matthewscc(self): """Calculate the Matthew's Correlation Coefficent""" - if use_global: - if not self.global_total_examples: - return 0. - - true_pos = float(self.global_true_positives) - false_pos = float(self.global_false_positives) - false_neg = float(self.global_false_negatives) - true_neg = float(self.global_true_negatives) - else: - if not self.total_examples: - return 0. + if not self.total_examples: + return 0. - true_pos = float(self.true_positives) - false_pos = float(self.false_positives) - false_neg = float(self.false_negatives) - true_neg = float(self.true_negatives) + true_pos = float(self.true_positives) + false_pos = float(self.false_positives) + false_neg = float(self.false_negatives) + true_neg = float(self.true_negatives) terms = [(true_pos + false_pos), (true_pos + false_neg), @@ -740,11 +620,6 @@ def total_examples(self): return self.false_negatives + self.false_positives + \ self.true_negatives + self.true_positives - @property - def global_total_examples(self): - return self.global_false_negatives + self.global_false_positives + \ - self.global_true_negatives + self.global_true_positives - @property def accuracy(self): if self.total_examples > 0: @@ -752,28 +627,11 @@ def accuracy(self): else: return 0. - @property - def global_accuracy(self): - if self.global_total_examples > 0: - return float(self.global_true_positives + self.global_true_negatives) / self.global_total_examples - else: - return 0. - - def local_reset_stats(self): - self.false_positives = 0 - self.false_negatives = 0 - self.true_positives = 0 - self.true_negatives = 0 - def reset_stats(self): self.false_positives = 0 self.false_negatives = 0 self.true_positives = 0 self.true_negatives = 0 - self.global_false_positives = 0 - self.global_false_negatives = 0 - self.global_true_positives = 0 - self.global_true_negatives = 0 @register @@ -826,8 +684,7 @@ def __init__(self, name='f1', self.average = average self.metrics = _BinaryClassificationMetrics(threshold=threshold) EvalMetric.__init__(self, name=name, - output_names=output_names, label_names=label_names, - has_global_stats=True) + output_names=output_names, label_names=label_names) def update(self, labels, preds): """Updates the internal evaluation result. 
@@ -847,29 +704,18 @@ def update(self, labels, preds): if self.average == "macro": self.sum_metric += self.metrics.fscore - self.global_sum_metric += self.metrics.global_fscore self.num_inst += 1 - self.global_num_inst += 1 self.metrics.reset_stats() else: self.sum_metric = self.metrics.fscore * self.metrics.total_examples - self.global_sum_metric = self.metrics.global_fscore * self.metrics.global_total_examples self.num_inst = self.metrics.total_examples - self.global_num_inst = self.metrics.global_total_examples def reset(self): """Resets the internal evaluation result to initial state.""" self.sum_metric = 0. self.num_inst = 0 - self.global_num_inst = 0 - self.global_sum_metric = 0.0 self.metrics.reset_stats() - def reset_local(self): - """Resets the internal evaluation result to initial state.""" - self.sum_metric = 0. - self.num_inst = 0 - self.metrics.local_reset_stats() @register class Fbeta(EvalMetric): @@ -923,8 +769,7 @@ def __init__(self, name='fbeta', self.average = average self.metrics = _BinaryClassificationMetrics(threshold=threshold, beta=beta) EvalMetric.__init__(self, name=name, - output_names=output_names, label_names=label_names, - has_global_stats=True) + output_names=output_names, label_names=label_names) def update(self, labels, preds): """Updates the internal evaluation result. @@ -944,30 +789,18 @@ def update(self, labels, preds): if self.average == "macro": self.sum_metric += self.metrics.fscore - self.global_sum_metric += self.metrics.global_fscore self.num_inst += 1 - self.global_num_inst += 1 self.metrics.reset_stats() else: self.sum_metric = self.metrics.fscore * self.metrics.total_examples - self.global_sum_metric = self.metrics.global_fscore * self.metrics.global_total_examples self.num_inst = self.metrics.total_examples - self.global_num_inst = self.metrics.global_total_examples def reset(self): """Resets the internal evaluation result to initial state.""" self.sum_metric = 0. self.num_inst = 0 - self.global_num_inst = 0 - self.global_sum_metric = 0.0 self.metrics.reset_stats() - def reset_local(self): - """Resets the internal evaluation result to initial state.""" - self.sum_metric = 0. - self.num_inst = 0 - self.metrics.local_reset_stats() - @register class BinaryAccuracy(EvalMetric): @@ -1000,8 +833,7 @@ def __init__(self, name='binary_accuracy', output_names=None, label_names=None, threshold=0.5): self.metrics = _BinaryClassificationMetrics(threshold=threshold) EvalMetric.__init__(self, name=name, - output_names=output_names, label_names=label_names, - has_global_stats=True) + output_names=output_names, label_names=label_names) def update(self, labels, preds): """Updates the internal evaluation result. @@ -1020,24 +852,14 @@ def update(self, labels, preds): self.metrics.update_binary_stats(label, pred) self.sum_metric = self.metrics.accuracy * self.metrics.total_examples - self.global_sum_metric = self.metrics.global_accuracy * self.metrics.global_total_examples self.num_inst = self.metrics.total_examples - self.global_num_inst = self.metrics.global_total_examples def reset(self): """Resets the internal evaluation result to initial state.""" self.sum_metric = 0. self.num_inst = 0 - self.global_num_inst = 0 - self.global_sum_metric = 0.0 self.metrics.reset_stats() - - def reset_local(self): - """Resets the internal evaluation result to initial state.""" - self.sum_metric = 0. 
- self.num_inst = 0 - self.metrics.local_reset_stats() - + @register class MCC(EvalMetric): @@ -1107,8 +929,7 @@ def __init__(self, name='mcc', self._average = average self._metrics = _BinaryClassificationMetrics() EvalMetric.__init__(self, name=name, - output_names=output_names, label_names=label_names, - has_global_stats=True) + output_names=output_names, label_names=label_names) def update(self, labels, preds): """Updates the internal evaluation result. @@ -1128,31 +949,18 @@ def update(self, labels, preds): if self._average == "macro": self.sum_metric += self._metrics.matthewscc() - self.global_sum_metric += self._metrics.matthewscc(use_global=True) self.num_inst += 1 - self.global_num_inst += 1 self._metrics.reset_stats() else: self.sum_metric = self._metrics.matthewscc() * self._metrics.total_examples - self.global_sum_metric = self._metrics.matthewscc(use_global=True) * \ - self._metrics.global_total_examples self.num_inst = self._metrics.total_examples - self.global_num_inst = self._metrics.global_total_examples def reset(self): """Resets the internal evaluation result to initial state.""" self.sum_metric = 0. self.num_inst = 0. - self.global_sum_metric = 0. - self.global_num_inst = 0. self._metrics.reset_stats() - def reset_local(self): - """Resets the internal evaluation result to initial state.""" - self.sum_metric = 0. - self.num_inst = 0. - self._metrics.local_reset_stats() - @register class Perplexity(EvalMetric): @@ -1212,8 +1020,7 @@ def __init__(self, ignore_label, axis=-1, name='perplexity', output_names=None, label_names=None): super(Perplexity, self).__init__( name, ignore_label=ignore_label, - output_names=output_names, label_names=label_names, - has_global_stats=True) + output_names=output_names, label_names=label_names) self.ignore_label = ignore_label self.axis = axis @@ -1243,9 +1050,7 @@ def update(self, labels, preds): loss -= ndarray.sum(ndarray.log(ndarray.maximum(1e-10, pred))).asscalar() num += pred.size self.sum_metric += loss - self.global_sum_metric += loss self.num_inst += num - self.global_num_inst += num def get(self): """Returns the current evaluation result. @@ -1260,19 +1065,6 @@ def get(self): else: return (self.name, math.exp(self.sum_metric/self.num_inst)) - def get_global(self): - """Returns the current global evaluation result. - - Returns - ------- - Tuple of (str, float) - Representing name of the metric and evaluation result. 
- """ - if self.global_num_inst == 0: - return (self.name, float('nan')) - else: - return (self.name, math.exp(self.global_sum_metric/self.global_num_inst)) - #################### # REGRESSION METRICS #################### @@ -1314,8 +1106,7 @@ class MAE(EvalMetric): def __init__(self, name='mae', output_names=None, label_names=None, average='macro'): super(MAE, self).__init__( - name, output_names=output_names, label_names=label_names, - has_global_stats=True) + name, output_names=output_names, label_names=label_names) self.average = average def update(self, labels, preds): @@ -1348,9 +1139,7 @@ def update(self, labels, preds): mae = numpy.abs(label - pred).reshape(num_inst, -1).mean(axis=-1).sum() self.sum_metric += mae - self.global_sum_metric += mae self.num_inst += num_inst - self.global_num_inst += num_inst @register @@ -1388,8 +1177,7 @@ class MSE(EvalMetric): def __init__(self, name='mse', output_names=None, label_names=None, average="macro"): super(MSE, self).__init__( - name, output_names=output_names, label_names=label_names, - has_global_stats=True) + name, output_names=output_names, label_names=label_names) self.average = average def update(self, labels, preds): @@ -1421,9 +1209,7 @@ def update(self, labels, preds): num_inst = label.shape[0] mse = ((label - pred)**2.0).reshape(num_inst, -1).mean(axis=-1).sum() self.sum_metric += mse - self.global_sum_metric += mse self.num_inst += num_inst - self.global_num_inst += num_inst @register @@ -1461,8 +1247,7 @@ class RMSE(EvalMetric): def __init__(self, name='rmse', output_names=None, label_names=None, average="macro"): super(RMSE, self).__init__( - name, output_names=output_names, label_names=label_names, - has_global_stats=True) + name, output_names=output_names, label_names=label_names) self.average = average def update(self, labels, preds): @@ -1494,9 +1279,7 @@ def update(self, labels, preds): num_inst = label.shape[0] rmse = numpy.sqrt(((label - pred)**2.0).reshape(num_inst, -1).mean(axis=1)).sum() self.sum_metric += rmse - self.global_sum_metric += rmse self.num_inst += num_inst - self.global_num_inst += num_inst @register @@ -1536,8 +1319,7 @@ class MeanPairwiseDistance(EvalMetric): def __init__(self, name='mpd', output_names=None, label_names=None, p=2, average="micro"): super(MeanPairwiseDistance, self).__init__( - name, output_names=output_names, label_names=label_names, - has_global_stats=True) + name, output_names=output_names, label_names=label_names) self.average = average self.p = p @@ -1561,18 +1343,16 @@ def update(self, labels, preds): label = label.reshape(label.shape[0], -1) pred = pred.reshape(pred.shape[0], -1) - pd = (((label - pred) ** self.p).sum(axis=-1)) ** (1./self.p) + dis = (((label - pred) ** self.p).sum(axis=-1)) ** (1./self.p) if self.average == "macro": - pd = pd.mean() + dis = dis.mean() num_inst = 1 else: - pd = pd.sum() + dis = dis.sum() num_inst = label.shape[0] - self.sum_metric += pd - self.global_sum_metric += pd + self.sum_metric += dis self.num_inst += num_inst - self.global_num_inst += num_inst @register @@ -1613,8 +1393,7 @@ class MeanCosineSimilarity(EvalMetric): def __init__(self, name='cos_sim', output_names=None, label_names=None, eps=1e-8, average="micro"): super(MeanCosineSimilarity, self).__init__( - name, output_names=output_names, label_names=label_names, - has_global_stats=True) + name, output_names=output_names, label_names=label_names) self.average = average self.eps = eps @@ -1651,9 +1430,7 @@ def update(self, labels, preds): sim = sim.sum() num_inst = 
numpy.prod(label.shape[:-1]) self.sum_metric += sim - self.global_sum_metric += sim self.num_inst += num_inst - self.global_num_inst += num_inst @register @@ -1697,8 +1474,7 @@ def __init__(self, eps=1e-12, name='cross-entropy', output_names=None, label_names=None): super(CrossEntropy, self).__init__( name, eps=eps, - output_names=output_names, label_names=label_names, - has_global_stats=True) + output_names=output_names, label_names=label_names) self.eps = eps def update(self, labels, preds): @@ -1724,9 +1500,7 @@ def update(self, labels, preds): prob = pred[numpy.arange(label.shape[0]), numpy.int64(label)] cross_entropy = (-numpy.log(prob + self.eps)).sum() self.sum_metric += cross_entropy - self.global_sum_metric += cross_entropy self.num_inst += label.shape[0] - self.global_num_inst += label.shape[0] @register @alias('nll_loss') @@ -1769,8 +1543,7 @@ def __init__(self, eps=1e-12, name='nll-loss', output_names=None, label_names=None): super(NegativeLogLikelihood, self).__init__( name, eps=eps, - output_names=output_names, label_names=label_names, - has_global_stats=True) + output_names=output_names, label_names=label_names) self.eps = eps def update(self, labels, preds): @@ -1796,9 +1569,8 @@ def update(self, labels, preds): prob = pred[numpy.arange(num_examples, dtype=numpy.int64), numpy.int64(label)] nll = (-numpy.log(prob + self.eps)).sum() self.sum_metric += nll - self.global_sum_metric += nll self.num_inst += num_examples - self.global_num_inst += num_examples + @register @alias('pearsonr') @@ -1838,8 +1610,7 @@ def __init__(self, name='pearsonr', output_names=None, label_names=None, average='macro'): self.average = average super(PearsonCorrelation, self).__init__( - name, output_names=output_names, label_names=label_names, - has_global_stats=True) + name, output_names=output_names, label_names=label_names) if self.average == 'micro': self.reset_micro() @@ -1855,8 +1626,6 @@ def reset_micro(self): def reset(self): self.num_inst = 0 self.sum_metric = 0.0 - self.global_num_inst = 0 - self.global_sum_metric = 0.0 if self.average == 'micro': self.reset_micro() @@ -1891,11 +1660,8 @@ def update(self, labels, preds): if self.average == 'macro': pearson_corr = numpy.corrcoef(pred, label)[0, 1] self.sum_metric += pearson_corr - self.global_sum_metric += pearson_corr self.num_inst += 1 - self.global_num_inst += 1 else: - self.global_num_inst += 1 self.num_inst += 1 self._label_nums, self._mean_l, self._sse_l = \ self.update_variance(label, self._label_nums, self._mean_l, self._sse_l) @@ -1967,18 +1733,14 @@ class PCC(EvalMetric): ('pcc', 0.01917751877733392) """ def __init__(self, name='pcc', - output_names=None, label_names=None, - has_global_stats=True): + output_names=None, label_names=None): self.k = 2 super(PCC, self).__init__( - name=name, output_names=output_names, label_names=label_names, - has_global_stats=has_global_stats) + name=name, output_names=output_names, label_names=label_names) def _grow(self, inc): self.lcm = numpy.pad( self.lcm, ((0, inc), (0, inc)), 'constant', constant_values=(0)) - self.gcm = numpy.pad( - self.gcm, ((0, inc), (0, inc)), 'constant', constant_values=(0)) self.k += inc def _calc_mcc(self, cmat): @@ -2021,27 +1783,14 @@ def update(self, labels, preds): for i, j in zip(pred, label): bcm[i, j] += 1 self.lcm += bcm - self.gcm += bcm - self.num_inst += 1 - self.global_num_inst += 1 @property def sum_metric(self): return self._calc_mcc(self.lcm) * self.num_inst - @property - def global_sum_metric(self): - return self._calc_mcc(self.gcm) * self.global_num_inst 
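Editor's note: after this change `PCC` keeps only the local confusion matrix (`lcm`) and still reports the multiclass generalization of MCC via `_calc_mcc`. A standalone numpy sketch of that computation; the function name and the zero-denominator handling here are illustrative, not the library's exact code:

```python
import numpy as np

def mcc_from_confusion(cmat):
    # cmat[i, j]: number of samples predicted as class i whose true class is j.
    n = cmat.sum()
    x = cmat.sum(axis=1)                 # marginal totals over predicted classes
    y = cmat.sum(axis=0)                 # marginal totals over true classes
    cov_xy = np.sum(cmat.diagonal() * n - x * y)
    cov_xx = np.sum(x * (n - x))
    cov_yy = np.sum(y * (n - y))
    if cov_xx == 0 or cov_yy == 0:
        return 0.0                       # degenerate matrix; the metric may handle this differently
    return cov_xy / np.sqrt(cov_xx * cov_yy)

print(mcc_from_confusion(np.array([[5, 1], [2, 7]])))   # ~0.60 for this toy matrix
```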
- def reset(self): """Resets the internal evaluation result to initial state.""" - self.global_num_inst = 0. - self.gcm = numpy.zeros((self.k, self.k)) - self.reset_local() - - def reset_local(self): - """Resets the local portion of the internal evaluation results to initial state.""" self.num_inst = 0. self.lcm = numpy.zeros((self.k, self.k)) @@ -2064,8 +1813,7 @@ class Loss(EvalMetric): def __init__(self, name='loss', output_names=None, label_names=None): super(Loss, self).__init__( - name, output_names=output_names, label_names=label_names, - has_global_stats=True) + name, output_names=output_names, label_names=label_names) def update(self, _, preds): @@ -2075,9 +1823,7 @@ def update(self, _, preds): for pred in preds: loss = ndarray.sum(pred).asscalar() self.sum_metric += loss - self.global_sum_metric += loss self.num_inst += pred.size - self.global_num_inst += pred.size @register @@ -2143,8 +1889,7 @@ def __init__(self, feval, name=None, allow_extra_outputs=False, super(CustomMetric, self).__init__( name, feval=feval, allow_extra_outputs=allow_extra_outputs, - output_names=output_names, label_names=label_names, - has_global_stats=True) + output_names=output_names, label_names=label_names) self._feval = feval self._allow_extra_outputs = allow_extra_outputs @@ -2170,14 +1915,10 @@ def update(self, labels, preds): if isinstance(reval, tuple): (sum_metric, num_inst) = reval self.sum_metric += sum_metric - self.global_sum_metric += sum_metric self.num_inst += num_inst - self.global_num_inst += num_inst else: self.sum_metric += reval - self.global_sum_metric += reval self.num_inst += 1 - self.global_num_inst += 1 def get_config(self): raise NotImplementedError("CustomMetric cannot be serialized") From c06f3635e63a235f9c0e3e2cae55ebe5595fa792 Mon Sep 17 00:00:00 2001 From: acphile Date: Mon, 20 Apr 2020 04:12:21 +0000 Subject: [PATCH 06/24] remove macro support --- python/mxnet/gluon/metric.py | 232 ++++++++--------------------------- 1 file changed, 53 insertions(+), 179 deletions(-) diff --git a/python/mxnet/gluon/metric.py b/python/mxnet/gluon/metric.py index 953a57894827..07e759e4d1f0 100644 --- a/python/mxnet/gluon/metric.py +++ b/python/mxnet/gluon/metric.py @@ -718,7 +718,7 @@ def reset(self): @register -class Fbeta(EvalMetric): +class Fbeta(F1): """Computes the Fbeta score of a binary classification problem. The Fbeta score is equivalent to harmonic mean of the precision and recall, @@ -766,41 +766,11 @@ class Fbeta(EvalMetric): def __init__(self, name='fbeta', output_names=None, label_names=None, beta=1, threshold=0.5, average="macro"): - self.average = average + super(Fbeta, self).__init__(name=name, + output_names=output_names, label_names=label_names, + threshold=threshold, average=average) self.metrics = _BinaryClassificationMetrics(threshold=threshold, beta=beta) - EvalMetric.__init__(self, name=name, - output_names=output_names, label_names=label_names) - - def update(self, labels, preds): - """Updates the internal evaluation result. - - Parameters - ---------- - labels : list of `NDArray` - The labels of the data. - - preds : list of `NDArray` - Predicted values. 
- """ - labels, preds = check_label_shapes(labels, preds, True) - - for label, pred in zip(labels, preds): - self.metrics.update_binary_stats(label, pred) - - if self.average == "macro": - self.sum_metric += self.metrics.fscore - self.num_inst += 1 - self.metrics.reset_stats() - else: - self.sum_metric = self.metrics.fscore * self.metrics.total_examples - self.num_inst = self.metrics.total_examples - - def reset(self): - """Resets the internal evaluation result to initial state.""" - self.sum_metric = 0. - self.num_inst = 0 - self.metrics.reset_stats() - + @register class BinaryAccuracy(EvalMetric): @@ -892,10 +862,6 @@ class MCC(EvalMetric): label_names : list of str, or None Name of labels that should be used when updating with update_dict. By default include all labels. - average : str, default 'macro' - Strategy to be used for aggregating across mini-batches. - "macro": average the MCC for each batch. - "micro": compute a single MCC across all batches. Examples -------- @@ -925,8 +891,7 @@ class MCC(EvalMetric): """ def __init__(self, name='mcc', - output_names=None, label_names=None, average="macro"): - self._average = average + output_names=None, label_names=None): self._metrics = _BinaryClassificationMetrics() EvalMetric.__init__(self, name=name, output_names=output_names, label_names=label_names) @@ -947,13 +912,8 @@ def update(self, labels, preds): for label, pred in zip(labels, preds): self._metrics.update_binary_stats(label, pred) - if self._average == "macro": - self.sum_metric += self._metrics.matthewscc() - self.num_inst += 1 - self._metrics.reset_stats() - else: - self.sum_metric = self._metrics.matthewscc() * self._metrics.total_examples - self.num_inst = self._metrics.total_examples + self.sum_metric = self._metrics.matthewscc() * self._metrics.total_examples + self.num_inst = self._metrics.total_examples def reset(self): """Resets the internal evaluation result to initial state.""" @@ -1089,14 +1049,11 @@ class MAE(EvalMetric): label_names : list of str, or None Name of labels that should be used when updating with update_dict. By default include all labels. - average : str, default 'macro' - Strategy to be used for aggregating across mini-batches. - "macro": average MAE results for each batch. - "micro": compute a single MAE result across all batches. + Examples -------- - >>> predicts = [mx.nd.array(np.array([3, -0.5, 2, 7]).reshape(4,1))] - >>> labels = [mx.nd.array(np.array([2.5, 0.0, 2, 8]).reshape(4,1))] + >>> predicts = [mx.nd.array([3, -0.5, 2, 7])] + >>> labels = [mx.nd.array([2.5, 0.0, 2, 8])] >>> mean_absolute_error = mx.gluon.metric.MAE() >>> mean_absolute_error.update(labels = labels, preds = predicts) >>> print mean_absolute_error.get() @@ -1104,10 +1061,9 @@ class MAE(EvalMetric): """ def __init__(self, name='mae', - output_names=None, label_names=None, average='macro'): + output_names=None, label_names=None): super(MAE, self).__init__( name, output_names=output_names, label_names=label_names) - self.average = average def update(self, labels, preds): """Updates the internal evaluation result. 
@@ -1126,17 +1082,8 @@ def update(self, labels, preds): label = label.asnumpy() pred = pred.asnumpy() - if len(label.shape) == 1: - label = label.reshape(label.shape[0], 1) - if len(pred.shape) == 1: - pred = pred.reshape(pred.shape[0], 1) - - if self.average == "macro": - mae = numpy.abs(label - pred).mean() - num_inst = 1 - else: - num_inst = label.shape[0] - mae = numpy.abs(label - pred).reshape(num_inst, -1).mean(axis=-1).sum() + num_inst = label.shape[0] + mae = numpy.abs(label - pred).reshape(num_inst, -1).mean(axis=-1).sum() self.sum_metric += mae self.num_inst += num_inst @@ -1161,24 +1108,20 @@ class MSE(EvalMetric): label_names : list of str, or None Name of labels that should be used when updating with update_dict. By default include all labels. - average : str, default 'macro' - Strategy to be used for aggregating across mini-batches. - "macro": average MSE results for each batch. - "micro": compute a single MSE result across all batches. + Examples -------- - >>> predicts = [mx.nd.array(np.array([3, -0.5, 2, 7]).reshape(4,1))] - >>> labels = [mx.nd.array(np.array([2.5, 0.0, 2, 8]).reshape(4,1))] + >>> predicts = [mx.nd.array([3, -0.5, 2, 7])] + >>> labels = [mx.nd.array([2.5, 0.0, 2, 8])] >>> mean_squared_error = mx.gluon.metric.MSE() >>> mean_squared_error.update(labels = labels, preds = predicts) >>> print mean_squared_error.get() ('mse', 0.375) """ def __init__(self, name='mse', - output_names=None, label_names=None, average="macro"): + output_names=None, label_names=None): super(MSE, self).__init__( name, output_names=output_names, label_names=label_names) - self.average = average def update(self, labels, preds): """Updates the internal evaluation result. @@ -1197,23 +1140,15 @@ def update(self, labels, preds): label = label.asnumpy() pred = pred.asnumpy() - if len(label.shape) == 1: - label = label.reshape(label.shape[0], 1) - if len(pred.shape) == 1: - pred = pred.reshape(pred.shape[0], 1) - - if self.average == "macro": - mse = ((label - pred)**2.0).mean() - num_inst = 1 - else: - num_inst = label.shape[0] - mse = ((label - pred)**2.0).reshape(num_inst, -1).mean(axis=-1).sum() + num_inst = label.shape[0] + mse = ((label - pred)**2.0).reshape(num_inst, -1).mean(axis=-1).sum() + self.sum_metric += mse self.num_inst += num_inst @register -class RMSE(EvalMetric): +class RMSE(MSE): """Computes Root Mean Squred Error (RMSE) loss. The root mean squared error is given by @@ -1231,55 +1166,26 @@ class RMSE(EvalMetric): label_names : list of str, or None Name of labels that should be used when updating with update_dict. By default include all labels. - average : str, default 'macro' - Strategy to be used for aggregating across mini-batches. - "macro": average RMSE results for each batch. - "micro": compute a single RSME result across all batches. 
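Editor's note: the regression metrics now aggregate only in "micro" fashion: each sample's error (averaged over that sample's elements) is summed, and the total is divided by the number of samples in `get()`. `RMSE` reuses `MSE`'s accumulator and applies the square root only at reporting time, as the following hunk shows. A small numpy check of that equivalence, reusing the values from the MAE/MSE docstring examples above split into two batches:

```python
import numpy as np

# Accumulate per-sample mean squared errors across two batches, then take the
# square root only when reporting, mirroring the refactored RMSE.
batches = [(np.array([2.5, 0.0]), np.array([3.0, -0.5])),
           (np.array([2.0, 8.0]), np.array([2.0, 7.0]))]
sum_metric, num_inst = 0.0, 0
for label, pred in batches:
    sum_metric += ((label - pred) ** 2).reshape(label.shape[0], -1).mean(axis=-1).sum()
    num_inst += label.shape[0]
print(np.sqrt(sum_metric / num_inst))   # ~0.6124, i.e. sqrt(0.375) from the MSE example
```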
+ Examples -------- - >>> predicts = [mx.nd.array(np.array([3, -0.5, 2, 7]).reshape(4,1))] - >>> labels = [mx.nd.array(np.array([2.5, 0.0, 2, 8]).reshape(4,1))] + >>> predicts = [mx.nd.array([3, -0.5, 2, 7])] + >>> labels = [mx.nd.array([2.5, 0.0, 2, 8])] >>> root_mean_squared_error = mx.gluon.metric.RMSE() >>> root_mean_squared_error.update(labels = labels, preds = predicts) >>> print root_mean_squared_error.get() ('rmse', 0.612372457981) """ def __init__(self, name='rmse', - output_names=None, label_names=None, average="macro"): + output_names=None, label_names=None): super(RMSE, self).__init__( name, output_names=output_names, label_names=label_names) - self.average = average - def update(self, labels, preds): - """Updates the internal evaluation result. - - Parameters - ---------- - labels : list of `NDArray` - The labels of the data. - - preds : list of `NDArray` - Predicted values. - """ - labels, preds = check_label_shapes(labels, preds, True) - - for label, pred in zip(labels, preds): - label = label.asnumpy() - pred = pred.asnumpy() - - if len(label.shape) == 1: - label = label.reshape(label.shape[0], 1) - if len(pred.shape) == 1: - pred = pred.reshape(pred.shape[0], 1) - - if self.average == "macro": - rmse = numpy.sqrt(((label - pred)**2.0).mean()) - num_inst = 1 - else: - num_inst = label.shape[0] - rmse = numpy.sqrt(((label - pred)**2.0).reshape(num_inst, -1).mean(axis=1)).sum() - self.sum_metric += rmse - self.num_inst += num_inst + def get(self): + if self.num_inst == 0: + return (self.name, float('nan')) + else: + return (self.name, math.sqrt(self.sum_metric / self.num_inst)) @register @@ -1303,10 +1209,7 @@ class MeanPairwiseDistance(EvalMetric): By default include all labels. p : float, default 2 calculating distance using the p-norm - average : str, default 'macro' - Strategy to be used for aggregating across mini-batches. - "macro": average MPD results for each batch. - "micro": compute a single MPD result across all batches. + Examples -------- >>> predicts = [mx.nd.array([[1., 2.], [3., 4.]])] @@ -1317,10 +1220,9 @@ class MeanPairwiseDistance(EvalMetric): ('mpd', 2.1180338859558105) """ def __init__(self, name='mpd', - output_names=None, label_names=None, p=2, average="micro"): + output_names=None, label_names=None, p=2): super(MeanPairwiseDistance, self).__init__( name, output_names=output_names, label_names=label_names) - self.average = average self.p = p def update(self, labels, preds): @@ -1344,12 +1246,8 @@ def update(self, labels, preds): pred = pred.reshape(pred.shape[0], -1) dis = (((label - pred) ** self.p).sum(axis=-1)) ** (1./self.p) - if self.average == "macro": - dis = dis.mean() - num_inst = 1 - else: - dis = dis.sum() - num_inst = label.shape[0] + dis = dis.sum() + num_inst = label.shape[0] self.sum_metric += dis self.num_inst += num_inst @@ -1377,10 +1275,6 @@ class MeanCosineSimilarity(EvalMetric): By default include all labels. eps : float, default 1e-8 small vale to avoid division by zero. - average : str, default 'micro' - Strategy to be used for aggregating across mini-batches. - "macro": average RMSE results for each batch. - "micro": compute a single RSME result across all batches. 
Examples -------- >>> predicts = [mx.nd.array([[1., 0.], [1., 1.]])] @@ -1391,10 +1285,9 @@ class MeanCosineSimilarity(EvalMetric): ('cos_sim', 0.8) """ def __init__(self, name='cos_sim', - output_names=None, label_names=None, eps=1e-8, average="micro"): + output_names=None, label_names=None, eps=1e-8): super(MeanCosineSimilarity, self).__init__( name, output_names=output_names, label_names=label_names) - self.average = average self.eps = eps def update(self, labels, preds): @@ -1423,12 +1316,8 @@ def update(self, labels, preds): n_p = numpy.linalg.norm(pred, axis=-1) n_l = numpy.linalg.norm(label, axis=-1) sim = sim / numpy.maximum(n_l * n_p, self.eps) - if self.average == "macro": - sim = sim.mean() - num_inst = 1 - else: - sim = sim.sum() - num_inst = numpy.prod(label.shape[:-1]) + sim = sim.sum() + num_inst = numpy.prod(label.shape[:-1]) self.sum_metric += sim self.num_inst += num_inst @@ -1592,10 +1481,6 @@ class PearsonCorrelation(EvalMetric): label_names : list of str, or None Name of labels that should be used when updating with update_dict. By default include all labels. - average : str, default 'macro' - Strategy to be used for aggregating across mini-batches. - "macro": average the pearsonr scores for each batch. - "micro": compute a single pearsonr score across all batches. Examples -------- @@ -1607,14 +1492,12 @@ class PearsonCorrelation(EvalMetric): ('pearsonr', 0.42163704544016178) """ def __init__(self, name='pearsonr', - output_names=None, label_names=None, average='macro'): - self.average = average + output_names=None, label_names=None): super(PearsonCorrelation, self).__init__( name, output_names=output_names, label_names=label_names) - if self.average == 'micro': - self.reset_micro() + self.reset() - def reset_micro(self): + def reset(self): self._sse_p = 0 self._mean_p = 0 self._sse_l = 0 @@ -1622,12 +1505,9 @@ def reset_micro(self): self._pred_nums = 0 self._label_nums = 0 self._conv = 0 - - def reset(self): + self.num_inst = 0 self.sum_metric = 0.0 - if self.average == 'micro': - self.reset_micro() def update_variance(self, new_values, *aggregate): #Welford's online algorithm for variance update @@ -1657,27 +1537,21 @@ def update(self, labels, preds): check_label_shapes(label, pred, False, True) label = label.asnumpy().ravel().astype(numpy.float64) pred = pred.asnumpy().ravel().astype(numpy.float64) - if self.average == 'macro': - pearson_corr = numpy.corrcoef(pred, label)[0, 1] - self.sum_metric += pearson_corr - self.num_inst += 1 - else: - self.num_inst += 1 - self._label_nums, self._mean_l, self._sse_l = \ - self.update_variance(label, self._label_nums, self._mean_l, self._sse_l) - self.update_cov(label, pred) - self._pred_nums, self._mean_p, self._sse_p = \ - self.update_variance(pred, self._pred_nums, self._mean_p, self._sse_p) + + self.num_inst += 1 + self._label_nums, self._mean_l, self._sse_l = \ + self.update_variance(label, self._label_nums, self._mean_l, self._sse_l) + self.update_cov(label, pred) + self._pred_nums, self._mean_p, self._sse_p = \ + self.update_variance(pred, self._pred_nums, self._mean_p, self._sse_p) def get(self): if self.num_inst == 0: return (self.name, float('nan')) - if self.average == 'macro': - return (self.name, self.sum_metric / self.num_inst) - else: - n = self._label_nums - pearsonr = self._conv / ((n-1) * numpy.sqrt(self._sse_p / (n - 1)) * numpy.sqrt(self._sse_l / (n - 1))) - return (self.name, pearsonr) + + n = self._label_nums + pearsonr = self._conv / ((n-1) * numpy.sqrt(self._sse_p / (n - 1)) * numpy.sqrt(self._sse_l / 
(n - 1))) + return (self.name, pearsonr) @register class PCC(EvalMetric): From 6beba21647e4a4a04a804e167504bbf66e40ff4d Mon Sep 17 00:00:00 2001 From: acphile Date: Mon, 20 Apr 2020 08:53:10 +0000 Subject: [PATCH 07/24] rewrite BinaryAccuracy --- python/mxnet/gluon/metric.py | 64 +++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/python/mxnet/gluon/metric.py b/python/mxnet/gluon/metric.py index 07e759e4d1f0..95d4340b69bf 100644 --- a/python/mxnet/gluon/metric.py +++ b/python/mxnet/gluon/metric.py @@ -508,6 +508,28 @@ def update(self, labels, preds): self.num_inst += num_samples +def predict_with_threshold(pred, threshold=0.5): + """Do thresholding of predictions in binaray and multilabel cases. + + Parameters + ---------- + preds : ndarray + predictions in shape of (batch_size, ...) or (batch_size, ..., num_categories) + + preds : float or ndarray + threshold(s) in shape of float or (num_categories) + """ + if isinstance(threshold, float): + return pred > threshold + elif isinstance(threshold, numpy.ndarray) or isinstance(threshold, ndarray.ndarray.NDArray): + num_classes = pred.shape[-1] + assert threshold.shape[-1] == num_classes, \ + "shape mismatch: %s vs. %s"%(pred.shape[-1], threshold.shape[-1]) + return pred > threshold + else: + raise ValueError("{} is a wrong type for threshold!".format(type(threshold))) + + class _BinaryClassificationMetrics(object): """Private container class for classification metric statistics. @@ -620,13 +642,6 @@ def total_examples(self): return self.false_negatives + self.false_positives + \ self.true_negatives + self.true_positives - @property - def accuracy(self): - if self.total_examples > 0: - return float(self.true_positives + self.true_negatives) / self.total_examples - else: - return 0. - def reset_stats(self): self.false_positives = 0 self.false_negatives = 0 @@ -680,7 +695,7 @@ class F1(EvalMetric): """ def __init__(self, name='f1', - output_names=None, label_names=None, threshold=0.5, average="macro"): + output_names=None, label_names=None, threshold=0.5, average="micro"): self.average = average self.metrics = _BinaryClassificationMetrics(threshold=threshold) EvalMetric.__init__(self, name=name, @@ -765,7 +780,7 @@ class Fbeta(F1): """ def __init__(self, name='fbeta', - output_names=None, label_names=None, beta=1, threshold=0.5, average="macro"): + output_names=None, label_names=None, beta=1, threshold=0.5, average="micro"): super(Fbeta, self).__init__(name=name, output_names=output_names, label_names=label_names, threshold=threshold, average=average) @@ -774,7 +789,7 @@ def __init__(self, name='fbeta', @register class BinaryAccuracy(EvalMetric): - """Computes the accuracy of a binary classification problem. + """Computes the accuracy of a binary or multilabel classification problem. Parameters ---------- @@ -786,7 +801,7 @@ class BinaryAccuracy(EvalMetric): label_names : list of str, or None Name of labels that should be used when updating with update_dict. By default include all labels. - threshold : float, default 0.5 + threshold : float or ndarray, default 0.5 threshold for deciding whether the predictions are positive or negative. 
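Editor's note: the next hunk rewrites `BinaryAccuracy` to threshold raw confidences directly through `predict_with_threshold`, which also accepts a per-class threshold vector for multilabel targets. A hedged usage sketch, assuming an MXNet build that includes this patch series; the per-class thresholds rely on broadcasting in the comparison, and the numbers are made up:

```python
import mxnet as mx

# Multilabel targets: one 0/1 entry per class, thresholded with per-class cut-offs.
metric = mx.gluon.metric.BinaryAccuracy(threshold=mx.nd.array([0.3, 0.5, 0.7]))
label = mx.nd.array([[1, 0, 1],
                     [0, 1, 0]])
pred  = mx.nd.array([[0.4, 0.6, 0.9],    # -> [1, 1, 1] after thresholding
                     [0.2, 0.4, 0.1]])   # -> [0, 0, 0]
metric.update([label], [pred])
print(metric.get())   # fraction of per-class decisions matching the labels (4/6 here)
```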
Examples @@ -801,7 +816,7 @@ class BinaryAccuracy(EvalMetric): def __init__(self, name='binary_accuracy', output_names=None, label_names=None, threshold=0.5): - self.metrics = _BinaryClassificationMetrics(threshold=threshold) + self.threshold = threshold EvalMetric.__init__(self, name=name, output_names=output_names, label_names=label_names) @@ -811,24 +826,27 @@ def update(self, labels, preds): Parameters ---------- labels : list of `NDArray` - The labels of the data. + Each label denotes positive/negative for each class. preds : list of `NDArray` - Predicted values. + Each prediction value is a confidence value of being positive for each class. """ labels, preds = check_label_shapes(labels, preds, True) - for label, pred in zip(labels, preds): - self.metrics.update_binary_stats(label, pred) + for label, pred_label in zip(labels, preds): + pred_label = predict_with_threshold(pred_label, self.threshold) + + pred_label = pred_label.asnumpy().astype('int32') + label = label.asnumpy().astype('int32') + # flatten before checking shapes to avoid shape miss match + label = label.flat + pred_label = pred_label.flat - self.sum_metric = self.metrics.accuracy * self.metrics.total_examples - self.num_inst = self.metrics.total_examples + check_label_shapes(label, pred_label) - def reset(self): - """Resets the internal evaluation result to initial state.""" - self.sum_metric = 0. - self.num_inst = 0 - self.metrics.reset_stats() + num_correct = (pred_label == label).sum() + self.sum_metric += num_correct + self.num_inst += len(pred_label) @register From b1fc42b9227826934545da0335a14422f3c5a230 Mon Sep 17 00:00:00 2001 From: acphile Date: Tue, 21 Apr 2020 05:40:38 +0000 Subject: [PATCH 08/24] extend F1 to multiclass/multilabel --- python/mxnet/gluon/metric.py | 189 +++++++++++++++++++++++------------ 1 file changed, 126 insertions(+), 63 deletions(-) diff --git a/python/mxnet/gluon/metric.py b/python/mxnet/gluon/metric.py index 95d4340b69bf..8214ad86ae67 100644 --- a/python/mxnet/gluon/metric.py +++ b/python/mxnet/gluon/metric.py @@ -509,7 +509,7 @@ def update(self, labels, preds): def predict_with_threshold(pred, threshold=0.5): - """Do thresholding of predictions in binaray and multilabel cases. + """Do thresholding of predictions in binary and multilabel cases. Parameters ---------- @@ -529,8 +529,12 @@ def predict_with_threshold(pred, threshold=0.5): else: raise ValueError("{} is a wrong type for threshold!".format(type(threshold))) + +def one_hot(x, m): + return (numpy.arange(m)==x[:,None]).astype('int32') -class _BinaryClassificationMetrics(object): + +class _ClassificationMetrics(object): """Private container class for classification metric statistics. True/false positive and true/false negative counts are sufficient statistics for various classification metrics. @@ -539,6 +543,10 @@ class _BinaryClassificationMetrics(object): Parameters ---------- + class_type : str, default "binary" + "binary": f1 for binary classification. + "multiclass": f1 for multiclassification problem. + "multilabel": f1 for multilabel classification. beta : float, default 1 weight of precision in harmonic mean. 
threshold : float, default 0.5 @@ -546,15 +554,23 @@ class _BinaryClassificationMetrics(object): """ - def __init__(self, threshold=0.5, beta=1): + def __init__(self, class_type="binary", threshold=0.5, beta=1): + self.class_type = class_type self.threshold = threshold self.beta = beta - self.true_positives = 0 - self.false_negatives = 0 - self.false_positives = 0 - self.true_negatives = 0 - - def update_binary_stats(self, label, pred): + self.reset_stats() + + def _set(self, num): + if self.num_classes is None: + self.num_classes = num + self.true_positives = numpy.zeros(num) + self.false_negatives = numpy.zeros(num) + self.false_positives = numpy.zeros(num) + self.true_negatives = numpy.zeros(num) + else: + assert self.num_classes == num, "Input number of classes has changed from {} to {}".format(self.num_classes, num) + + def update_stats(self, label, pred): """Update various binary classification counts for a single (label, pred) pair. Parameters @@ -567,31 +583,46 @@ def update_binary_stats(self, label, pred): """ pred = pred.asnumpy() label = label.asnumpy().astype('int32') - if len(pred.shape) == 1: # assume each value refers to confidence(positive) - pass - elif pred.shape[-1] > 2: - raise ValueError("%s currently only supports binary classification." - % self.__class__.__name__) - elif pred.shape[-1] == 1: # classify positive when confidence(positive) > threshold - pred = pred.flat + if self.class_type == "binary": + self._set(1) + if len(numpy.unique(label)) > 2: + raise ValueError("Wrong label for binary classification.") + if pred.shape == label.shape: + pass + elif pred.shape[-1] > 2: + raise ValueError("The shape of prediction {} is wrong for binary classification.".format(pred.shape)) + elif pred.shape[-1] == 2: + pred = pred.reshape(-1, 2)[:, 1] + pred_label = predict_with_threshold(pred, self.threshold).flat + label = label.flat + + elif self.class_type == "multiclass": + num = pred.shape[-1] + self._set(num) + assert label.max() < num, "pred contains fewer classes than label!" + pred_label = one_hot(pred.argmax(axis=-1).reshape(-1), num) + label = one_hot(label.reshape(-1), num) + + elif self.class_type == "multilabel": + num = pred.shape[-1] + self._set(num) + assert pred.shape == label.shape, "The shape of label should be same as that of prediction for multilabel classification." + pred_label = predict_with_threshold(pred, self.threshold).reshape(-1, num) + label = label.reshape(-1, num) else: - pred = pred.reshape(-1, 2)[:, 1] - pred_label = pred > self.threshold - label = label.flat - + raise ValueError("Wrong class_type {}! Only supports ['binary', 'multiclass', 'multilabel']".format(self.class_type)) + check_label_shapes(label, pred_label) - if len(numpy.unique(label)) > 2: - raise ValueError("%s currently only supports binary classification." 
- % self.__class__.__name__) + pred_true = (pred_label == 1) pred_false = 1 - pred_true label_true = (label == 1) label_false = 1 - label_true - true_pos = (pred_true * label_true).sum() - false_pos = (pred_true * label_false).sum() - false_neg = (pred_false * label_true).sum() - true_neg = (pred_false * label_false).sum() + true_pos = (pred_true * label_true).sum(0) + false_pos = (pred_true * label_false).sum(0) + false_neg = (pred_false * label_true).sum(0) + true_neg = (pred_false * label_false).sum(0) self.true_positives += true_pos self.false_positives += false_pos self.false_negatives += false_neg @@ -599,25 +630,44 @@ def update_binary_stats(self, label, pred): @property def precision(self): - if self.true_positives + self.false_positives > 0: - return float(self.true_positives) / (self.true_positives + self.false_positives) + if self.num_classes is not None: + return self.true_positives / numpy.maximum(self.true_positives + self.false_positives, 1e-12) else: return 0. + @property + def global_precision(self): + if self.num_classes is not None: + return self.true_positives.sum() / numpy.maximum(self.true_positives.sum() + self.false_positives.sum(), 1e-12) + else: + return 0. + @property def recall(self): - if self.true_positives + self.false_negatives > 0: - return float(self.true_positives) / (self.true_positives + self.false_negatives) + if self.num_classes is not None: + return self.true_positives / numpy.maximum(self.true_positives + self.false_negatives, 1e-12) else: return 0. @property - def fscore(self): - if self.precision + self.recall > 0: - return (1 + self.beta ** 2) * self.precision * self.recall / (self.beta ** 2 * self.precision + self.recall) + def global_recall(self): + if self.num_classes is not None: + return self.true_positives.sum() / numpy.maximum(self.true_positives.sum() + self.false_negatives.sum(), 1e-12) else: return 0. + + @property + def fscore(self): + return (1 + self.beta ** 2) * self.precision * self.recall / numpy.maximum(self.beta ** 2 * self.precision + self.recall, 1e-12) + @property + def global_fscore(self): + if self.global_precision + self.global_recall > 0: + return (1 + self.beta ** 2) * self.global_precision * self.global_recall / \ + (self.beta ** 2 * self.global_precision + self.global_recall) + else: + return 0. + def matthewscc(self): """Calculate the Matthew's Correlation Coefficent""" if not self.total_examples: @@ -639,14 +689,17 @@ def matthewscc(self): @property def total_examples(self): - return self.false_negatives + self.false_positives + \ - self.true_negatives + self.true_positives + if self.num_classes is None: + return 0 + return self.false_negatives[0] + self.false_positives[0] + \ + self.true_negatives[0] + self.true_positives[0] def reset_stats(self): - self.false_positives = 0 - self.false_negatives = 0 - self.true_positives = 0 - self.true_negatives = 0 + self.num_classes = None + self.true_positives = None + self.false_negatives = None + self.false_positives = None + self.true_negatives = None @register @@ -677,12 +730,17 @@ class F1(EvalMetric): label_names : list of str, or None Name of labels that should be used when updating with update_dict. By default include all labels. + class_type : str, default "binary" + "binary": f1 for binary classification. + "multiclass": f1 for multiclassification problem. + "multilabel": f1 for multilabel classification. threshold : float, default 0.5 threshold for postive confidence value. 
- average : str, default 'macro' + average : str, default 'micro' Strategy to be used for aggregating across mini-batches. - "macro": average the F1 scores for each batch. - "micro": compute a single F1 score across all batches. + "macro": Calculate metrics for each label and return unweighted mean of f1. + "micro": Calculate metrics globally by counting the total true positives, false negatives and false positives. + None: Return f1 scores for each class (numpy.ndarray) . Examples -------- @@ -695,9 +753,9 @@ class F1(EvalMetric): """ def __init__(self, name='f1', - output_names=None, label_names=None, threshold=0.5, average="micro"): + output_names=None, label_names=None, class_type="binary", threshold=0.5, average="micro"): self.average = average - self.metrics = _BinaryClassificationMetrics(threshold=threshold) + self.metrics = _ClassificationMetrics(class_type=class_type, threshold=threshold) EvalMetric.__init__(self, name=name, output_names=output_names, label_names=label_names) @@ -715,16 +773,16 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred in zip(labels, preds): - self.metrics.update_binary_stats(label, pred) + self.metrics.update_stats(label, pred) - if self.average == "macro": - self.sum_metric += self.metrics.fscore - self.num_inst += 1 - self.metrics.reset_stats() + if self.average == "micro": + self.sum_metric = self.metrics.global_fscore * self.metrics.total_examples + elif self.average == "macro": + self.sum_metric = self.metrics.fscore.mean() * self.metrics.total_examples else: - self.sum_metric = self.metrics.fscore * self.metrics.total_examples - self.num_inst = self.metrics.total_examples - + self.sum_metric = self.metrics.fscore * self.metrics.total_examples + self.num_inst = self.metrics.total_examples + def reset(self): """Resets the internal evaluation result to initial state.""" self.sum_metric = 0. @@ -760,14 +818,19 @@ class Fbeta(F1): label_names : list of str, or None Name of labels that should be used when updating with update_dict. By default include all labels. + class_type : str, default "binary" + "binary": f1 for binary classification. + "multiclass": f1 for multiclassification problem. + "multilabel": f1 for multilabel classification. beta : float, default 1 - weight of precision in harmonic mean. + weight of precision in harmonic mean. threshold : float, default 0.5 - threshold for deciding whether the predictions are positive or negative. - average : str, default 'macro' + threshold for postive confidence value. + average : str, default 'micro' Strategy to be used for aggregating across mini-batches. - "macro": average the F1 scores for each batch. - "micro": compute a single F1 score across all batches. + "macro": Calculate metrics for each label and return unweighted mean of f1. + "micro": Calculate metrics globally by counting the total true positives, false negatives and false positives. + None: Return f1 scores for each class. 
Examples -------- @@ -780,11 +843,11 @@ class Fbeta(F1): """ def __init__(self, name='fbeta', - output_names=None, label_names=None, beta=1, threshold=0.5, average="micro"): + output_names=None, label_names=None, class_type="binary", beta=1, threshold=0.5, average="micro"): super(Fbeta, self).__init__(name=name, output_names=output_names, label_names=label_names, - threshold=threshold, average=average) - self.metrics = _BinaryClassificationMetrics(threshold=threshold, beta=beta) + class_type=class_type, threshold=threshold, average=average) + self.metrics = _ClassificationMetrics(class_type=class_type, threshold=threshold, beta=beta) @register @@ -910,7 +973,7 @@ class MCC(EvalMetric): def __init__(self, name='mcc', output_names=None, label_names=None): - self._metrics = _BinaryClassificationMetrics() + self._metrics = _ClassificationMetrics() EvalMetric.__init__(self, name=name, output_names=output_names, label_names=label_names) @@ -928,7 +991,7 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred in zip(labels, preds): - self._metrics.update_binary_stats(label, pred) + self._metrics.update_stats(label, pred) self.sum_metric = self._metrics.matthewscc() * self._metrics.total_examples self.num_inst = self._metrics.total_examples From 4b091b088916f9bf74411d26384b36b18f790fd1 Mon Sep 17 00:00:00 2001 From: acphile Date: Tue, 21 Apr 2020 07:04:49 +0000 Subject: [PATCH 09/24] add tests for new F1, remove global tests --- tests/python/unittest/test_metric.py | 183 ++++++++++++--------------- 1 file changed, 79 insertions(+), 104 deletions(-) diff --git a/tests/python/unittest/test_metric.py b/tests/python/unittest/test_metric.py index 3408dd503d59..81f57f0eed6d 100644 --- a/tests/python/unittest/test_metric.py +++ b/tests/python/unittest/test_metric.py @@ -43,83 +43,6 @@ def test_metrics(): composite = mx.gluon.metric.create(['acc', 'f1']) check_metric(composite) -def _check_global_metric(metric, *args, **kwargs): - def _create_pred_label(): - if use_same_shape: - pred = mx.nd.random.uniform(0, 1, shape=shape) - label = mx.nd.random.uniform(0, 1, shape=shape) - else: - # Make a random prediction - idx = np.random.rand(*shape).argsort(1) - pred = mx.nd.array(1 - 0.1 * idx) - # Label is half 1 and half 0 - # Setting all 0s or all 1s would make either - # MCC or F1 metrics always produce 0 - label = mx.nd.ones(shape[0]) - label[:shape[0] // 2] = 0 - return pred, label - - def _compare_metric_result(m1, m2): - # Compare names - assert m1[0] == m2[0] - # Compare values - if isinstance(m1[1], (list, tuple)): - assert len(m1[1]) == len(m2[1]) - for r1, r2 in zip(m1[1], m2[1]): - assert r1 == r2 or \ - (math.isnan(r1) and - math.isnan(r2)) - else: - assert m1[1] == m2[1] or \ - (math.isnan(m1[1]) and - math.isnan(m2[1])) - - shape = kwargs.pop('shape', (10,10)) - use_same_shape = kwargs.pop('use_same_shape', False) - m1 = mx.gluon.metric.create(metric, *args, **kwargs) - m2 = deepcopy(m1) - # check that global stats are not reset when calling - # reset_local() - for i in range(10): - pred, label = _create_pred_label() - m1.update([label], [pred]) - m1.reset_local() - m2.update([label], [pred]) - assert m1.get_global() == m2.get() - - # check that reset_local() properly resets the local state - m1.reset_local() - m2.reset() - pred, label = _create_pred_label() - m1.update([label], [pred]) - m1.reset_local() - pred, label = _create_pred_label() - m1.update([label], [pred]) - m2.update([label], [pred]) - _compare_metric_result(m1.get(), m2.get()) - 
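For reference, a minimal numpy-only sketch (not part of the patch; the per-class counts below are invented) of how the "macro" and "micro" averaging strategies described in the F1/Fbeta docstrings above combine per-class statistics:

import numpy as np

def fbeta(precision, recall, beta=1.0):
    # F-beta from precision/recall, with the same small denominator floor
    # used by _ClassificationMetrics.fscore to avoid division by zero
    return (1 + beta ** 2) * precision * recall / np.maximum(
        beta ** 2 * precision + recall, 1e-12)

# made-up per-class sufficient statistics accumulated over mini-batches
tp = np.array([1., 1., 0.])
fp = np.array([0., 1., 2.])
fn = np.array([1., 0., 2.])

# "macro": compute F-beta for each class, then take the unweighted mean
precision = tp / np.maximum(tp + fp, 1e-12)
recall = tp / np.maximum(tp + fn, 1e-12)
macro = fbeta(precision, recall).mean()

# "micro": pool the counts over all classes first, then compute one F-beta
micro_p = tp.sum() / max(tp.sum() + fp.sum(), 1e-12)
micro_r = tp.sum() / max(tp.sum() + fn.sum(), 1e-12)
micro = fbeta(micro_p, micro_r)

print(macro, micro)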
-@with_seed() -def test_global_metric(): - _check_global_metric('acc') - _check_global_metric('TopKAccuracy', top_k=3) - _check_global_metric('f1', shape=(10,2)) - _check_global_metric('f1', shape=(10,2), average='micro') - _check_global_metric('mcc', shape=(10,2)) - _check_global_metric('mcc', shape=(10,2), average='micro') - _check_global_metric('perplexity', -1) - _check_global_metric('pearsonr', use_same_shape=True) - _check_global_metric('pcc', shape=(10,2)) - _check_global_metric('nll_loss') - _check_global_metric('loss') - _check_global_metric('ce') - _check_global_metric('mae', use_same_shape=True) - _check_global_metric('mse', use_same_shape=True) - _check_global_metric('rmse', use_same_shape=True) - def custom_metric(label, pred): - return np.mean(np.abs(label-pred)) - _check_global_metric(custom_metric, use_same_shape=True) - _check_global_metric(['acc', 'f1'], shape=(10,2)) - def test_nll_loss(): metric = mx.gluon.metric.create('nll_loss') pred = mx.nd.array([[0.2, 0.3, 0.5], [0.6, 0.1, 0.3]]) @@ -159,7 +82,7 @@ def test_loss_update(): _, acc2 = metric2.get() assert acc1 == acc2 -def test_f1(): +def test_binary_f1(): microF1 = mx.gluon.metric.create("f1", average="micro") macroF1 = mx.gluon.metric.F1(average="macro") @@ -191,7 +114,7 @@ def test_f1(): microF1.update([label11, label12], [pred11, pred12]) macroF1.update([label11, label12], [pred11, pred12]) assert microF1.num_inst == 4 - assert macroF1.num_inst == 1 + assert macroF1.num_inst == 4 # f1 = 2 * tp / (2 * tp + fp + fn) fscore1 = 2. * (1) / (2 * 1 + 1 + 0) np.testing.assert_almost_equal(microF1.get()[1], fscore1) @@ -200,29 +123,96 @@ def test_f1(): microF1.update([label21, label22], [pred21, pred22]) macroF1.update([label21, label22], [pred21, pred22]) assert microF1.num_inst == 6 - assert macroF1.num_inst == 2 + assert macroF1.num_inst == 6 fscore2 = 2. * (1) / (2 * 1 + 0 + 0) fscore_total = 2. * (1 + 1) / (2 * (1 + 1) + (1 + 0) + (0 + 0)) np.testing.assert_almost_equal(microF1.get()[1], fscore_total) - np.testing.assert_almost_equal(macroF1.get()[1], (fscore1 + fscore2) / 2.) + np.testing.assert_almost_equal(macroF1.get()[1], fscore_total) + +def test_multiclass_f1(): + microF1 = mx.gluon.metric.create("f1", class_type="multiclass", average="micro") + macroF1 = mx.gluon.metric.F1(class_type="multiclass", average="macro") + + assert np.isnan(macroF1.get()[1]) + assert np.isnan(microF1.get()[1]) + + # check one class is zero + pred = mx.nd.array([[0.9, 0.1], + [0.8, 0.2]]) + label = mx.nd.array([0, 0]) + macroF1.update([label], [pred]) + microF1.update([label], [pred]) + assert macroF1.get()[1] == 0.5 # one class is 1.0, the other is 0. 
(divided by 0) + assert microF1.get()[1] == 1.0 # globally f1 is 1.0 + macroF1.reset() + microF1.reset() + + # test case from sklearn, here pred is probabilistic distributions instead of predicted labels + pred11 = mx.nd.array([[1, 0, 0], [0, 1, 0]]) + label11 = mx.nd.array([0, 2]) + pred12 = mx.nd.array([[0, 0, 1], [1, 0, 0], [0, 1, 0], [0, 0, 1]]) + label12 = mx.nd.array([1, 0, 0, 1]) + + microF1.update([label11, label12], [pred11, pred12]) + macroF1.update([label11, label12], [pred11, pred12]) + assert microF1.num_inst == 6 + assert macroF1.num_inst == 6 + + from sklearn.metrics import f1_score + overall_pred = [0, 1, 2, 0, 1, 2] + overall_label = [0, 2, 1, 0, 0, 1] + fmacro = f1_score(overall_label, overall_pred, average="macro") + fmicro = f1_score(overall_label, overall_pred, average="micro") + np.testing.assert_almost_equal(microF1.get()[1], fmicro) + np.testing.assert_almost_equal(macroF1.get()[1], fmacro) + +def test_multilabel_f1(): + microF1 = mx.gluon.metric.create("f1", class_type="multilabel", average="micro") + macroF1 = mx.gluon.metric.F1(class_type="multilabel", average="macro") + + assert np.isnan(macroF1.get()[1]) + assert np.isnan(microF1.get()[1]) + + # check one class is zero + pred = mx.nd.array([[0.9, 0.1], + [0.8, 0.2]]) + label = mx.nd.array([[1, 1], [1, 1]]) + macroF1.update([label], [pred]) + microF1.update([label], [pred]) + assert macroF1.get()[1] == 0.5 # one class is 1.0, the other is 0. (divided by 0) + assert microF1.get()[1] == 2.0 / 3 + macroF1.reset() + microF1.reset() + pred11 = mx.nd.array([[0.9, 0.4, 0.3], [0.2, 0.7, 0.8]]) + label11 = mx.nd.array([[1, 0, 1], [0, 0, 1]]) + pred12 = mx.nd.array([[0.6, 0.6, 0.7]]) + label12 = mx.nd.array([[0, 1, 1]]) + + microF1.update([label11, label12], [pred11, pred12]) + macroF1.update([label11, label12], [pred11, pred12]) + assert microF1.num_inst == 3 + assert macroF1.num_inst == 3 + from sklearn.metrics import f1_score + overall_pred = [[1, 0, 0], [0, 1, 1], [1, 1, 1]] + overall_label = [[1, 0, 1], [0, 0, 1], [0, 1, 1]] + fmacro = f1_score(overall_label, overall_pred, average="macro") + fmicro = f1_score(overall_label, overall_pred, average="micro") + np.testing.assert_almost_equal(microF1.get()[1], fmicro) + np.testing.assert_almost_equal(macroF1.get()[1], fmacro) + def test_mcc(): - microMCC = mx.gluon.metric.create("mcc", average="micro") - macroMCC = mx.gluon.metric.MCC(average="macro") + microMCC = mx.gluon.metric.create("mcc") assert np.isnan(microMCC.get()[1]) - assert np.isnan(macroMCC.get()[1]) - + # check divide by zero pred = mx.nd.array([[0.9, 0.1], [0.8, 0.2]]) label = mx.nd.array([0, 0]) microMCC.update([label], [pred]) - macroMCC.update([label], [pred]) assert microMCC.get()[1] == 0.0 - assert macroMCC.get()[1] == 0.0 microMCC.reset() - macroMCC.reset() pred11 = mx.nd.array([[0.1, 0.9], [0.5, 0.5]]) @@ -235,24 +225,18 @@ def test_mcc(): pred22 = mx.nd.array([[0.2, 0.8]]) label22 = mx.nd.array([1]) microMCC.update([label11, label12], [pred11, pred12]) - macroMCC.update([label11, label12], [pred11, pred12]) assert microMCC.num_inst == 4 - assert macroMCC.num_inst == 1 tp1 = 1; fp1 = 0; fn1 = 1; tn1=2 mcc1 = (tp1*tn1 - fp1*fn1) / np.sqrt((tp1+fp1)*(tp1+fn1)*(tn1+fp1)*(tn1+fn1)) np.testing.assert_almost_equal(microMCC.get()[1], mcc1) - np.testing.assert_almost_equal(macroMCC.get()[1], mcc1) microMCC.update([label21, label22], [pred21, pred22]) - macroMCC.update([label21, label22], [pred21, pred22]) assert microMCC.num_inst == 6 - assert macroMCC.num_inst == 2 tp2 = 1; fp2 = 0; fn2 = 0; tn2=1 mcc2 = 
(tp2*tn2 - fp2*fn2) / np.sqrt((tp2+fp2)*(tp2+fn2)*(tn2+fp2)*(tn2+fn2)) tpT = tp1+tp2; fpT = fp1+fp2; fnT = fn1+fn2; tnT = tn1+tn2; mccT = (tpT*tnT - fpT*fnT) / np.sqrt((tpT+fpT)*(tpT+fnT)*(tnT+fpT)*(tnT+fnT)) np.testing.assert_almost_equal(microMCC.get()[1], mccT) - np.testing.assert_almost_equal(macroMCC.get()[1], .5*(mcc1+mcc2)) def test_perplexity(): pred = mx.nd.array([[0.8, 0.2], [0.2, 0.8], [0, 1.]]) @@ -269,17 +253,12 @@ def test_pearsonr(): label1 = mx.nd.array([[1, 0], [0, 1], [0, 1]]) pearsonr_expected_np = np.corrcoef(pred1.asnumpy().ravel(), label1.asnumpy().ravel())[0, 1] pearsonr_expected_scipy, _ = pearsonr(pred1.asnumpy().ravel(), label1.asnumpy().ravel()) - macro_pr = mx.gluon.metric.create('pearsonr', average='macro') - micro_pr = mx.gluon.metric.create('pearsonr', average='micro') + micro_pr = mx.gluon.metric.create('pearsonr') - assert np.isnan(macro_pr.get()[1]) assert np.isnan(micro_pr.get()[1]) - macro_pr.update([label1], [pred1]) micro_pr.update([label1], [pred1]) - np.testing.assert_almost_equal(macro_pr.get()[1], pearsonr_expected_np) - np.testing.assert_almost_equal(macro_pr.get()[1], pearsonr_expected_scipy) np.testing.assert_almost_equal(micro_pr.get()[1], pearsonr_expected_np) np.testing.assert_almost_equal(micro_pr.get()[1], pearsonr_expected_scipy) @@ -292,11 +271,7 @@ def test_pearsonr(): pearsonr_expected_np = np.corrcoef(pred12.asnumpy().ravel(), label12.asnumpy().ravel())[0, 1] pearsonr_expected_scipy, _ = pearsonr(pred12.asnumpy().ravel(), label12.asnumpy().ravel()) - macro_pr.reset() micro_pr.update([label2], [pred2]) - macro_pr.update([label12], [pred12]) - np.testing.assert_almost_equal(macro_pr.get()[1], pearsonr_expected_np) - np.testing.assert_almost_equal(macro_pr.get()[1], pearsonr_expected_scipy) np.testing.assert_almost_equal(micro_pr.get()[1], pearsonr_expected_np) np.testing.assert_almost_equal(micro_pr.get()[1], pearsonr_expected_scipy) From 1dfe0e0376431bbced789b77652e37cf3bb2092e Mon Sep 17 00:00:00 2001 From: acphile Date: Wed, 22 Apr 2020 09:56:40 +0000 Subject: [PATCH 10/24] use mxnet.numpy instead of numpy --- python/mxnet/gluon/metric.py | 115 ++++++++++++++++----------- tests/python/unittest/test_metric.py | 3 +- 2 files changed, 69 insertions(+), 49 deletions(-) diff --git a/python/mxnet/gluon/metric.py b/python/mxnet/gluon/metric.py index 8214ad86ae67..83da4eb9e28f 100644 --- a/python/mxnet/gluon/metric.py +++ b/python/mxnet/gluon/metric.py @@ -22,7 +22,8 @@ import math from collections import OrderedDict -import numpy +from .. import numpy +from ..util import use_np from ..base import numeric_types, string_types from .. import ndarray @@ -359,6 +360,7 @@ def get_config(self): @register @alias('acc') +@use_np class Accuracy(EvalMetric): """Computes accuracy classification score. 
@@ -414,11 +416,11 @@ def update(self, labels, preds): for label, pred_label in zip(labels, preds): if pred_label.shape != label.shape: pred_label = ndarray.argmax(pred_label, axis=self.axis) - pred_label = pred_label.asnumpy().astype('int32') - label = label.asnumpy().astype('int32') + pred_label = pred_label.as_np_ndarray().astype('int32') + label = label.as_np_ndarray().astype('int32') # flatten before checking shapes to avoid shape miss match - label = label.flat - pred_label = pred_label.flat + label = label.reshape(-1) + pred_label = pred_label.reshape(-1) check_label_shapes(label, pred_label) @@ -429,6 +431,7 @@ def update(self, labels, preds): @register @alias('top_k_accuracy', 'top_k_acc') +@use_np class TopKAccuracy(EvalMetric): """Computes top k predictions accuracy. @@ -491,19 +494,19 @@ def update(self, labels, preds): # we do not care about the order of top k elements. It is # much faster, which is important since that computation is # single-threaded due to Python GIL. - pred_label = numpy.argpartition(pred_label.asnumpy().astype('float32'), -self.top_k) - label = label.asnumpy().astype('int32') + pred_label = numpy.argpartition(pred_label.as_np_ndarray().astype('float32'), -self.top_k) + label = label.as_np_ndarray().astype('int32') check_label_shapes(label, pred_label) num_samples = pred_label.shape[0] num_dims = len(pred_label.shape) if num_dims == 1: - num_correct = (pred_label.flat == label.flat).sum() + num_correct = (pred_label.reshape(-1) == label.reshape(-1)).sum() self.sum_metric += num_correct elif num_dims == 2: num_classes = pred_label.shape[1] top_k = min(num_classes, self.top_k) for j in range(top_k): - num_correct = (pred_label[:, num_classes - 1 - j].flat == label.flat).sum() + num_correct = (pred_label[:, num_classes - 1 - j].reshape(-1) == label.reshape(-1)).sum() self.sum_metric += num_correct self.num_inst += num_samples @@ -530,10 +533,11 @@ def predict_with_threshold(pred, threshold=0.5): raise ValueError("{} is a wrong type for threshold!".format(type(threshold))) -def one_hot(x, m): - return (numpy.arange(m)==x[:,None]).astype('int32') +def one_hot(idx, num): + return (numpy.arange(num).astype(idx) == idx[:,None]).astype('int32') - + +@use_np class _ClassificationMetrics(object): """Private container class for classification metric statistics. @@ -581,8 +585,8 @@ def update_stats(self, label, pred): pred : `NDArray` Predicted values. """ - pred = pred.asnumpy() - label = label.asnumpy().astype('int32') + pred = pred.as_np_ndarray() + label = label.as_np_ndarray().astype('int32') if self.class_type == "binary": self._set(1) if len(numpy.unique(label)) > 2: @@ -593,8 +597,8 @@ def update_stats(self, label, pred): raise ValueError("The shape of prediction {} is wrong for binary classification.".format(pred.shape)) elif pred.shape[-1] == 2: pred = pred.reshape(-1, 2)[:, 1] - pred_label = predict_with_threshold(pred, self.threshold).flat - label = label.flat + pred_label = predict_with_threshold(pred, self.threshold).reshape(-1) + label = label.reshape(-1) elif self.class_type == "multiclass": num = pred.shape[-1] @@ -615,9 +619,9 @@ def update_stats(self, label, pred): check_label_shapes(label, pred_label) pred_true = (pred_label == 1) - pred_false = 1 - pred_true + pred_false = (pred_label == 0) label_true = (label == 1) - label_false = 1 - label_true + label_false = (label == 0) true_pos = (pred_true * label_true).sum(0) false_pos = (pred_true * label_false).sum(0) @@ -668,7 +672,7 @@ def global_fscore(self): else: return 0. 
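As an illustrative aside (plain numpy with invented values; not part of the patch), this is how the multiclass branch of update_stats() above reduces one-hot indicator matrices to per-class true/false positive and false negative counts:

import numpy as np

def one_hot(idx, num):
    # same idea as the one_hot helper added above
    return (np.arange(num) == idx[:, None]).astype('int32')

pred = np.array([[0.9, 0.1, 0.0],      # predicted class 0
                 [0.2, 0.7, 0.1],      # predicted class 1
                 [0.3, 0.3, 0.4]])     # predicted class 2
label = np.array([0, 2, 2])
num = pred.shape[-1]

pred_label = one_hot(pred.argmax(axis=-1), num)   # shape (3, 3)
label_oh = one_hot(label, num)                    # shape (3, 3)

# summing over the sample axis gives one count per class
true_pos = (pred_label * label_oh).sum(0)          # [1, 0, 1]
false_pos = (pred_label * (1 - label_oh)).sum(0)   # [0, 1, 0]
false_neg = ((1 - pred_label) * label_oh).sum(0)   # [0, 0, 1]
print(true_pos, false_pos, false_neg)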
- def matthewscc(self): + def binary_matthewscc(self): """Calculate the Matthew's Correlation Coefficent""" if not self.total_examples: return 0. @@ -703,6 +707,7 @@ def reset_stats(self): @register +@use_np class F1(EvalMetric): """Computes the F1 score of a binary classification problem. @@ -791,6 +796,7 @@ def reset(self): @register +@use_np class Fbeta(F1): """Computes the Fbeta score of a binary classification problem. @@ -851,6 +857,7 @@ def __init__(self, name='fbeta', @register +@use_np class BinaryAccuracy(EvalMetric): """Computes the accuracy of a binary or multilabel classification problem. @@ -899,11 +906,11 @@ def update(self, labels, preds): for label, pred_label in zip(labels, preds): pred_label = predict_with_threshold(pred_label, self.threshold) - pred_label = pred_label.asnumpy().astype('int32') - label = label.asnumpy().astype('int32') + pred_label = pred_label.as_np_ndarray().astype('int32') + label = label.as_np_ndarray().astype('int32') # flatten before checking shapes to avoid shape miss match - label = label.flat - pred_label = pred_label.flat + label = label.reshape(-1) + pred_label = pred_label.reshape(-1) check_label_shapes(label, pred_label) @@ -913,6 +920,7 @@ def update(self, labels, preds): @register +@use_np class MCC(EvalMetric): """Computes the Matthews Correlation Coefficient of a binary classification problem. @@ -993,7 +1001,7 @@ def update(self, labels, preds): for label, pred in zip(labels, preds): self._metrics.update_stats(label, pred) - self.sum_metric = self._metrics.matthewscc() * self._metrics.total_examples + self.sum_metric = self._metrics.binary_matthewscc() * self._metrics.total_examples self.num_inst = self._metrics.total_examples def reset(self): @@ -1112,6 +1120,7 @@ def get(self): @register +@use_np class MAE(EvalMetric): """Computes Mean Absolute Error (MAE) loss. @@ -1160,8 +1169,8 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred in zip(labels, preds): - label = label.asnumpy() - pred = pred.asnumpy() + label = label.as_np_ndarray() + pred = pred.as_np_ndarray() num_inst = label.shape[0] mae = numpy.abs(label - pred).reshape(num_inst, -1).mean(axis=-1).sum() @@ -1171,6 +1180,7 @@ def update(self, labels, preds): @register +@use_np class MSE(EvalMetric): """Computes Mean Squared Error (MSE) loss. @@ -1218,8 +1228,8 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred in zip(labels, preds): - label = label.asnumpy() - pred = pred.asnumpy() + label = label.as_np_ndarray() + pred = pred.as_np_ndarray() num_inst = label.shape[0] mse = ((label - pred)**2.0).reshape(num_inst, -1).mean(axis=-1).sum() @@ -1229,6 +1239,7 @@ def update(self, labels, preds): @register +@use_np class RMSE(MSE): """Computes Root Mean Squred Error (RMSE) loss. @@ -1270,6 +1281,7 @@ def get(self): @register +@use_np class MeanPairwiseDistance(EvalMetric): """Computes Mean Pairwise Distance. @@ -1320,8 +1332,8 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred in zip(labels, preds): - label = label.asnumpy() - pred = pred.asnumpy() + label = label.as_np_ndarray() + pred = pred.as_np_ndarray() label = label.reshape(label.shape[0], -1) pred = pred.reshape(pred.shape[0], -1) @@ -1335,6 +1347,7 @@ def update(self, labels, preds): @register +@use_np class MeanCosineSimilarity(EvalMetric): """Computes Mean Cosine Similarity. 
@@ -1385,8 +1398,8 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred in zip(labels, preds): - label = label.asnumpy() - pred = pred.asnumpy() + label = label.as_np_ndarray() + pred = pred.as_np_ndarray() if len(label.shape) == 1: label = label.reshape(1, label.shape[0]) @@ -1398,13 +1411,14 @@ def update(self, labels, preds): n_l = numpy.linalg.norm(label, axis=-1) sim = sim / numpy.maximum(n_l * n_p, self.eps) sim = sim.sum() - num_inst = numpy.prod(label.shape[:-1]) + num_inst = len(label.reshape(-1, label.shape[-1])) # numpy.prod(label.shape[:-1]) is not supported self.sum_metric += sim self.num_inst += num_inst @register @alias('ce') +@use_np class CrossEntropy(EvalMetric): """Computes Cross Entropy loss. @@ -1461,10 +1475,10 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred in zip(labels, preds): - label = label.asnumpy() - pred = pred.asnumpy() + label = label.as_np_ndarray() + pred = pred.as_np_ndarray() - label = label.ravel() + label = label.reshape(-1) assert label.shape[0] == pred.shape[0] prob = pred[numpy.arange(label.shape[0]), numpy.int64(label)] @@ -1474,6 +1488,7 @@ def update(self, labels, preds): @register @alias('nll_loss') +@use_np class NegativeLogLikelihood(EvalMetric): """Computes the negative log-likelihood loss. @@ -1530,10 +1545,10 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred in zip(labels, preds): - label = label.asnumpy() - pred = pred.asnumpy() + label = label.as_np_ndarray() + pred = pred.as_np_ndarray() - label = label.ravel() + label = label.reshape(-1) num_examples = pred.shape[0] assert label.shape[0] == num_examples, (label.shape[0], num_examples) prob = pred[numpy.arange(num_examples, dtype=numpy.int64), numpy.int64(label)] @@ -1544,6 +1559,7 @@ def update(self, labels, preds): @register @alias('pearsonr') +@use_np class PearsonCorrelation(EvalMetric): """Computes Pearson correlation. @@ -1616,8 +1632,8 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred in zip(labels, preds): check_label_shapes(label, pred, False, True) - label = label.asnumpy().ravel().astype(numpy.float64) - pred = pred.asnumpy().ravel().astype(numpy.float64) + label = label.as_np_ndarray().reshape(-1).astype(numpy.float64) + pred = pred.as_np_ndarray().reshape(-1).astype(numpy.float64) self.num_inst += 1 self._label_nums, self._mean_l, self._sse_l = \ @@ -1635,6 +1651,7 @@ def get(self): return (self.name, pearsonr) @register +@use_np class PCC(EvalMetric): """PCC is a multiclass equivalent for the Matthews correlation coefficient derived from a discrete solution to the Pearson correlation coefficient. @@ -1706,7 +1723,8 @@ def _calc_mcc(self, cmat): cov_yy = numpy.sum(y * (n - y)) if cov_xx == 0 or cov_yy == 0: return float('nan') - i = cmat.diagonal() + # i = cmat.diagonal() # mxnet.numpy.ndarray.diagonal() is currently not available. 
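# (Illustrative aside, not part of the patch: indexing with two aligned
# arange() index arrays selects cmat[0, 0], cmat[1, 1], ..., cmat[k-1, k-1],
# i.e. the confusion-matrix diagonal, so the fancy-indexing line below is
# equivalent to numpy's cmat.diagonal(); e.g. for cmat = [[5, 1], [2, 7]]
# it yields [5, 7].)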
+ i = cmat[numpy.arange(self.k), numpy.arange(self.k)] cov_xy = numpy.sum(i * n - x * y) return cov_xy / (cov_xx * cov_yy) ** 0.5 @@ -1725,13 +1743,13 @@ def update(self, labels, preds): # update the confusion matrix for label, pred in zip(labels, preds): - label = label.astype('int32', copy=False).asnumpy() - pred = pred.asnumpy() + label = label.astype('int32', copy=False).as_np_ndarray() + pred = pred.as_np_ndarray() if pred.shape != label.shape: - pred = pred.argmax(axis=1) + pred = pred.argmax(axis=1).astype(label, copy=False) else: pred = pred.astype('int32', copy=False) - n = max(pred.max(), label.max()) + n = int(max(pred.max(), label.max())) if n >= self.k: self._grow(n + 1 - self.k) bcm = numpy.zeros((self.k, self.k)) @@ -1800,6 +1818,7 @@ def __init__(self, name='caffe', @register +@use_np class CustomMetric(EvalMetric): """Computes a customized evaluation metric. @@ -1863,8 +1882,8 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for pred, label in zip(preds, labels): - label = label.asnumpy() - pred = pred.asnumpy() + label = label.as_np_ndarray() + pred = pred.as_np_ndarray() reval = self._feval(label, pred) if isinstance(reval, tuple): diff --git a/tests/python/unittest/test_metric.py b/tests/python/unittest/test_metric.py index 81f57f0eed6d..0a6d48f3f2b4 100644 --- a/tests/python/unittest/test_metric.py +++ b/tests/python/unittest/test_metric.py @@ -16,6 +16,7 @@ # under the License. import mxnet as mx +mx.npx.set_np() import numpy as np import scipy from scipy.stats import pearsonr @@ -352,7 +353,7 @@ def test_pcc(): # * order # * batch size # * update frequency - labels = [ [ i ] for i in labels[0] ] + labels = [ [ i.reshape(-1) ] for i in labels[0] ] labels.reverse() preds = [ [ i.reshape((1, -1)) ] for i in preds[0] ] preds.reverse() From 59d98b36307bab31a9ffbf616b2f88235d52ee94 Mon Sep 17 00:00:00 2001 From: acphile Date: Sat, 25 Apr 2020 03:24:33 +0000 Subject: [PATCH 11/24] fix sanity --- python/mxnet/gluon/metric.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/python/mxnet/gluon/metric.py b/python/mxnet/gluon/metric.py index 5b83feb1534a..1099cc901ef7 100644 --- a/python/mxnet/gluon/metric.py +++ b/python/mxnet/gluon/metric.py @@ -534,7 +534,7 @@ def predict_with_threshold(pred, threshold=0.5): def one_hot(idx, num): - return (numpy.arange(num).astype(idx) == idx[:,None]).astype('int32') + return (numpy.arange(num).astype(idx) == idx[:, None]).astype('int32') @use_np @@ -572,7 +572,8 @@ def _set(self, num): self.false_positives = numpy.zeros(num) self.true_negatives = numpy.zeros(num) else: - assert self.num_classes == num, "Input number of classes has changed from {} to {}".format(self.num_classes, num) + assert self.num_classes == num, \ + "Input number of classes has changed from {} to {}".format(self.num_classes, num) def update_stats(self, label, pred): """Update various binary classification counts for a single (label, pred) pair. @@ -610,11 +611,13 @@ def update_stats(self, label, pred): elif self.class_type == "multilabel": num = pred.shape[-1] self._set(num) - assert pred.shape == label.shape, "The shape of label should be same as that of prediction for multilabel classification." + assert pred.shape == label.shape, \ + "The shape of label should be same as that of prediction for multilabel classification." pred_label = predict_with_threshold(pred, self.threshold).reshape(-1, num) label = label.reshape(-1, num) else: - raise ValueError("Wrong class_type {}! 
Only supports ['binary', 'multiclass', 'multilabel']".format(self.class_type)) + raise ValueError( + "Wrong class_type {}! Only supports ['binary', 'multiclass', 'multilabel']".format(self.class_type)) check_label_shapes(label, pred_label) @@ -642,7 +645,8 @@ def precision(self): @property def micro_precision(self): if self.num_classes is not None: - return self.true_positives.sum() / numpy.maximum(self.true_positives.sum() + self.false_positives.sum(), 1e-12) + return self.true_positives.sum() / \ + numpy.maximum(self.true_positives.sum() + self.false_positives.sum(), 1e-12) else: return 0. @@ -656,13 +660,15 @@ def recall(self): @property def micro_recall(self): if self.num_classes is not None: - return self.true_positives.sum() / numpy.maximum(self.true_positives.sum() + self.false_negatives.sum(), 1e-12) + return self.true_positives.sum() / \ + numpy.maximum(self.true_positives.sum() + self.false_negatives.sum(), 1e-12) else: return 0. @property def fscore(self): - return (1 + self.beta ** 2) * self.precision * self.recall / numpy.maximum(self.beta ** 2 * self.precision + self.recall, 1e-12) + return (1 + self.beta ** 2) * self.precision * self.recall / \ + numpy.maximum(self.beta ** 2 * self.precision + self.recall, 1e-12) @property def micro_fscore(self): @@ -744,7 +750,7 @@ class F1(EvalMetric): average : str, default 'micro' Strategy to be used for aggregating across mini-batches. "macro": Calculate metrics for each label and return unweighted mean of f1. - "micro": Calculate metrics globally by counting the total true positives, false negatives and false positives. + "micro": Calculate metrics globally by counting the total TP, FN and FP. None: Return f1 scores for each class (numpy.ndarray) . Examples @@ -835,7 +841,7 @@ class Fbeta(F1): average : str, default 'micro' Strategy to be used for aggregating across mini-batches. "macro": Calculate metrics for each label and return unweighted mean of f1. - "micro": Calculate metrics globally by counting the total true positives, false negatives and false positives. + "micro": Calculate metrics globally by counting the total TP, FN and FP. None: Return f1 scores for each class. 
Examples @@ -850,9 +856,9 @@ class Fbeta(F1): def __init__(self, name='fbeta', output_names=None, label_names=None, class_type="binary", beta=1, threshold=0.5, average="micro"): - super(Fbeta, self).__init__(name=name, - output_names=output_names, label_names=label_names, - class_type=class_type, threshold=threshold, average=average) + super(Fbeta, self).__init__( + name=name, output_names=output_names, label_names=label_names, + class_type=class_type, threshold=threshold, average=average) self.metrics = _ClassificationMetrics(class_type=class_type, threshold=threshold, beta=beta) From 40e87e3139314ef9a5d9c26cc6b7e249f79e1551 Mon Sep 17 00:00:00 2001 From: acphile Date: Mon, 27 Apr 2020 05:53:49 +0000 Subject: [PATCH 12/24] rewrite ce and ppl, improve some details --- python/mxnet/gluon/metric.py | 221 +++++++++++++-------------- tests/python/unittest/test_metric.py | 16 +- 2 files changed, 112 insertions(+), 125 deletions(-) diff --git a/python/mxnet/gluon/metric.py b/python/mxnet/gluon/metric.py index 1099cc901ef7..f750d2087fa1 100644 --- a/python/mxnet/gluon/metric.py +++ b/python/mxnet/gluon/metric.py @@ -162,7 +162,15 @@ def get(self): if self.num_inst == 0: return (self.name, float('nan')) else: - return (self.name, self.sum_metric / self.num_inst) + res = self.sum_metric / self.num_inst + if isinstance(res, numpy.ndarray) and len(res.shape) == 0: + """ + currently calling ' c = mxnet.numpy.array([1,2,3]).sum() ' would get + ' array(6.) ', a ndarray with shape () + In this case, returning a 'float' in .get() is more explicit. + """ + res = res.item() + return (self.name, res) def get_name_value(self): """Returns zipped name and value pairs. @@ -590,7 +598,7 @@ def update_stats(self, label, pred): label = label.as_np_ndarray().astype('int32') if self.class_type == "binary": self._set(1) - if len(numpy.unique(label)) > 2: + if label.max() > 1: raise ValueError("Wrong label for binary classification.") if pred.shape == label.shape: pass @@ -1017,109 +1025,6 @@ def reset(self): self._metrics.reset_stats() -@register -class Perplexity(EvalMetric): - """Computes perplexity. - - Perplexity is a measurement of how well a probability distribution - or model predicts a sample. A low perplexity indicates the model - is good at predicting the sample. - - The perplexity of a model q is defined as - - .. math:: - b^{\\big(-\\frac{1}{N} \\sum_{i=1}^N \\log_b q(x_i) \\big)} - = \\exp \\big(-\\frac{1}{N} \\sum_{i=1}^N \\log q(x_i)\\big) - - where we let `b = e`. - - :math:`q(x_i)` is the predicted value of its ground truth - label on sample :math:`x_i`. - - For example, we have three samples :math:`x_1, x_2, x_3` and their labels - are :math:`[0, 1, 1]`. - Suppose our model predicts :math:`q(x_1) = p(y_1 = 0 | x_1) = 0.3` - and :math:`q(x_2) = 1.0`, - :math:`q(x_3) = 0.6`. The perplexity of model q is - :math:`exp\\big(-(\\log 0.3 + \\log 1.0 + \\log 0.6) / 3\\big) = 1.77109762852`. - - Parameters - ---------- - ignore_label : int or None - Index of invalid label to ignore when - counting. By default, sets to -1. - If set to `None`, it will include all entries. - axis : int (default -1) - The axis from prediction that was used to - compute softmax. By default use the last - axis. - name : str - Name of this metric instance for display. - output_names : list of str, or None - Name of predictions that should be used when updating with update_dict. - By default include all predictions. - label_names : list of str, or None - Name of labels that should be used when updating with update_dict. 
- By default include all labels. - - Examples - -------- - >>> predicts = [mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] - >>> labels = [mx.nd.array([0, 1, 1])] - >>> perp = mx.gluon.metric.Perplexity(ignore_label=None) - >>> perp.update(labels, predicts) - >>> print perp.get() - ('Perplexity', 1.7710976285155853) - """ - def __init__(self, ignore_label, axis=-1, name='perplexity', - output_names=None, label_names=None): - super(Perplexity, self).__init__( - name, ignore_label=ignore_label, - output_names=output_names, label_names=label_names) - self.ignore_label = ignore_label - self.axis = axis - - def update(self, labels, preds): - """Updates the internal evaluation result. - - Parameters - ---------- - labels : list of `NDArray` - The labels of the data. - - preds : list of `NDArray` - Predicted values. - """ - assert len(labels) == len(preds) - loss = 0. - num = 0 - for label, pred in zip(labels, preds): - assert label.size == pred.size/pred.shape[-1], \ - "shape mismatch: %s vs. %s"%(label.shape, pred.shape) - label = label.as_in_context(pred.context).reshape((label.size,)) - pred = ndarray.pick(pred, label.astype(dtype='int32'), axis=self.axis) - if self.ignore_label is not None: - ignore = (label == self.ignore_label).astype(pred.dtype) - num -= ndarray.sum(ignore).asscalar() - pred = pred*(1-ignore) + ignore - loss -= ndarray.sum(ndarray.log(ndarray.maximum(1e-10, pred))).asscalar() - num += pred.size - self.sum_metric += loss - self.num_inst += num - - def get(self): - """Returns the current evaluation result. - - Returns - ------- - Tuple of (str, float) - Representing name of the metric and evaluation result. - """ - if self.num_inst == 0: - return (self.name, float('nan')) - else: - return (self.name, math.exp(self.sum_metric/self.num_inst)) - #################### # REGRESSION METRICS #################### @@ -1439,9 +1344,13 @@ class :math:`k`. Parameters ---------- - eps : float - Cross Entropy loss is undefined for predicted value is 0 or 1, - so predicted values are added with the small constant. + ignore_label : int or None, default None + Index of invalid label to ignore when + counting. By default, sets to -1. + If set to `None`, it will include all entries. + axis : int (default -1) + The axis from prediction that was used to + compute softmax. By default use the last axis. name : str Name of this metric instance for display. output_names : list of str, or None @@ -1460,12 +1369,12 @@ class :math:`k`. >>> print ce.get() ('cross-entropy', 0.57159948348999023) """ - def __init__(self, eps=1e-12, name='cross-entropy', + def __init__(self, ignore_label=None, axis=-1, name='cross-entropy', output_names=None, label_names=None): super(CrossEntropy, self).__init__( - name, eps=eps, - output_names=output_names, label_names=label_names) - self.eps = eps + name, output_names=output_names, label_names=label_names) + self.ignore_label = ignore_label + self.axis = axis def update(self, labels, preds): """Updates the internal evaluation result. @@ -1480,17 +1389,91 @@ def update(self, labels, preds): """ labels, preds = check_label_shapes(labels, preds, True) + loss = 0. + num = 0 for label, pred in zip(labels, preds): + assert label.size == pred.size/pred.shape[-1], \ + "shape mismatch: %s vs. 
%s"%(label.shape, pred.shape) + label = label.as_in_context(pred.context).reshape((label.size,)) + pred = ndarray.pick(pred, label.astype(dtype='int32'), axis=self.axis) label = label.as_np_ndarray() pred = pred.as_np_ndarray() + if self.ignore_label is not None: + ignore = (label == self.ignore_label).astype(pred.dtype) + num -= ignore.sum() + pred = pred * (1 - ignore) + ignore + loss -= numpy.log(numpy.maximum(1e-12, pred)).sum() + num += pred.size + self.sum_metric += loss + self.num_inst += num - label = label.reshape(-1) - assert label.shape[0] == pred.shape[0] - prob = pred[numpy.arange(label.shape[0]), numpy.int64(label)] - cross_entropy = (-numpy.log(prob + self.eps)).sum() - self.sum_metric += cross_entropy - self.num_inst += label.shape[0] +@register +@use_np +class Perplexity(CrossEntropy): + """Computes perplexity. + + Perplexity is a measurement of how well a probability distribution + or model predicts a sample. A low perplexity indicates the model + is good at predicting the sample. + + The perplexity of a model q is defined as + + .. math:: + b^{\\big(-\\frac{1}{N} \\sum_{i=1}^N \\log_b q(x_i) \\big)} + = \\exp \\big(-\\frac{1}{N} \\sum_{i=1}^N \\log q(x_i)\\big) + + where we let `b = e`. + + :math:`q(x_i)` is the predicted value of its ground truth + label on sample :math:`x_i`. + + For example, we have three samples :math:`x_1, x_2, x_3` and their labels + are :math:`[0, 1, 1]`. + Suppose our model predicts :math:`q(x_1) = p(y_1 = 0 | x_1) = 0.3` + and :math:`q(x_2) = 1.0`, + :math:`q(x_3) = 0.6`. The perplexity of model q is + :math:`exp\\big(-(\\log 0.3 + \\log 1.0 + \\log 0.6) / 3\\big) = 1.77109762852`. + + Parameters + ---------- + ignore_label : int or None, default None + Index of invalid label to ignore when + counting. By default, sets to -1. + If set to `None`, it will include all entries. + axis : int (default -1) + The axis from prediction that was used to + compute softmax. By default use the last axis. + name : str + Name of this metric instance for display. + output_names : list of str, or None + Name of predictions that should be used when updating with update_dict. + By default include all predictions. + label_names : list of str, or None + Name of labels that should be used when updating with update_dict. + By default include all labels. 
+ + Examples + -------- + >>> predicts = [mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]])] + >>> labels = [mx.nd.array([0, 1, 1])] + >>> perp = mx.gluon.metric.Perplexity(ignore_label=None) + >>> perp.update(labels, predicts) + >>> print perp.get() + ('Perplexity', 1.7710976285155853) + """ + def __init__(self, ignore_label=None, axis=-1, name='perplexity', + output_names=None, label_names=None): + super(Perplexity, self).__init__( + name=name, ignore_label=ignore_label, axis=axis, + output_names=output_names, label_names=label_names) + + def get(self): + if self.num_inst == 0: + return (self.name, float('nan')) + else: + return (self.name, math.exp(self.sum_metric/self.num_inst)) + @register @alias('nll_loss') @@ -1654,7 +1637,7 @@ def get(self): n = self._label_nums pearsonr = self._conv / ((n-1) * numpy.sqrt(self._sse_p / (n - 1)) * numpy.sqrt(self._sse_l / (n - 1))) - return (self.name, pearsonr) + return (self.name, float(pearsonr)) @register @use_np diff --git a/tests/python/unittest/test_metric.py b/tests/python/unittest/test_metric.py index 665feb39ceee..a9f7e4c1b568 100644 --- a/tests/python/unittest/test_metric.py +++ b/tests/python/unittest/test_metric.py @@ -36,7 +36,7 @@ def test_metrics(): check_metric('acc', axis=0) check_metric('f1') check_metric('mcc') - check_metric('perplexity', -1) + check_metric('perplexity', axis=-1) check_metric('pearsonr') check_metric('pcc') check_metric('nll_loss') @@ -60,7 +60,7 @@ def test_acc(): metric.update([label], [pred]) _, acc = metric.get() expected_acc = (np.argmax(pred, axis=1) == label).sum().asscalar() / label.size - assert acc == expected_acc + np.testing.assert_almost_equal(acc, expected_acc) def test_acc_2d_label(): # label maybe provided in 2d arrays in custom data iterator @@ -71,7 +71,7 @@ def test_acc_2d_label(): _, acc = metric.get() expected_acc = (np.argmax(pred, axis=1).asnumpy() == label.asnumpy().ravel()).sum() / \ float(label.asnumpy().ravel().size) - assert acc == expected_acc + np.testing.assert_almost_equal(acc, expected_acc) def test_loss_update(): pred = mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]]) @@ -181,7 +181,7 @@ def test_multilabel_f1(): macroF1.update([label], [pred]) microF1.update([label], [pred]) assert macroF1.get()[1] == 0.5 # one class is 1.0, the other is 0. 
(divided by 0) - assert microF1.get()[1] == 2.0 / 3 + np.testing.assert_almost_equal(microF1.get()[1], 2.0 / 3) macroF1.reset() microF1.reset() @@ -244,10 +244,10 @@ def test_perplexity(): label = mx.nd.array([0, 1, 1]) p = pred.asnumpy()[np.arange(label.size), label.asnumpy().astype('int32')] perplexity_expected = np.exp(-np.log(p).sum()/label.size) - metric = mx.gluon.metric.create('perplexity', -1) + metric = mx.gluon.metric.create('perplexity', axis=-1) metric.update([label], [pred]) _, perplexity = metric.get() - assert perplexity == perplexity_expected + np.testing.assert_almost_equal(perplexity, perplexity_expected) def test_pearsonr(): pred1 = mx.nd.array([[0.3, 0.7], [0, 1.], [0.4, 0.6]]) @@ -383,3 +383,7 @@ def test_single_array_input(): rmse.get() _, rmse_res = rmse.get() np.testing.assert_almost_equal(rmse_res, 0.1) + +if __name__ == '__main__': + import nose + nose.runmodule() From 5e153e12278c2d07096556f6e3131a8132f36968 Mon Sep 17 00:00:00 2001 From: acphile Date: Mon, 27 Apr 2020 11:09:20 +0000 Subject: [PATCH 13/24] use mxnet.numpy.float64 --- python/mxnet/gluon/metric.py | 127 +++++++++++++++++------------------ 1 file changed, 63 insertions(+), 64 deletions(-) diff --git a/python/mxnet/gluon/metric.py b/python/mxnet/gluon/metric.py index f750d2087fa1..2679cda2cda2 100644 --- a/python/mxnet/gluon/metric.py +++ b/python/mxnet/gluon/metric.py @@ -164,11 +164,9 @@ def get(self): else: res = self.sum_metric / self.num_inst if isinstance(res, numpy.ndarray) and len(res.shape) == 0: - """ - currently calling ' c = mxnet.numpy.array([1,2,3]).sum() ' would get - ' array(6.) ', a ndarray with shape () - In this case, returning a 'float' in .get() is more explicit. - """ + # currently calling ' c = mxnet.numpy.array([1,2,3]).sum() ' would get + # ' array(6.) ', a ndarray with shape () + # In this case, returning a 'float' in .get() is more explicit. res = res.item() return (self.name, res) @@ -432,7 +430,7 @@ def update(self, labels, preds): check_label_shapes(label, pred_label) - num_correct = (pred_label == label).sum() + num_correct = (pred_label == label).sum().astype('float64') self.sum_metric += num_correct self.num_inst += len(pred_label) @@ -509,19 +507,19 @@ def update(self, labels, preds): num_dims = len(pred_label.shape) if num_dims == 1: num_correct = (pred_label.reshape(-1) == label.reshape(-1)).sum() - self.sum_metric += num_correct + self.sum_metric += num_correct.astype('float64') elif num_dims == 2: num_classes = pred_label.shape[1] top_k = min(num_classes, self.top_k) for j in range(top_k): num_correct = (pred_label[:, num_classes - 1 - j].reshape(-1) == label.reshape(-1)).sum() - self.sum_metric += num_correct + self.sum_metric += num_correct.astype('float64') self.num_inst += num_samples def predict_with_threshold(pred, threshold=0.5): """Do thresholding of predictions in binary and multilabel cases. - + Parameters ---------- preds : ndarray @@ -532,38 +530,38 @@ def predict_with_threshold(pred, threshold=0.5): """ if isinstance(threshold, float): return pred > threshold - elif isinstance(threshold, numpy.ndarray) or isinstance(threshold, ndarray.ndarray.NDArray): + elif isinstance(threshold, (numpy.ndarray, ndarray.ndarray.NDArray)): num_classes = pred.shape[-1] assert threshold.shape[-1] == num_classes, \ "shape mismatch: %s vs. 
%s"%(pred.shape[-1], threshold.shape[-1]) - return pred > threshold + return pred > threshold else: raise ValueError("{} is a wrong type for threshold!".format(type(threshold))) - + def one_hot(idx, num): return (numpy.arange(num).astype(idx) == idx[:, None]).astype('int32') - -@use_np + +@use_np class _ClassificationMetrics(object): """Private container class for classification metric statistics. True/false positive and true/false negative counts are sufficient statistics for various classification metrics. This class provides the machinery to track those statistics across mini-batches of (label, prediction) pairs. - + Parameters ---------- class_type : str, default "binary" "binary": f1 for binary classification. "multiclass": f1 for multiclassification problem. - "multilabel": f1 for multilabel classification. + "multilabel": f1 for multilabel classification. beta : float, default 1 - weight of precision in harmonic mean. + weight of precision in harmonic mean. threshold : float, default 0.5 threshold for deciding whether the predictions are positive or negative. - + """ def __init__(self, class_type="binary", threshold=0.5, beta=1): @@ -575,14 +573,14 @@ def __init__(self, class_type="binary", threshold=0.5, beta=1): def _set(self, num): if self.num_classes is None: self.num_classes = num - self.true_positives = numpy.zeros(num) - self.false_negatives = numpy.zeros(num) - self.false_positives = numpy.zeros(num) - self.true_negatives = numpy.zeros(num) + self.true_positives = numpy.zeros(num, dtype='float64') + self.false_negatives = numpy.zeros(num, dtype='float64') + self.false_positives = numpy.zeros(num, dtype='float64') + self.true_negatives = numpy.zeros(num, dtype='float64') else: assert self.num_classes == num, \ "Input number of classes has changed from {} to {}".format(self.num_classes, num) - + def update_stats(self, label, pred): """Update various binary classification counts for a single (label, pred) pair. @@ -605,17 +603,17 @@ def update_stats(self, label, pred): elif pred.shape[-1] > 2: raise ValueError("The shape of prediction {} is wrong for binary classification.".format(pred.shape)) elif pred.shape[-1] == 2: - pred = pred.reshape(-1, 2)[:, 1] + pred = pred.reshape(-1, 2)[:, 1] pred_label = predict_with_threshold(pred, self.threshold).reshape(-1) label = label.reshape(-1) - + elif self.class_type == "multiclass": num = pred.shape[-1] self._set(num) assert label.max() < num, "pred contains fewer classes than label!" - pred_label = one_hot(pred.argmax(axis=-1).reshape(-1), num) + pred_label = one_hot(pred.argmax(axis=-1).reshape(-1), num) label = one_hot(label.reshape(-1), num) - + elif self.class_type == "multilabel": num = pred.shape[-1] self._set(num) @@ -626,9 +624,9 @@ def update_stats(self, label, pred): else: raise ValueError( "Wrong class_type {}! Only supports ['binary', 'multiclass', 'multilabel']".format(self.class_type)) - + check_label_shapes(label, pred_label) - + pred_true = (pred_label == 1) pred_false = (pred_label == 0) label_true = (label == 1) @@ -657,7 +655,7 @@ def micro_precision(self): numpy.maximum(self.true_positives.sum() + self.false_positives.sum(), 1e-12) else: return 0. - + @property def recall(self): if self.num_classes is not None: @@ -672,7 +670,7 @@ def micro_recall(self): numpy.maximum(self.true_positives.sum() + self.false_negatives.sum(), 1e-12) else: return 0. 
- + @property def fscore(self): return (1 + self.beta ** 2) * self.precision * self.recall / \ @@ -685,7 +683,7 @@ def micro_fscore(self): (self.beta ** 2 * self.micro_precision + self.micro_recall) else: return 0. - + def binary_matthewscc(self): """Calculate the Matthew's Correlation Coefficent""" if not self.total_examples: @@ -752,14 +750,14 @@ class F1(EvalMetric): class_type : str, default "binary" "binary": f1 for binary classification. "multiclass": f1 for multiclassification problem. - "multilabel": f1 for multilabel classification. + "multilabel": f1 for multilabel classification. threshold : float, default 0.5 threshold for postive confidence value. average : str, default 'micro' Strategy to be used for aggregating across mini-batches. "macro": Calculate metrics for each label and return unweighted mean of f1. "micro": Calculate metrics globally by counting the total TP, FN and FP. - None: Return f1 scores for each class (numpy.ndarray) . + None: Return f1 scores for each class (numpy.ndarray) . Examples -------- @@ -795,13 +793,13 @@ def update(self, labels, preds): self.metrics.update_stats(label, pred) if self.average == "micro": - self.sum_metric = self.metrics.micro_fscore * self.metrics.total_examples + self.sum_metric = self.metrics.micro_fscore * self.metrics.total_examples elif self.average == "macro": - self.sum_metric = self.metrics.fscore.mean() * self.metrics.total_examples + self.sum_metric = self.metrics.fscore.mean() * self.metrics.total_examples else: - self.sum_metric = self.metrics.fscore * self.metrics.total_examples - self.num_inst = self.metrics.total_examples - + self.sum_metric = self.metrics.fscore * self.metrics.total_examples + self.num_inst = self.metrics.total_examples + def reset(self): """Resets the internal evaluation result to initial state.""" self.sum_metric = 0. @@ -841,16 +839,16 @@ class Fbeta(F1): class_type : str, default "binary" "binary": f1 for binary classification. "multiclass": f1 for multiclassification problem. - "multilabel": f1 for multilabel classification. + "multilabel": f1 for multilabel classification. beta : float, default 1 - weight of precision in harmonic mean. + weight of precision in harmonic mean. threshold : float, default 0.5 threshold for postive confidence value. average : str, default 'micro' Strategy to be used for aggregating across mini-batches. "macro": Calculate metrics for each label and return unweighted mean of f1. "micro": Calculate metrics globally by counting the total TP, FN and FP. - None: Return f1 scores for each class. + None: Return f1 scores for each class. 
Examples -------- @@ -865,10 +863,10 @@ class Fbeta(F1): def __init__(self, name='fbeta', output_names=None, label_names=None, class_type="binary", beta=1, threshold=0.5, average="micro"): super(Fbeta, self).__init__( - name=name, output_names=output_names, label_names=label_names, + name=name, output_names=output_names, label_names=label_names, class_type=class_type, threshold=threshold, average=average) self.metrics = _ClassificationMetrics(class_type=class_type, threshold=threshold, beta=beta) - + @register @use_np @@ -919,7 +917,7 @@ def update(self, labels, preds): for label, pred_label in zip(labels, preds): pred_label = predict_with_threshold(pred_label, self.threshold) - + pred_label = pred_label.as_np_ndarray().astype('int32') label = label.as_np_ndarray().astype('int32') # flatten before checking shapes to avoid shape miss match @@ -928,11 +926,11 @@ def update(self, labels, preds): check_label_shapes(label, pred_label) - num_correct = (pred_label == label).sum() + num_correct = (pred_label == label).sum().astype('float64') self.sum_metric += num_correct self.num_inst += len(pred_label) - - + + @register @use_np class MCC(EvalMetric): @@ -1065,7 +1063,7 @@ def __init__(self, name='mae', output_names=None, label_names=None): super(MAE, self).__init__( name, output_names=output_names, label_names=label_names) - + def update(self, labels, preds): """Updates the internal evaluation result. @@ -1085,7 +1083,7 @@ def update(self, labels, preds): num_inst = label.shape[0] mae = numpy.abs(label - pred).reshape(num_inst, -1).mean(axis=-1).sum() - + self.sum_metric += mae self.num_inst += num_inst @@ -1124,7 +1122,7 @@ def __init__(self, name='mse', output_names=None, label_names=None): super(MSE, self).__init__( name, output_names=output_names, label_names=label_names) - + def update(self, labels, preds): """Updates the internal evaluation result. @@ -1144,7 +1142,7 @@ def update(self, labels, preds): num_inst = label.shape[0] mse = ((label - pred)**2.0).reshape(num_inst, -1).mean(axis=-1).sum() - + self.sum_metric += mse self.num_inst += num_inst @@ -1183,12 +1181,12 @@ def __init__(self, name='rmse', output_names=None, label_names=None): super(RMSE, self).__init__( name, output_names=output_names, label_names=label_names) - + def get(self): if self.num_inst == 0: return (self.name, float('nan')) else: - return (self.name, math.sqrt(self.sum_metric / self.num_inst)) + return (self.name, math.sqrt(self.sum_metric / self.num_inst)) @register @@ -1228,7 +1226,7 @@ def __init__(self, name='mpd', super(MeanPairwiseDistance, self).__init__( name, output_names=output_names, label_names=label_names) self.p = p - + def update(self, labels, preds): """Updates the internal evaluation result. @@ -1255,7 +1253,7 @@ def update(self, labels, preds): self.sum_metric += dis self.num_inst += num_inst - + @register @use_np @@ -1265,9 +1263,9 @@ class MeanCosineSimilarity(EvalMetric): The mean cosine similarity is given by .. math:: - cos\_sim(label, pred) = \frac{{label}.{pred}}{max(||label||.||pred||, eps)} + cos_sim(label, pred) = \frac{{label}.{pred}}{max(||label||.||pred||, eps)} (calculating on the last dimension of label and pred.) - + Parameters ---------- name : str @@ -1280,6 +1278,7 @@ class MeanCosineSimilarity(EvalMetric): By default include all labels. eps : float, default 1e-8 small vale to avoid division by zero. 
+ Examples -------- >>> predicts = [mx.nd.array([[1., 0.], [1., 1.]])] @@ -1294,7 +1293,7 @@ def __init__(self, name='cos_sim', super(MeanCosineSimilarity, self).__init__( name, output_names=output_names, label_names=label_names) self.eps = eps - + def update(self, labels, preds): """Updates the internal evaluation result. @@ -1326,7 +1325,7 @@ def update(self, labels, preds): self.sum_metric += sim self.num_inst += num_inst - + @register @alias('ce') @use_np @@ -1465,7 +1464,7 @@ class Perplexity(CrossEntropy): def __init__(self, ignore_label=None, axis=-1, name='perplexity', output_names=None, label_names=None): super(Perplexity, self).__init__( - name=name, ignore_label=ignore_label, axis=axis, + name=name, ignore_label=ignore_label, axis=axis, output_names=output_names, label_names=label_names) def get(self): @@ -1591,7 +1590,7 @@ def reset(self): self._pred_nums = 0 self._label_nums = 0 self._conv = 0 - + self.num_inst = 0 self.sum_metric = 0.0 @@ -1741,7 +1740,7 @@ def update(self, labels, preds): n = int(max(pred.max(), label.max())) if n >= self.k: self._grow(n + 1 - self.k) - bcm = numpy.zeros((self.k, self.k)) + bcm = numpy.zeros((self.k, self.k), dtype='float64') for i, j in zip(pred, label): bcm[i, j] += 1 self.lcm += bcm @@ -1754,7 +1753,7 @@ def sum_metric(self): def reset(self): """Resets the internal evaluation result to initial state.""" self.num_inst = 0. - self.lcm = numpy.zeros((self.k, self.k)) + self.lcm = numpy.zeros((self.k, self.k), dtype='float64') @register From bf68c6db891ad8cddb549a4eac8508342137afc0 Mon Sep 17 00:00:00 2001 From: acphile Date: Tue, 28 Apr 2020 14:32:30 +0000 Subject: [PATCH 14/24] remove sklearn --- python/mxnet/gluon/metric.py | 4 ++-- tests/python/unittest/test_metric.py | 22 +++++++++++----------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/python/mxnet/gluon/metric.py b/python/mxnet/gluon/metric.py index 2679cda2cda2..9840f5569b93 100644 --- a/python/mxnet/gluon/metric.py +++ b/python/mxnet/gluon/metric.py @@ -707,8 +707,8 @@ def binary_matthewscc(self): def total_examples(self): if self.num_classes is None: return 0 - return self.false_negatives[0] + self.false_positives[0] + \ - self.true_negatives[0] + self.true_positives[0] + return int(self.false_negatives[0] + self.false_positives[0] + \ + self.true_negatives[0] + self.true_positives[0]) def reset_stats(self): self.num_classes = None diff --git a/tests/python/unittest/test_metric.py b/tests/python/unittest/test_metric.py index a9f7e4c1b568..af81251fa11b 100644 --- a/tests/python/unittest/test_metric.py +++ b/tests/python/unittest/test_metric.py @@ -16,7 +16,7 @@ # under the License. 
import mxnet as mx -mx.npx.set_np() +from mxnet.test_utils import use_np import numpy as np import scipy from scipy.stats import pearsonr @@ -159,11 +159,11 @@ def test_multiclass_f1(): assert microF1.num_inst == 6 assert macroF1.num_inst == 6 - from sklearn.metrics import f1_score - overall_pred = [0, 1, 2, 0, 1, 2] - overall_label = [0, 2, 1, 0, 0, 1] - fmacro = f1_score(overall_label, overall_pred, average="macro") - fmicro = f1_score(overall_label, overall_pred, average="micro") + # from sklearn.metrics import f1_score + # overall_pred = [0, 1, 2, 0, 1, 2] + # overall_label = [0, 2, 1, 0, 0, 1] + fmacro = 0.26666666666666666 #f1_score(overall_label, overall_pred, average="macro") + fmicro = 0.3333333333333333 #f1_score(overall_label, overall_pred, average="micro") np.testing.assert_almost_equal(microF1.get()[1], fmicro) np.testing.assert_almost_equal(macroF1.get()[1], fmacro) @@ -194,11 +194,11 @@ def test_multilabel_f1(): macroF1.update([label11, label12], [pred11, pred12]) assert microF1.num_inst == 3 assert macroF1.num_inst == 3 - from sklearn.metrics import f1_score - overall_pred = [[1, 0, 0], [0, 1, 1], [1, 1, 1]] - overall_label = [[1, 0, 1], [0, 0, 1], [0, 1, 1]] - fmacro = f1_score(overall_label, overall_pred, average="macro") - fmicro = f1_score(overall_label, overall_pred, average="micro") + #from sklearn.metrics import f1_score + #overall_pred = [[1, 0, 0], [0, 1, 1], [1, 1, 1]] + #overall_label = [[1, 0, 1], [0, 0, 1], [0, 1, 1]] + fmacro = 0.7111111111111111 #f1_score(overall_label, overall_pred, average="macro") + fmicro = 0.7272727272727272 #f1_score(overall_label, overall_pred, average="micro") np.testing.assert_almost_equal(microF1.get()[1], fmicro) np.testing.assert_almost_equal(macroF1.get()[1], fmacro) From 56b846e4d5f95ee77d0e6bf83bd55f629ee48593 Mon Sep 17 00:00:00 2001 From: acphile Date: Wed, 29 Apr 2020 03:57:31 +0000 Subject: [PATCH 15/24] remove reset_local() and get_global in other files --- example/ssd/train/metric.py | 11 ----------- python/mxnet/callback.py | 4 ++-- python/mxnet/module/base_module.py | 2 +- 3 files changed, 3 insertions(+), 14 deletions(-) diff --git a/example/ssd/train/metric.py b/example/ssd/train/metric.py index eeb9796bf4a8..731f8fcc19f4 100644 --- a/example/ssd/train/metric.py +++ b/example/ssd/train/metric.py @@ -39,17 +39,6 @@ def reset(self): self.num_inst = [0] * self.num self.sum_metric = [0.0] * self.num - def reset_local(self): - """ - override reset behavior - """ - if getattr(self, 'num', None) is None: - self.num_inst = 0 - self.sum_metric = 0.0 - else: - self.num_inst = [0] * self.num - self.sum_metric = [0.0] * self.num - def update(self, labels, preds): """ Implementation of updating metrics diff --git a/python/mxnet/callback.py b/python/mxnet/callback.py index 4be509270fd3..bd515707eace 100644 --- a/python/mxnet/callback.py +++ b/python/mxnet/callback.py @@ -112,7 +112,7 @@ def _callback(param): logging.info('Iter[%d] Batch[%d] Train-%s=%f', param.epoch, param.nbatch, name, value) if auto_reset: - param.eval_metric.reset_local() + param.eval_metric.reset() return _callback @@ -163,7 +163,7 @@ def __call__(self, param): if param.eval_metric is not None: name_value = param.eval_metric.get_name_value() if self.auto_reset: - param.eval_metric.reset_local() + param.eval_metric.reset() msg = 'Epoch[%d] Batch [%d-%d]\tSpeed: %.2f samples/sec' msg += '\t%s=%f'*len(name_value) logging.info(msg, param.epoch, count-self.frequent, count, speed, *sum(name_value, ())) diff --git a/python/mxnet/module/base_module.py 
b/python/mxnet/module/base_module.py index 9154aebb4b25..92fb7f188bfb 100644 --- a/python/mxnet/module/base_module.py +++ b/python/mxnet/module/base_module.py @@ -543,7 +543,7 @@ def fit(self, train_data, eval_data=None, eval_metric='acc', monitor.toc_print() if end_of_batch: - eval_name_vals = eval_metric.get_global_name_value() + eval_name_vals = eval_metric.get_name_value() if batch_end_callback is not None: batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch, From 8a437e967ee2203e7854d3b043d451ed09e6da79 Mon Sep 17 00:00:00 2001 From: acphile Date: Wed, 29 Apr 2020 06:10:45 +0000 Subject: [PATCH 16/24] fix test_mlp --- tests/python/train/test_mlp.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/python/train/test_mlp.py b/tests/python/train/test_mlp.py index 166fd8de28d8..5fc1277e95fe 100644 --- a/tests/python/train/test_mlp.py +++ b/tests/python/train/test_mlp.py @@ -35,7 +35,8 @@ def accuracy(label, pred): py = np.argmax(pred, axis=1) - return np.sum(py == label) / float(label.size) + return np.sum(py == label.astype(py)) / float(label.size) + # currently mxnet.numpy (which used in gluon.metric) did not support "==" between different types num_epoch = 4 prefix = './mlp' From b7c2b3bb67ba7ebeecaa58d36c14c87153725234 Mon Sep 17 00:00:00 2001 From: acphile Date: Wed, 29 Apr 2020 18:23:04 +0800 Subject: [PATCH 17/24] replace mx.metric with mx.gluon.metric in example --- benchmark/python/sparse/sparse_end2end.py | 2 +- example/adversary/adversary_generation.ipynb | 6 +++--- .../variational_autoencoder/VAE_example.ipynb | 2 +- example/caffe/caffe_net.py | 2 +- example/caffe/train_model.py | 2 +- example/capsnet/capsulenet.py | 2 +- example/ctc/lstm_ocr_train.py | 2 +- .../deep-embedded-clustering/autoencoder.py | 4 ++-- example/deep-embedded-clustering/dec.py | 2 +- .../gluon_mnist.py | 4 ++-- .../module_mnist.py | 2 +- .../resnet50_imagenet.py | 10 +++++----- example/distributed_training/cifar10_dist.py | 2 +- .../cifar10_kvstore_hvd.py | 4 ++-- example/fcn-xs/solver.py | 2 +- example/gluon/audio/urban_sounds/train.py | 2 +- example/gluon/dc_gan/dcgan.py | 2 +- example/gluon/image_classification.py | 2 +- example/gluon/mnist/mnist.py | 4 ++-- example/gluon/sn_gan/train.py | 2 +- .../gluon/super_resolution/super_resolution.py | 2 +- example/gluon/tree_lstm/main.py | 2 +- example/image-classification/common/fit.py | 4 ++-- example/image-classification/score.py | 4 ++-- example/image-classification/test_score.py | 4 ++-- example/kaggle-ndsb2/Train.py | 4 ++-- .../matrix_factorization/train.py | 2 +- example/module/mnist_mlp.py | 2 +- example/multi-task/multi-task-learning.ipynb | 8 ++++---- .../multivariate_time_series/src/metrics.py | 8 ++++---- .../named_entity_recognition/src/metrics.py | 10 +++++----- example/nce-loss/nce.py | 6 +++--- .../neural_collaborative_filtering/train.py | 2 +- example/quantization/imagenet_inference.py | 4 ++-- example/rcnn/symnet/metric.py | 12 ++++++------ example/rcnn/train.py | 2 +- example/rnn/bucketing/cudnn_rnn_bucketing.py | 6 +++--- example/rnn/bucketing/lstm_bucketing.py | 2 +- example/rnn/old/char-rnn.ipynb | 2 +- example/rnn/old/gru_bucketing.py | 2 +- example/rnn/old/lstm_bucketing.py | 2 +- example/rnn/old/rnn_cell_demo.py | 2 +- example/sparse/factorization_machine/metric.py | 18 +++++++++--------- example/sparse/factorization_machine/train.py | 2 +- example/sparse/linear_classification/train.py | 2 +- example/sparse/matrix_factorization/train.py | 2 +- example/sparse/wide_deep/inference.py | 2 +- 
example/sparse/wide_deep/train.py | 2 +- example/speech_recognition/stt_metric.py | 2 +- example/ssd/evaluate/eval_metric.py | 2 +- example/ssd/train/metric.py | 2 +- example/svm_mnist/svm_mnist.py | 4 ++-- .../api_usage_example/example_api_train.py | 2 +- .../api_usage_example/example_inference.py | 2 +- .../benchmarks/svrg_benchmark.ipynb | 4 ++-- .../svrg_module/linear_regression/common.py | 2 +- example/vae-gan/vaegan_mxnet.py | 8 ++++---- tests/nightly/estimator/test_estimator_cnn.py | 4 ++-- tests/nightly/estimator/test_sentiment_rnn.py | 10 +++++----- tests/nightly/test_optimizer.py | 2 +- tests/nightly/test_tlocal_racecondition.py | 2 +- tools/caffe_converter/test_converter.py | 2 +- 62 files changed, 114 insertions(+), 114 deletions(-) diff --git a/benchmark/python/sparse/sparse_end2end.py b/benchmark/python/sparse/sparse_end2end.py index d032f9d6c38e..fc949b649767 100644 --- a/benchmark/python/sparse/sparse_end2end.py +++ b/benchmark/python/sparse/sparse_end2end.py @@ -225,7 +225,7 @@ def row_sparse_pull(kv, key, data, slices, weight_array, priority): learning_rate=0.1, rescale_grad=1.0/batch_size/num_worker) mod.init_optimizer(optimizer=sgd, kvstore=kv) # use accuracy as the metric - metric = mx.metric.create('acc') + metric = mx.gluon.metric.create('acc') index = mod._exec_group.param_names.index('w') # weight_array bound to executors of the contexts diff --git a/example/adversary/adversary_generation.ipynb b/example/adversary/adversary_generation.ipynb index 76c5f4cff569..0dda371a8f41 100644 --- a/example/adversary/adversary_generation.ipynb +++ b/example/adversary/adversary_generation.ipynb @@ -168,7 +168,7 @@ "epoch = 3\n", "for e in range(epoch):\n", " train_loss = 0.\n", - " acc = mx.metric.Accuracy()\n", + " acc = mx.gluon.metric.Accuracy()\n", " for i, (data, label) in enumerate(train_data):\n", " data = data.as_in_context(ctx)\n", " label = label.as_in_context(ctx)\n", @@ -223,7 +223,7 @@ " l = loss(output, label)\n", "l.backward()\n", "\n", - "acc = mx.metric.Accuracy()\n", + "acc = mx.gluon.metric.Accuracy()\n", "acc.update(label, output)\n", "\n", "print(\"Validation batch accuracy {}\".format(acc.get()[1]))" @@ -256,7 +256,7 @@ "\n", "output = net(data_perturbated) \n", "\n", - "acc = mx.metric.Accuracy()\n", + "acc = mx.gluon.metric.Accuracy()\n", "acc.update(label, output)\n", "\n", "print(\"Validation batch accuracy after perturbation {}\".format(acc.get()[1]))" diff --git a/example/autoencoder/variational_autoencoder/VAE_example.ipynb b/example/autoencoder/variational_autoencoder/VAE_example.ipynb index 964e13725c69..7de336611b38 100755 --- a/example/autoencoder/variational_autoencoder/VAE_example.ipynb +++ b/example/autoencoder/variational_autoencoder/VAE_example.ipynb @@ -610,7 +610,7 @@ ], "source": [ "# calculate the ELBO which is minus the loss for test set\n", - "metric = mx.metric.Loss()\n", + "metric = mx.gluon.metric.Loss()\n", "model.score(nd_iter_test, metric)" ] }, diff --git a/example/caffe/caffe_net.py b/example/caffe/caffe_net.py index 803efda9b68e..d748298a2965 100644 --- a/example/caffe/caffe_net.py +++ b/example/caffe/caffe_net.py @@ -140,6 +140,6 @@ def parse_args(): # train if use_caffe_loss: - train_model.fit(args, net, get_iterator(data_shape, use_caffe_data), mx.metric.Caffe()) + train_model.fit(args, net, get_iterator(data_shape, use_caffe_data), mx.gluon.metric.Caffe()) else: train_model.fit(args, net, get_iterator(data_shape, use_caffe_data)) diff --git a/example/caffe/train_model.py b/example/caffe/train_model.py index 
d7dfd5d7a31e..96e81e06add4 100644 --- a/example/caffe/train_model.py +++ b/example/caffe/train_model.py @@ -93,7 +93,7 @@ def fit(args, network, data_loader, eval_metrics=None, batch_end_callback=None): eval_metrics = ['accuracy'] # TopKAccuracy only allows top_k > 1 for top_k in [5, 10, 20]: - eval_metrics.append(mx.metric.create('top_k_accuracy', top_k=top_k)) + eval_metrics.append(mx.gluon.metric.create('top_k_accuracy', top_k=top_k)) if batch_end_callback is not None: if not isinstance(batch_end_callback, list): diff --git a/example/capsnet/capsulenet.py b/example/capsnet/capsulenet.py index 4d455dbc504c..2e38d85fbdea 100644 --- a/example/capsnet/capsulenet.py +++ b/example/capsnet/capsulenet.py @@ -122,7 +122,7 @@ def to4d(img): return img.reshape(img.shape[0], 1, 28, 28).astype(np.float32)/255 -class LossMetric(mx.metric.EvalMetric): +class LossMetric(mx.gluon.metric.EvalMetric): """Evaluate the loss function""" def __init__(self, batch_size, num_gpus): super(LossMetric, self).__init__('LossMetric') diff --git a/example/ctc/lstm_ocr_train.py b/example/ctc/lstm_ocr_train.py index 49d9531920ae..e774ff73ab08 100644 --- a/example/ctc/lstm_ocr_train.py +++ b/example/ctc/lstm_ocr_train.py @@ -103,7 +103,7 @@ def main(): module.fit(train_data=data_train, eval_data=data_val, # use metrics.accuracy or metrics.accuracy_lcs - eval_metric=mx.metric.np(metrics.accuracy, allow_extra_outputs=True), + eval_metric=mx.gluon.metric.np(metrics.accuracy, allow_extra_outputs=True), optimizer='sgd', optimizer_params={'learning_rate': hp.learning_rate, 'momentum': hp.momentum, diff --git a/example/deep-embedded-clustering/autoencoder.py b/example/deep-embedded-clustering/autoencoder.py index c75634475e3a..d6c15ae19df1 100644 --- a/example/deep-embedded-clustering/autoencoder.py +++ b/example/deep-embedded-clustering/autoencoder.py @@ -165,7 +165,7 @@ def l2_norm(label, pred): return np.mean(np.square(label-pred))/2.0 solver = Solver(optimizer, momentum=0.9, wd=decay, learning_rate=l_rate, lr_scheduler=lr_scheduler) - solver.set_metric(mx.metric.CustomMetric(l2_norm)) + solver.set_metric(mx.gluon.metric.CustomMetric(l2_norm)) solver.set_monitor(Monitor(print_every)) data_iter = mx.io.NDArrayIter({'data': X}, batch_size=batch_size, shuffle=True, last_batch_handle='roll_over') @@ -188,7 +188,7 @@ def l2_norm(label, pred): return np.mean(np.square(label-pred))/2.0 solver = Solver(optimizer, momentum=0.9, wd=decay, learning_rate=l_rate, lr_scheduler=lr_scheduler) - solver.set_metric(mx.metric.CustomMetric(l2_norm)) + solver.set_metric(mx.gluon.metric.CustomMetric(l2_norm)) solver.set_monitor(Monitor(print_every)) data_iter = mx.io.NDArrayIter({'data': X}, batch_size=batch_size, shuffle=True, last_batch_handle='roll_over') diff --git a/example/deep-embedded-clustering/dec.py b/example/deep-embedded-clustering/dec.py index 8fb3891e3e99..f67792f0fe37 100644 --- a/example/deep-embedded-clustering/dec.py +++ b/example/deep-embedded-clustering/dec.py @@ -122,7 +122,7 @@ def cluster(self, X, y=None, update_interval=None): def ce(label, pred): return np.sum(label*np.log(label/(pred+0.000001)))/label.shape[0] - solver.set_metric(mx.metric.CustomMetric(ce)) + solver.set_metric(mx.gluon.metric.CustomMetric(ce)) label_buff = np.zeros((X.shape[0], self.num_centers)) train_iter = mx.io.NDArrayIter({'data': X}, {'label': label_buff}, batch_size=batch_size, diff --git a/example/distributed_training-horovod/gluon_mnist.py b/example/distributed_training-horovod/gluon_mnist.py index 7b39f5776a42..c2e6f0bdc533 100644 --- 
a/example/distributed_training-horovod/gluon_mnist.py +++ b/example/distributed_training-horovod/gluon_mnist.py @@ -104,7 +104,7 @@ def conv_nets(): # Function to evaluate accuracy for a model def evaluate(model, data_iter, context): data_iter.reset() - metric = mx.metric.Accuracy() + metric = mx.gluon.metric.Accuracy() for _, batch in enumerate(data_iter): data = batch.data[0].as_in_context(context) label = batch.label[0].as_in_context(context) @@ -149,7 +149,7 @@ def evaluate(model, data_iter, context): # Create loss function and train metric loss_fn = gluon.loss.SoftmaxCrossEntropyLoss() -metric = mx.metric.Accuracy() +metric = mx.gluon.metric.Accuracy() # Train model for epoch in range(args.epochs): diff --git a/example/distributed_training-horovod/module_mnist.py b/example/distributed_training-horovod/module_mnist.py index 4fcb02a46996..74f6bc9daf21 100644 --- a/example/distributed_training-horovod/module_mnist.py +++ b/example/distributed_training-horovod/module_mnist.py @@ -157,7 +157,7 @@ def conv_net(): num_epoch=args.epochs) # train for at most 10 dataset passes # Step 7: evaluate model accuracy -acc = mx.metric.Accuracy() +acc = mx.gluon.metric.Accuracy() model.score(val_iter, acc) if hvd.rank() == 0: diff --git a/example/distributed_training-horovod/resnet50_imagenet.py b/example/distributed_training-horovod/resnet50_imagenet.py index 5e5169e98ece..ae8a56100929 100644 --- a/example/distributed_training-horovod/resnet50_imagenet.py +++ b/example/distributed_training-horovod/resnet50_imagenet.py @@ -286,8 +286,8 @@ def evaluate(epoch): return val_data.reset() - acc_top1 = mx.metric.Accuracy() - acc_top5 = mx.metric.TopKAccuracy(5) + acc_top1 = mx.gluon.metric.Accuracy() + acc_top5 = mx.gluon.metric.TopKAccuracy(5) for _, batch in enumerate(val_data): data, label = batch_fn(batch, context) output = net(data.astype(args.dtype, copy=False)) @@ -321,7 +321,7 @@ def evaluate(epoch): # Create loss function and train metric loss_fn = gluon.loss.SoftmaxCrossEntropyLoss() - metric = mx.metric.Accuracy() + metric = mx.gluon.metric.Accuracy() # Train model for epoch in range(args.num_epochs): @@ -450,8 +450,8 @@ def train_module(): # Evaluate performance if not using synthetic data if args.use_rec: - acc_top1 = mx.metric.Accuracy() - acc_top5 = mx.metric.TopKAccuracy(5) + acc_top1 = mx.gluon.metric.Accuracy() + acc_top5 = mx.gluon.metric.TopKAccuracy(5) res = mod.score(val_data, [acc_top1, acc_top5]) for name, val in res: logging.info('Epoch[%d] Rank[%d] Validation-%s=%f', diff --git a/example/distributed_training/cifar10_dist.py b/example/distributed_training/cifar10_dist.py index d3ba515776f6..8c5fb3639ef9 100644 --- a/example/distributed_training/cifar10_dist.py +++ b/example/distributed_training/cifar10_dist.py @@ -121,7 +121,7 @@ def evaluate_accuracy(data_iterator, network): ---------- tuple of array element """ - acc = mx.metric.Accuracy() + acc = mx.gluon.metric.Accuracy() # Iterate through data and label for i, (data, label) in enumerate(data_iterator): diff --git a/example/distributed_training/cifar10_kvstore_hvd.py b/example/distributed_training/cifar10_kvstore_hvd.py index e6780e5db85e..ff679864f7c3 100644 --- a/example/distributed_training/cifar10_kvstore_hvd.py +++ b/example/distributed_training/cifar10_kvstore_hvd.py @@ -123,7 +123,7 @@ def evaluate(data_iterator, network, context): ---------- tuple of array element """ - acc = mx.metric.Accuracy() + acc = mx.gluon.metric.Accuracy() # Iterate through data and label for i, (data, label) in enumerate(data_iterator): @@ -208,7 
+208,7 @@ def __len__(self): optimizer_params={'learning_rate': args.lr}, kvstore=store) -train_metric = mx.metric.Accuracy() +train_metric = mx.gluon.metric.Accuracy() # Run as many epochs as required for epoch in range(args.epochs): diff --git a/example/fcn-xs/solver.py b/example/fcn-xs/solver.py index e99b31a13055..ab8964f80898 100644 --- a/example/fcn-xs/solver.py +++ b/example/fcn-xs/solver.py @@ -23,7 +23,7 @@ from collections import namedtuple from mxnet import optimizer as opt from mxnet.optimizer import get_updater -from mxnet import metric +from mxnet.gluon import metric # Parameter to pass to batch_end_callback BatchEndParam = namedtuple('BatchEndParams', ['epoch', 'nbatch', 'eval_metric']) diff --git a/example/gluon/audio/urban_sounds/train.py b/example/gluon/audio/urban_sounds/train.py index c88f9fb55187..8a55c5b5bc67 100644 --- a/example/gluon/audio/urban_sounds/train.py +++ b/example/gluon/audio/urban_sounds/train.py @@ -28,7 +28,7 @@ def evaluate_accuracy(data_iterator, net): """Function to evaluate accuracy of any data iterator passed to it as an argument""" - acc = mx.metric.Accuracy() + acc = mx.gluon.metric.Accuracy() for data, label in data_iterator: output = net(data) predictions = nd.argmax(output, axis=1) diff --git a/example/gluon/dc_gan/dcgan.py b/example/gluon/dc_gan/dcgan.py index 93af13ababf3..1b1fa75c1c2a 100644 --- a/example/gluon/dc_gan/dcgan.py +++ b/example/gluon/dc_gan/dcgan.py @@ -259,7 +259,7 @@ def main(): real_label = mx.nd.ones((opt.batch_size,), ctx=ctx) fake_label = mx.nd.zeros((opt.batch_size,), ctx=ctx) - metric = mx.metric.Accuracy() + metric = mx.gluon.metric.Accuracy() print('Training... ') stamp = datetime.now().strftime('%Y_%m_%d-%H_%M') diff --git a/example/gluon/image_classification.py b/example/gluon/image_classification.py index 44a2afea3681..de31b06655eb 100644 --- a/example/gluon/image_classification.py +++ b/example/gluon/image_classification.py @@ -27,7 +27,7 @@ from mxnet.gluon.model_zoo import vision as models from mxnet import autograd as ag from mxnet.test_utils import get_mnist_iterator -from mxnet.metric import Accuracy, TopKAccuracy, CompositeEvalMetric +from mxnet.gluon.metric import Accuracy, TopKAccuracy, CompositeEvalMetric import numpy as np from data import (get_cifar10_iterator, get_imagenet_iterator, diff --git a/example/gluon/mnist/mnist.py b/example/gluon/mnist/mnist.py index 6aea3abc5041..4c1cc16bb7df 100644 --- a/example/gluon/mnist/mnist.py +++ b/example/gluon/mnist/mnist.py @@ -70,7 +70,7 @@ def transformer(data, label): # train def test(ctx): - metric = mx.metric.Accuracy() + metric = mx.gluon.metric.Accuracy() for data, label in val_data: data = data.as_in_context(ctx) label = label.as_in_context(ctx) @@ -86,7 +86,7 @@ def train(epochs, ctx): # Trainer is for updating parameters with gradient. 
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': opt.lr, 'momentum': opt.momentum}) - metric = mx.metric.Accuracy() + metric = mx.gluon.metric.Accuracy() loss = gluon.loss.SoftmaxCrossEntropyLoss() for epoch in range(epochs): diff --git a/example/gluon/sn_gan/train.py b/example/gluon/sn_gan/train.py index 46e44791cebd..fc4e87d632fe 100644 --- a/example/gluon/sn_gan/train.py +++ b/example/gluon/sn_gan/train.py @@ -102,7 +102,7 @@ def facc(label, pred): g_net.collect_params().zero_grad() d_net.collect_params().zero_grad() # define evaluation metric -metric = mx.metric.CustomMetric(facc) +metric = mx.gluon.metric.CustomMetric(facc) # initialize labels real_label = nd.ones(BATCH_SIZE, CTX) fake_label = nd.zeros(BATCH_SIZE, CTX) diff --git a/example/gluon/super_resolution/super_resolution.py b/example/gluon/super_resolution/super_resolution.py index 4a3e8d92aa39..52bfc2241f82 100644 --- a/example/gluon/super_resolution/super_resolution.py +++ b/example/gluon/super_resolution/super_resolution.py @@ -156,7 +156,7 @@ def hybrid_forward(self, F, x): return x net = SuperResolutionNet(upscale_factor) -metric = mx.metric.MSE() +metric = mx.gluon.metric.MSE() def test(ctx): val_data.reset() diff --git a/example/gluon/tree_lstm/main.py b/example/gluon/tree_lstm/main.py index 53af3fa019e9..41e4f4f13ed8 100644 --- a/example/gluon/tree_lstm/main.py +++ b/example/gluon/tree_lstm/main.py @@ -96,7 +96,7 @@ net = SimilarityTreeLSTM(sim_hidden_size, rnn_hidden_size, vocab.size, vocab.embed.shape[1], num_classes) # use pearson correlation and mean-square error for evaluation -metric = mx.metric.create(['pearsonr', 'mse']) +metric = mx.gluon.metric.create(['pearsonr', 'mse']) def to_target(x): target = np.zeros((1, num_classes)) diff --git a/example/image-classification/common/fit.py b/example/image-classification/common/fit.py index 38ca296cf986..8662db3baba4 100644 --- a/example/image-classification/common/fit.py +++ b/example/image-classification/common/fit.py @@ -290,7 +290,7 @@ def fit(args, network, data_loader, **kwargs): # evaluation metrices eval_metrics = ['accuracy'] if args.top_k > 0: - eval_metrics.append(mx.metric.create( + eval_metrics.append(mx.gluon.metric.create( 'top_k_accuracy', top_k=args.top_k)) supported_loss = ['ce', 'nll_loss'] @@ -306,7 +306,7 @@ def fit(args, network, data_loader, **kwargs): logging.warning(loss_type + ' is not an valid loss type, only cross-entropy or ' \ 'negative likelihood loss is supported!') else: - eval_metrics.append(mx.metric.create(loss_type)) + eval_metrics.append(mx.gluon.metric.create(loss_type)) else: logging.warning("The output is not softmax_output, loss argument will be skipped!") diff --git a/example/image-classification/score.py b/example/image-classification/score.py index f40e649f1f42..dbad44ef6981 100644 --- a/example/image-classification/score.py +++ b/example/image-classification/score.py @@ -97,8 +97,8 @@ def score(model, data_val, metrics, gpus, batch_size, rgb_mean=None, mean_img=No logger = logging.getLogger() logger.setLevel(logging.DEBUG) - metrics = [mx.metric.create('acc'), - mx.metric.create('top_k_accuracy', top_k = 5)] + metrics = [mx.gluon.metric.create('acc'), + mx.gluon.metric.create('top_k_accuracy', top_k = 5)] (speed,) = score(metrics = metrics, **vars(args)) logging.info('Finished with %f images per second', speed) diff --git a/example/image-classification/test_score.py b/example/image-classification/test_score.py index 58c5c66a7f1f..1a82bcff5ba3 100644 --- a/example/image-classification/test_score.py +++ 
b/example/image-classification/test_score.py @@ -43,7 +43,7 @@ def test_imagenet1k_resnet(imagenet_val_5k_settings): models = ['imagenet1k-resnet-50', 'imagenet1k-resnet-152'] accs = [.77, .78] for (m, g) in zip(models, accs): - acc = mx.metric.create('acc') + acc = mx.gluon.metric.create('acc') (speed,) = score(model=m, data_val=imagenet_val_5k, rgb_mean='0,0,0', metrics=acc, **kwargs) r = acc.get()[1] @@ -52,7 +52,7 @@ def test_imagenet1k_resnet(imagenet_val_5k_settings): def test_imagenet1k_inception_bn(imagenet_val_5k_settings): imagenet_val_5k, kwargs = imagenet_val_5k_settings - acc = mx.metric.create('acc') + acc = mx.gluon.metric.create('acc') m = 'imagenet1k-inception-bn' g = 0.75 (speed,) = score(model=m, diff --git a/example/kaggle-ndsb2/Train.py b/example/kaggle-ndsb2/Train.py index 51e308a2e21c..c3ab165d11da 100644 --- a/example/kaggle-ndsb2/Train.py +++ b/example/kaggle-ndsb2/Train.py @@ -111,7 +111,7 @@ def encode_csv(label_csv, systole_csv, diastole_csv): wd = 0.00001, momentum = 0.9) -systole_model.fit(X=data_train, eval_metric = mx.metric.np(CRPS)) +systole_model.fit(X=data_train, eval_metric = mx.gluon.metric.np(CRPS)) # # Predict systole @@ -139,7 +139,7 @@ def encode_csv(label_csv, systole_csv, diastole_csv): wd = 0.00001, momentum = 0.9) -diastole_model.fit(X=data_train, eval_metric = mx.metric.np(CRPS)) +diastole_model.fit(X=data_train, eval_metric = mx.gluon.metric.np(CRPS)) # # Predict diastole diff --git a/example/model-parallel/matrix_factorization/train.py b/example/model-parallel/matrix_factorization/train.py index 591dab3a6534..fea2c153f853 100644 --- a/example/model-parallel/matrix_factorization/train.py +++ b/example/model-parallel/matrix_factorization/train.py @@ -94,7 +94,7 @@ 'rescale_grad': 1.0/batch_size} # use MSE as the metric - metric = mx.metric.create(['MSE']) + metric = mx.gluon.metric.create(['MSE']) speedometer = mx.callback.Speedometer(batch_size, print_every) diff --git a/example/module/mnist_mlp.py b/example/module/mnist_mlp.py index 7d63a584aec9..f6d5bf306bd8 100644 --- a/example/module/mnist_mlp.py +++ b/example/module/mnist_mlp.py @@ -55,7 +55,7 @@ mod.init_params() mod.init_optimizer(optimizer_params={'learning_rate':0.01, 'momentum': 0.9}) -metric = mx.metric.create('acc') +metric = mx.gluon.metric.create('acc') for i_epoch in range(n_epoch): for i_iter, batch in enumerate(train_dataiter): diff --git a/example/multi-task/multi-task-learning.ipynb b/example/multi-task/multi-task-learning.ipynb index 048d6d9862b8..e615559441f6 100644 --- a/example/multi-task/multi-task-learning.ipynb +++ b/example/multi-task/multi-task-learning.ipynb @@ -267,8 +267,8 @@ "outputs": [], "source": [ "def evaluate_accuracy(net, data_iterator):\n", - " acc_digits = mx.metric.Accuracy(name='digits')\n", - " acc_odd_even = mx.metric.Accuracy(name='odd_even')\n", + " acc_digits = mx.gluon.metric.Accuracy(name='digits')\n", + " acc_odd_even = mx.gluon.metric.Accuracy(name='odd_even')\n", " \n", " for i, (data, label_digit, label_odd_even) in enumerate(data_iterator):\n", " data = data.as_in_context(ctx)\n", @@ -335,8 +335,8 @@ "source": [ "for e in range(epochs):\n", " # Accuracies for each task\n", - " acc_digits = mx.metric.Accuracy(name='digits')\n", - " acc_odd_even = mx.metric.Accuracy(name='odd_even')\n", + " acc_digits = mx.gluon.metric.Accuracy(name='digits')\n", + " acc_odd_even = mx.gluon.metric.Accuracy(name='odd_even')\n", " # Accumulative losses\n", " l_digits_ = 0.\n", " l_odd_even_ = 0. 
\n", diff --git a/example/multivariate_time_series/src/metrics.py b/example/multivariate_time_series/src/metrics.py index 4818591068f8..6dd8e765f0ed 100644 --- a/example/multivariate_time_series/src/metrics.py +++ b/example/multivariate_time_series/src/metrics.py @@ -46,10 +46,10 @@ def get_custom_metrics(): """ :return: mxnet metric object """ - _rse = mx.metric.create(rse) - _rae = mx.metric.create(rae) - _corr = mx.metric.create(corr) - return mx.metric.create([_rae, _rse, _corr]) + _rse = mx.gluon.metric.create(rse) + _rae = mx.gluon.metric.create(rae) + _corr = mx.gluon.metric.create(corr) + return mx.gluon.metric.create([_rae, _rse, _corr]) def evaluate(pred, label): return {"RAE":rae(label, pred), "RSE":rse(label,pred),"CORR": corr(label,pred)} \ No newline at end of file diff --git a/example/named_entity_recognition/src/metrics.py b/example/named_entity_recognition/src/metrics.py index a1d270af6863..d04904c7763e 100644 --- a/example/named_entity_recognition/src/metrics.py +++ b/example/named_entity_recognition/src/metrics.py @@ -79,9 +79,9 @@ def entity_f1(label, pred): return classifer_metrics(label, pred)[2] def composite_classifier_metrics(): - metric1 = mx.metric.CustomMetric(feval=entity_precision, name='entity precision') - metric2 = mx.metric.CustomMetric(feval=entity_recall, name='entity recall') - metric3 = mx.metric.CustomMetric(feval=entity_f1, name='entity f1 score') - metric4 = mx.metric.Accuracy() + metric1 = mx.gluon.metric.CustomMetric(feval=entity_precision, name='entity precision') + metric2 = mx.gluon.metric.CustomMetric(feval=entity_recall, name='entity recall') + metric3 = mx.gluon.metric.CustomMetric(feval=entity_f1, name='entity f1 score') + metric4 = mx.gluon.metric.Accuracy() - return mx.metric.CompositeEvalMetric([metric4, metric1, metric2, metric3]) + return mx.gluon.metric.CompositeEvalMetric([metric4, metric1, metric2, metric3]) diff --git a/example/nce-loss/nce.py b/example/nce-loss/nce.py index e59220a026a8..6764e9c20852 100644 --- a/example/nce-loss/nce.py +++ b/example/nce-loss/nce.py @@ -62,7 +62,7 @@ def nce_loss_subwords( label=label_weight) -class NceAccuracy(mx.metric.EvalMetric): +class NceAccuracy(mx.gluon.metric.EvalMetric): def __init__(self): super(NceAccuracy, self).__init__('nce-accuracy') @@ -75,7 +75,7 @@ def update(self, labels, preds): self.num_inst += 1 -class NceAuc(mx.metric.EvalMetric): +class NceAuc(mx.gluon.metric.EvalMetric): def __init__(self): super(NceAuc, self).__init__('nce-auc') @@ -105,7 +105,7 @@ def update(self, labels, preds): self.num_inst += 1 -class NceLSTMAuc(mx.metric.EvalMetric): +class NceLSTMAuc(mx.gluon.metric.EvalMetric): def __init__(self): super(NceLSTMAuc, self).__init__('nce-lstm-auc') diff --git a/example/neural_collaborative_filtering/train.py b/example/neural_collaborative_filtering/train.py index c68f271a6f0d..f99b16fd5b0e 100644 --- a/example/neural_collaborative_filtering/train.py +++ b/example/neural_collaborative_filtering/train.py @@ -124,7 +124,7 @@ def cross_entropy(label, pred, eps=1e-12): mod.init_params() mod.init_optimizer(optimizer='adam', optimizer_params=[('learning_rate', learning_rate), ('beta1',beta1), ('beta2',beta2), ('epsilon',eps)]) - metric = mx.metric.create(cross_entropy) + metric = mx.gluon.metric.create(cross_entropy) speedometer = mx.callback.Speedometer(batch_size, log_interval) best_hr, best_ndcg, best_iter = -1, -1, -1 logging.info('Training started ...') diff --git a/example/quantization/imagenet_inference.py b/example/quantization/imagenet_inference.py index 
4d690d37d00c..2f41fec2a9a3 100644 --- a/example/quantization/imagenet_inference.py +++ b/example/quantization/imagenet_inference.py @@ -70,8 +70,8 @@ def advance_data_iter(data_iter, n): def score(sym, arg_params, aux_params, data, devs, label_name, max_num_examples, logger=None): - metrics = [mx.metric.create('acc'), - mx.metric.create('top_k_accuracy', top_k=5)] + metrics = [mx.gluon.metric.create('acc'), + mx.gluon.metric.create('top_k_accuracy', top_k=5)] if not isinstance(metrics, list): metrics = [metrics, ] mod = mx.mod.Module(symbol=sym, context=devs, label_names=[label_name, ]) diff --git a/example/rcnn/symnet/metric.py b/example/rcnn/symnet/metric.py index fa8d7919e919..6509ba436d75 100644 --- a/example/rcnn/symnet/metric.py +++ b/example/rcnn/symnet/metric.py @@ -25,7 +25,7 @@ def get_names(): return pred, label -class RPNAccMetric(mx.metric.EvalMetric): +class RPNAccMetric(mx.gluon.metric.EvalMetric): def __init__(self): super(RPNAccMetric, self).__init__('RPNAcc') self.pred, self.label = get_names() @@ -49,7 +49,7 @@ def update(self, labels, preds): self.num_inst += len(pred_label.flat) -class RCNNAccMetric(mx.metric.EvalMetric): +class RCNNAccMetric(mx.gluon.metric.EvalMetric): def __init__(self): super(RCNNAccMetric, self).__init__('RCNNAcc') self.pred, self.label = get_names() @@ -66,7 +66,7 @@ def update(self, labels, preds): self.num_inst += len(pred_label.flat) -class RPNLogLossMetric(mx.metric.EvalMetric): +class RPNLogLossMetric(mx.gluon.metric.EvalMetric): def __init__(self): super(RPNLogLossMetric, self).__init__('RPNLogLoss') self.pred, self.label = get_names() @@ -93,7 +93,7 @@ def update(self, labels, preds): self.num_inst += label.shape[0] -class RCNNLogLossMetric(mx.metric.EvalMetric): +class RCNNLogLossMetric(mx.gluon.metric.EvalMetric): def __init__(self): super(RCNNLogLossMetric, self).__init__('RCNNLogLoss') self.pred, self.label = get_names() @@ -114,7 +114,7 @@ def update(self, labels, preds): self.num_inst += label.shape[0] -class RPNL1LossMetric(mx.metric.EvalMetric): +class RPNL1LossMetric(mx.gluon.metric.EvalMetric): def __init__(self): super(RPNL1LossMetric, self).__init__('RPNL1Loss') self.pred, self.label = get_names() @@ -130,7 +130,7 @@ def update(self, labels, preds): self.num_inst += num_inst -class RCNNL1LossMetric(mx.metric.EvalMetric): +class RCNNL1LossMetric(mx.gluon.metric.EvalMetric): def __init__(self): super(RCNNL1LossMetric, self).__init__('RCNNL1Loss') self.pred, self.label = get_names() diff --git a/example/rcnn/train.py b/example/rcnn/train.py index 7b1f2f7f31a5..4d89ac6e2cdd 100644 --- a/example/rcnn/train.py +++ b/example/rcnn/train.py @@ -85,7 +85,7 @@ def train_net(sym, roidb, args): eval_metric = RCNNAccMetric() cls_metric = RCNNLogLossMetric() bbox_metric = RCNNL1LossMetric() - eval_metrics = mx.metric.CompositeEvalMetric() + eval_metrics = mx.gluon.metric.CompositeEvalMetric() for child_metric in [rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric, eval_metric, cls_metric, bbox_metric]: eval_metrics.add(child_metric) diff --git a/example/rnn/bucketing/cudnn_rnn_bucketing.py b/example/rnn/bucketing/cudnn_rnn_bucketing.py index 38275ae3dfb8..8f77172087ef 100644 --- a/example/rnn/bucketing/cudnn_rnn_bucketing.py +++ b/example/rnn/bucketing/cudnn_rnn_bucketing.py @@ -156,7 +156,7 @@ def sym_gen(seq_len): model.fit( train_data = data_train, eval_data = data_val, - eval_metric = mx.metric.Perplexity(invalid_label), + eval_metric = mx.gluon.metric.Perplexity(invalid_label), kvstore = args.kv_store, optimizer = args.optimizer, 
optimizer_params = opt_params, @@ -244,14 +244,14 @@ def sym_gen(seq_len): if args.dtype == "float32": model.set_params(arg_params, aux_params) - model.score(data_val, mx.metric.Perplexity(invalid_label), + model.score(data_val, mx.gluon.metric.Perplexity(invalid_label), batch_end_callback=mx.callback.Speedometer(args.batch_size, 5)) else: assert args.dtype == "float16", "Only float32 and float16 are supported currently" model = amp.convert_bucketing_module(model, target_dtype="float16") model.bind(data_val.provide_data, data_val.provide_label, for_training=False) - model.score(data_val, mx.metric.Perplexity(invalid_label), + model.score(data_val, mx.gluon.metric.Perplexity(invalid_label), batch_end_callback=mx.callback.Speedometer(args.batch_size, 5)) if __name__ == '__main__': diff --git a/example/rnn/bucketing/lstm_bucketing.py b/example/rnn/bucketing/lstm_bucketing.py index 7f150104f458..281aa8988ab0 100644 --- a/example/rnn/bucketing/lstm_bucketing.py +++ b/example/rnn/bucketing/lstm_bucketing.py @@ -115,7 +115,7 @@ def sym_gen(seq_len): model.fit( train_data = data_train, eval_data = data_val, - eval_metric = mx.metric.Perplexity(invalid_label), + eval_metric = mx.gluon.metric.Perplexity(invalid_label), kvstore = args.kv_store, optimizer = args.optimizer, optimizer_params = { 'learning_rate': args.lr, diff --git a/example/rnn/old/char-rnn.ipynb b/example/rnn/old/char-rnn.ipynb index 1ec56cd9aa8c..4fd32d932512 100644 --- a/example/rnn/old/char-rnn.ipynb +++ b/example/rnn/old/char-rnn.ipynb @@ -347,7 +347,7 @@ "source": [ "# Fit it\n", "model.fit(X=data_train,\n", - " eval_metric = mx.metric.np(Perplexity),\n", + " eval_metric = mx.gluon.metric.np(Perplexity),\n", " batch_end_callback=mx.callback.Speedometer(batch_size, 50),\n", " epoch_end_callback=mx.callback.do_checkpoint(\"obama\"))" ] diff --git a/example/rnn/old/gru_bucketing.py b/example/rnn/old/gru_bucketing.py index b9f651a90dc0..47c13ec0db43 100644 --- a/example/rnn/old/gru_bucketing.py +++ b/example/rnn/old/gru_bucketing.py @@ -88,6 +88,6 @@ def sym_gen(seq_len): logging.basicConfig(level=logging.DEBUG, format=head) model.fit(X=data_train, eval_data=data_val, - eval_metric = mx.metric.np(Perplexity), + eval_metric = mx.gluon.metric.np(Perplexity), batch_end_callback=mx.callback.Speedometer(batch_size, 50),) diff --git a/example/rnn/old/lstm_bucketing.py b/example/rnn/old/lstm_bucketing.py index 0fe4116250a2..2bea6cc3898f 100644 --- a/example/rnn/old/lstm_bucketing.py +++ b/example/rnn/old/lstm_bucketing.py @@ -90,6 +90,6 @@ def sym_gen(seq_len): logging.basicConfig(level=logging.DEBUG, format=head) model.fit(X=data_train, eval_data=data_val, kvstore='device', - eval_metric = mx.metric.np(Perplexity), + eval_metric = mx.gluon.metric.np(Perplexity), batch_end_callback=mx.callback.Speedometer(batch_size, 50),) diff --git a/example/rnn/old/rnn_cell_demo.py b/example/rnn/old/rnn_cell_demo.py index c5772fa3a5b7..64a8ee0fe72b 100644 --- a/example/rnn/old/rnn_cell_demo.py +++ b/example/rnn/old/rnn_cell_demo.py @@ -144,7 +144,7 @@ def sym_gen(seq_len): logging.basicConfig(level=logging.DEBUG, format=head) mod.fit(data_train, eval_data=data_val, num_epoch=num_epoch, - eval_metric=mx.metric.np(Perplexity), + eval_metric=mx.gluon.metric.np(Perplexity), batch_end_callback=mx.callback.Speedometer(batch_size, 50), initializer=mx.init.Xavier(factor_type="in", magnitude=2.34), optimizer='sgd', diff --git a/example/sparse/factorization_machine/metric.py b/example/sparse/factorization_machine/metric.py index a8c52c781c0f..8c80f0092203 100644 
--- a/example/sparse/factorization_machine/metric.py +++ b/example/sparse/factorization_machine/metric.py @@ -19,9 +19,9 @@ import numpy as np from operator import itemgetter -@mx.metric.register -@mx.metric.alias('log_loss') -class LogLossMetric(mx.metric.EvalMetric): +@mx.gluon.metric.register +@mx.gluon.metric.alias('log_loss') +class LogLossMetric(mx.gluon.metric.EvalMetric): """Computes the negative log-likelihood loss. The negative log-likelihoodd loss over a batch of sample size :math:`N` is given by @@ -51,7 +51,7 @@ class LogLossMetric(mx.metric.EvalMetric): -------- >>> predicts = [mx.nd.array([[0.3], [0], [0.4]])] >>> labels = [mx.nd.array([0, 1, 1])] - >>> log_loss= mx.metric.NegativeLogLikelihood() + >>> log_loss= mx.gluon.metric.NegativeLogLikelihood() >>> log_loss.update(labels, predicts) >>> print(log_loss.get()) ('log-loss', 0.57159948348999023) @@ -74,7 +74,7 @@ def update(self, labels, preds): preds : list of `NDArray` Predicted values. """ - mx.metric.check_label_shapes(labels, preds) + mx.gluon.metric.check_label_shapes(labels, preds) for label, pred in zip(labels, preds): label = label.asnumpy() @@ -88,16 +88,16 @@ def update(self, labels, preds): self.sum_metric += (-np.log(prob + self.eps)).sum() self.num_inst += num_examples -@mx.metric.register -@mx.metric.alias('auc') -class AUCMetric(mx.metric.EvalMetric): +@mx.gluon.metric.register +@mx.gluon.metric.alias('auc') +class AUCMetric(mx.gluon.metric.EvalMetric): def __init__(self, eps=1e-12): super(AUCMetric, self).__init__( 'auc') self.eps = eps def update(self, labels, preds): - mx.metric.check_label_shapes(labels, preds) + mx.gluon.metric.check_label_shapes(labels, preds) label_weight = labels[0].asnumpy() preds = preds[0].asnumpy() tmp = [] diff --git a/example/sparse/factorization_machine/train.py b/example/sparse/factorization_machine/train.py index b30f9cc81acf..1e2ab0e2f0ff 100644 --- a/example/sparse/factorization_machine/train.py +++ b/example/sparse/factorization_machine/train.py @@ -110,7 +110,7 @@ def all_row_ids(data_batch): mod.init_optimizer(optimizer='adam', kvstore=kv, optimizer_params=optimizer_params) # metrics - metric = mx.metric.create(['log_loss', 'auc']) + metric = mx.gluon.metric.create(['log_loss', 'auc']) speedometer = mx.callback.Speedometer(batch_size, log_interval) logging.info('Training started ...') diff --git a/example/sparse/linear_classification/train.py b/example/sparse/linear_classification/train.py index 0a8acfd87bef..77eb2c09de28 100644 --- a/example/sparse/linear_classification/train.py +++ b/example/sparse/linear_classification/train.py @@ -100,7 +100,7 @@ def all_row_ids(data_batch): optim = mx.optimizer.create(optimizer, learning_rate=0.01, rescale_grad=1.0/batch_size/num_worker) mod.init_optimizer(optimizer=optim, kvstore=kv) # use accuracy as the metric - metric = mx.metric.create(['nll_loss']) + metric = mx.gluon.metric.create(['nll_loss']) # get the sparse weight parameter speedometer = mx.callback.Speedometer(batch_size, 100) diff --git a/example/sparse/matrix_factorization/train.py b/example/sparse/matrix_factorization/train.py index 44bab2c416ba..d9dccce89459 100644 --- a/example/sparse/matrix_factorization/train.py +++ b/example/sparse/matrix_factorization/train.py @@ -101,7 +101,7 @@ def all_row_ids(data_batch): rescale_grad=1.0/batch_size) mod.init_optimizer(optimizer=optim, kvstore='device') # use MSE as the metric - metric = mx.metric.create(['MSE']) + metric = mx.gluon.metric.create(['MSE']) speedometer = mx.callback.Speedometer(batch_size, log_interval) 
logging.info('Training started ...') for epoch in range(num_epoch): diff --git a/example/sparse/wide_deep/inference.py b/example/sparse/wide_deep/inference.py index e14396e50c15..c615020200e2 100644 --- a/example/sparse/wide_deep/inference.py +++ b/example/sparse/wide_deep/inference.py @@ -93,7 +93,7 @@ else: logging.info('Inference started ...') # use accuracy as the metric - metric = mx.metric.create(['acc']) + metric = mx.gluon.metric.create(['acc']) accuracy_avg = 0.0 for batch in data_iter: nbatch += 1 diff --git a/example/sparse/wide_deep/train.py b/example/sparse/wide_deep/train.py index eea70301660d..c8c2b157865a 100644 --- a/example/sparse/wide_deep/train.py +++ b/example/sparse/wide_deep/train.py @@ -83,7 +83,7 @@ optim = mx.optimizer.create(optimizer, learning_rate=lr, rescale_grad=1.0/batch_size) mod.init_optimizer(optimizer=optim) # use accuracy as the metric - metric = mx.metric.create(['acc']) + metric = mx.gluon.metric.create(['acc']) # get the sparse weight parameter speedometer = mx.callback.Speedometer(batch_size, log_interval) diff --git a/example/speech_recognition/stt_metric.py b/example/speech_recognition/stt_metric.py index 26609627ea58..1eb77aa301cb 100644 --- a/example/speech_recognition/stt_metric.py +++ b/example/speech_recognition/stt_metric.py @@ -35,7 +35,7 @@ def check_label_shapes(labels, preds, shape=0): "predictions {}".format(label_shape, pred_shape)) -class STTMetric(mx.metric.EvalMetric): +class STTMetric(mx.gluon.metric.EvalMetric): def __init__(self, batch_size, num_gpu, is_epoch_end=False, is_logging=True): super(STTMetric, self).__init__('STTMetric') diff --git a/example/ssd/evaluate/eval_metric.py b/example/ssd/evaluate/eval_metric.py index 1deb381fb859..b038d3afb376 100644 --- a/example/ssd/evaluate/eval_metric.py +++ b/example/ssd/evaluate/eval_metric.py @@ -18,7 +18,7 @@ import mxnet as mx import numpy as np -class MApMetric(mx.metric.EvalMetric): +class MApMetric(mx.gluon.metric.EvalMetric): """ Calculate mean AP for object detection task diff --git a/example/ssd/train/metric.py b/example/ssd/train/metric.py index 731f8fcc19f4..a99c8762de16 100644 --- a/example/ssd/train/metric.py +++ b/example/ssd/train/metric.py @@ -19,7 +19,7 @@ import numpy as np -class MultiBoxMetric(mx.metric.EvalMetric): +class MultiBoxMetric(mx.gluon.metric.EvalMetric): """Calculate metrics for Multibox training """ def __init__(self, eps=1e-8): super(MultiBoxMetric, self).__init__('MultiBox') diff --git a/example/svm_mnist/svm_mnist.py b/example/svm_mnist/svm_mnist.py index e166cb6ac707..9ceae6d4588b 100644 --- a/example/svm_mnist/svm_mnist.py +++ b/example/svm_mnist/svm_mnist.py @@ -113,8 +113,8 @@ 'momentum': 0.9, # Momentum for SGD with momentum 'wd': 0.00001, # Weight decay for regularization }) - results[output.name] = mod.score(test_iter, mx.metric.Accuracy())[0][1]*100 - print('Accuracy for %s:'%output.name, mod.score(test_iter, mx.metric.Accuracy())[0][1]*100, '%\n') + results[output.name] = mod.score(test_iter, mx.gluon.metric.Accuracy())[0][1]*100 + print('Accuracy for %s:'%output.name, mod.score(test_iter, mx.gluon.metric.Accuracy())[0][1]*100, '%\n') for key, value in results.items(): print(key, value, "%s") diff --git a/example/svrg_module/api_usage_example/example_api_train.py b/example/svrg_module/api_usage_example/example_api_train.py index f6cd1b2e592c..cc9987fe3edb 100644 --- a/example/svrg_module/api_usage_example/example_api_train.py +++ b/example/svrg_module/api_usage_example/example_api_train.py @@ -40,7 +40,7 @@ def 
test_svrg_intermediate_level_api(args): mod.init_params(initializer=mx.init.Uniform(0.01), allow_missing=False, force_init=False, allow_extra=False) kv = mx.kv.create("local") mod.init_optimizer(kvstore=kv, optimizer='sgd', optimizer_params=(('learning_rate', 0.025),)) - metrics = mx.metric.create("mse") + metrics = mx.gluon.metric.create("mse") for e in range(num_epoch): metrics.reset() if e % mod.update_freq == 0: diff --git a/example/svrg_module/api_usage_example/example_inference.py b/example/svrg_module/api_usage_example/example_inference.py index 312f9796074d..7e5b7a40abe2 100644 --- a/example/svrg_module/api_usage_example/example_inference.py +++ b/example/svrg_module/api_usage_example/example_inference.py @@ -42,7 +42,7 @@ def get_validation_score(args): mod.bind(data_shapes=train_iter.provide_data, label_shapes=train_iter.provide_label) mod.init_params(initializer=mx.init.Uniform(0.01), allow_missing=False, force_init=False, allow_extra=False) mod.init_optimizer(kvstore='local', optimizer='sgd', optimizer_params=(('learning_rate', 0.025),)) - metrics = mx.metric.create("mse") + metrics = mx.gluon.metric.create("mse") for e in range(epoch): metrics.reset() if e % mod.update_freq == 0: diff --git a/example/svrg_module/benchmarks/svrg_benchmark.ipynb b/example/svrg_module/benchmarks/svrg_benchmark.ipynb index 54ae81281db3..66f52d70be5f 100644 --- a/example/svrg_module/benchmarks/svrg_benchmark.ipynb +++ b/example/svrg_module/benchmarks/svrg_benchmark.ipynb @@ -127,7 +127,7 @@ " mod.bind(data_shapes=di.provide_data, label_shapes=di.provide_label)\n", " mod.init_params(initializer=mx.init.Zero(), allow_missing=False, force_init=False, allow_extra=False)\n", " mod.init_optimizer(kvstore='local', optimizer='sgd', optimizer_params=optimizer_params)\n", - " metrics = mx.metric.create(\"mse\")\n", + " metrics = mx.gluon.metric.create(\"mse\")\n", " \n", " results = {}\n", " for e in range(num_epoch):\n", @@ -170,7 +170,7 @@ " mod.bind(data_shapes=di.provide_data, label_shapes=di.provide_label)\n", " mod.init_params(initializer=mx.init.Zero(), allow_missing=False, force_init=False, allow_extra=False)\n", " mod.init_optimizer(kvstore='local', optimizer='sgd', optimizer_params=optimizer_params)\n", - " metrics = mx.metric.create(\"mse\")\n", + " metrics = mx.gluon.metric.create(\"mse\")\n", " \n", " results = {}\n", " for e in range(num_epoch):\n", diff --git a/example/svrg_module/linear_regression/common.py b/example/svrg_module/linear_regression/common.py index 14a144f40ce2..edf4f729f3e6 100644 --- a/example/svrg_module/linear_regression/common.py +++ b/example/svrg_module/linear_regression/common.py @@ -39,7 +39,7 @@ def create_lin_reg_network(train_features, train_labels, feature_dim, batch_size def create_metrics(metrics): - metric = mx.metric.create(metrics) + metric = mx.gluon.metric.create(metrics) return metric diff --git a/example/vae-gan/vaegan_mxnet.py b/example/vae-gan/vaegan_mxnet.py index 38e7e2ecc92f..1881f383c18b 100644 --- a/example/vae-gan/vaegan_mxnet.py +++ b/example/vae-gan/vaegan_mxnet.py @@ -424,10 +424,10 @@ def kldivergence(label, pred): KLLoss = KLLoss / nElements return KLLoss - mG = mx.metric.CustomMetric(fentropy) - mD = mx.metric.CustomMetric(fentropy) - mE = mx.metric.CustomMetric(kldivergence) - mACC = mx.metric.CustomMetric(facc) + mG = mx.gluon.metric.CustomMetric(fentropy) + mD = mx.gluon.metric.CustomMetric(fentropy) + mE = mx.gluon.metric.CustomMetric(kldivergence) + mACC = mx.gluon.metric.CustomMetric(facc) print('Training...') stamp = 
datetime.now().strftime('%Y_%m_%d-%H_%M') diff --git a/tests/nightly/estimator/test_estimator_cnn.py b/tests/nightly/estimator/test_estimator_cnn.py index 0d113cdf4984..466c01019575 100644 --- a/tests/nightly/estimator/test_estimator_cnn.py +++ b/tests/nightly/estimator/test_estimator_cnn.py @@ -116,7 +116,7 @@ def test_estimator_cpu(): # Define estimator est = estimator.Estimator(net=net, loss=loss, - train_metrics=mx.metric.Accuracy(), + train_metrics=mx.gluon.metric.Accuracy(), trainer=trainer, context=context) # Call fit() @@ -140,7 +140,7 @@ def test_estimator_gpu(): train_data, test_data = load_data_mnist(batch_size, resize=224) loss = gluon.loss.SoftmaxCrossEntropyLoss() net.hybridize() - acc = mx.metric.Accuracy() + acc = mx.gluon.metric.Accuracy() trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001}) # Define estimator est = estimator.Estimator(net=net, diff --git a/tests/nightly/estimator/test_sentiment_rnn.py b/tests/nightly/estimator/test_sentiment_rnn.py index 367c69b88a0b..7d3561db3789 100644 --- a/tests/nightly/estimator/test_sentiment_rnn.py +++ b/tests/nightly/estimator/test_sentiment_rnn.py @@ -190,11 +190,11 @@ def run(net, train_dataloader, test_dataloader, num_epochs, ctx, lr): trainer = mx.gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': lr}) # Define loss and evaluation metrics loss = gluon.loss.SoftmaxCrossEntropyLoss() - metrics = mx.metric.CompositeEvalMetric() - acc = mx.metric.Accuracy() - nested_metrics = mx.metric.CompositeEvalMetric() - metrics.add([acc, mx.metric.Loss()]) - nested_metrics.add([metrics, mx.metric.Accuracy()]) + metrics = mx.gluon.metric.CompositeEvalMetric() + acc = mx.gluon.metric.Accuracy() + nested_metrics = mx.gluon.metric.CompositeEvalMetric() + metrics.add([acc, mx.gluon.metric.Loss()]) + nested_metrics.add([metrics, mx.gluon.metric.Accuracy()]) # Define estimator est = estimator.Estimator(net=net, loss=loss, train_metrics=nested_metrics, diff --git a/tests/nightly/test_optimizer.py b/tests/nightly/test_optimizer.py index 0a87368d991e..9c2fcb8a62cf 100644 --- a/tests/nightly/test_optimizer.py +++ b/tests/nightly/test_optimizer.py @@ -83,7 +83,7 @@ def test_lars(): num_epoch=num_epochs) # predict accuracy for lenet - acc = mx.metric.Accuracy() + acc = mx.gluon.metric.Accuracy() lenet_model.score(test_iter, acc) accuracy = acc.get()[1] assert accuracy > 0.98, "LeNet-5 training accuracy on MNIST was too low" diff --git a/tests/nightly/test_tlocal_racecondition.py b/tests/nightly/test_tlocal_racecondition.py index d43c45937c05..986e1f464bfb 100644 --- a/tests/nightly/test_tlocal_racecondition.py +++ b/tests/nightly/test_tlocal_racecondition.py @@ -91,7 +91,7 @@ def infer_type(self, in_type): def create_operator(self, ctx, shapes, dtypes): return MyCustom() -class MyMetric(mx.metric.EvalMetric): +class MyMetric(mx.gluon.metric.EvalMetric): def __init__(self): super(MyMetric, self).__init__("MyMetric") self.name = ['empty'] diff --git a/tools/caffe_converter/test_converter.py b/tools/caffe_converter/test_converter.py index 49f8bdb167c2..880de1be449f 100644 --- a/tools/caffe_converter/test_converter.py +++ b/tools/caffe_converter/test_converter.py @@ -40,7 +40,7 @@ def test_imagenet_model_performance(model_name, val_data, gpus, batch_size): meta_info = get_model_meta_info(model_name) [model_name, mean] = convert_caffe_model(model_name, meta_info) sym, arg_params, aux_params = mx.model.load_checkpoint(model_name, 0) - acc = [mx.metric.create('acc'), mx.metric.create('top_k_accuracy', top_k=5)] + acc = 
[mx.gluon.metric.create('acc'), mx.gluon.metric.create('top_k_accuracy', top_k=5)] if isinstance(mean, str): mean_args = {'mean_img':mean} else: From ec615a5463d4d3320a099b35e2e4ee8472bc5eea Mon Sep 17 00:00:00 2001 From: acphile Date: Wed, 29 Apr 2020 13:46:05 +0000 Subject: [PATCH 18/24] fix context difference --- python/mxnet/gluon/metric.py | 48 +++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/python/mxnet/gluon/metric.py b/python/mxnet/gluon/metric.py index 9840f5569b93..dc9d5c957444 100644 --- a/python/mxnet/gluon/metric.py +++ b/python/mxnet/gluon/metric.py @@ -420,10 +420,12 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred_label in zip(labels, preds): + pred_label = pred_label.as_np_ndarray() + label = label.as_np_ndarray().as_in_ctx(pred_label.ctx) if pred_label.shape != label.shape: - pred_label = ndarray.argmax(pred_label, axis=self.axis) - pred_label = pred_label.as_np_ndarray().astype('int32') - label = label.as_np_ndarray().astype('int32') + pred_label = pred_label.argmax(axis=self.axis) + pred_label = pred_label.astype('int32') + label = label.astype('int32') # flatten before checking shapes to avoid shape miss match label = label.reshape(-1) pred_label = pred_label.reshape(-1) @@ -501,7 +503,7 @@ def update(self, labels, preds): # much faster, which is important since that computation is # single-threaded due to Python GIL. pred_label = numpy.argpartition(pred_label.as_np_ndarray().astype('float32'), -self.top_k) - label = label.as_np_ndarray().astype('int32') + label = label.as_np_ndarray().astype('int32').as_in_ctx(pred_label.ctx) check_label_shapes(label, pred_label) num_samples = pred_label.shape[0] num_dims = len(pred_label.shape) @@ -570,13 +572,13 @@ def __init__(self, class_type="binary", threshold=0.5, beta=1): self.beta = beta self.reset_stats() - def _set(self, num): + def _set(self, num, ctx): if self.num_classes is None: self.num_classes = num - self.true_positives = numpy.zeros(num, dtype='float64') - self.false_negatives = numpy.zeros(num, dtype='float64') - self.false_positives = numpy.zeros(num, dtype='float64') - self.true_negatives = numpy.zeros(num, dtype='float64') + self.true_positives = numpy.zeros(num, dtype='float64').as_in_ctx(ctx) + self.false_negatives = numpy.zeros(num, dtype='float64').as_in_ctx(ctx) + self.false_positives = numpy.zeros(num, dtype='float64').as_in_ctx(ctx) + self.true_negatives = numpy.zeros(num, dtype='float64').as_in_ctx(ctx) else: assert self.num_classes == num, \ "Input number of classes has changed from {} to {}".format(self.num_classes, num) @@ -593,9 +595,9 @@ def update_stats(self, label, pred): Predicted values. """ pred = pred.as_np_ndarray() - label = label.as_np_ndarray().astype('int32') + label = label.as_np_ndarray().astype('int32').as_in_ctx(pred.ctx) if self.class_type == "binary": - self._set(1) + self._set(1, pred.ctx) if label.max() > 1: raise ValueError("Wrong label for binary classification.") if pred.shape == label.shape: @@ -609,14 +611,14 @@ def update_stats(self, label, pred): elif self.class_type == "multiclass": num = pred.shape[-1] - self._set(num) + self._set(num, pred.ctx) assert label.max() < num, "pred contains fewer classes than label!" 
pred_label = one_hot(pred.argmax(axis=-1).reshape(-1), num) label = one_hot(label.reshape(-1), num) elif self.class_type == "multilabel": num = pred.shape[-1] - self._set(num) + self._set(num, pred.ctx) assert pred.shape == label.shape, \ "The shape of label should be same as that of prediction for multilabel classification." pred_label = predict_with_threshold(pred, self.threshold).reshape(-1, num) @@ -919,7 +921,7 @@ def update(self, labels, preds): pred_label = predict_with_threshold(pred_label, self.threshold) pred_label = pred_label.as_np_ndarray().astype('int32') - label = label.as_np_ndarray().astype('int32') + label = label.as_np_ndarray().astype('int32').as_in_ctx(pred_label.ctx) # flatten before checking shapes to avoid shape miss match label = label.reshape(-1) pred_label = pred_label.reshape(-1) @@ -1078,7 +1080,7 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred in zip(labels, preds): - label = label.as_np_ndarray() + label = label.as_np_ndarray().as_in_ctx(pred.ctx) pred = pred.as_np_ndarray() num_inst = label.shape[0] @@ -1137,7 +1139,7 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred in zip(labels, preds): - label = label.as_np_ndarray() + label = label.as_np_ndarray().as_in_ctx(pred.ctx) pred = pred.as_np_ndarray() num_inst = label.shape[0] @@ -1241,7 +1243,7 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred in zip(labels, preds): - label = label.as_np_ndarray() + label = label.as_np_ndarray().as_in_ctx(pred.ctx) pred = pred.as_np_ndarray() label = label.reshape(label.shape[0], -1) @@ -1308,7 +1310,7 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred in zip(labels, preds): - label = label.as_np_ndarray() + label = label.as_np_ndarray().as_in_ctx(pred.ctx) pred = pred.as_np_ndarray() if len(label.shape) == 1: @@ -1393,7 +1395,7 @@ def update(self, labels, preds): for label, pred in zip(labels, preds): assert label.size == pred.size/pred.shape[-1], \ "shape mismatch: %s vs. 
%s"%(label.shape, pred.shape) - label = label.as_in_context(pred.context).reshape((label.size,)) + label = label.as_in_context(pred.ctx).reshape((label.size,)) pred = ndarray.pick(pred, label.astype(dtype='int32'), axis=self.axis) label = label.as_np_ndarray() pred = pred.as_np_ndarray() @@ -1533,7 +1535,7 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred in zip(labels, preds): - label = label.as_np_ndarray() + label = label.as_np_ndarray().as_in_ctx(pred.ctx) pred = pred.as_np_ndarray() label = label.reshape(-1) @@ -1620,7 +1622,7 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred in zip(labels, preds): check_label_shapes(label, pred, False, True) - label = label.as_np_ndarray().reshape(-1).astype(numpy.float64) + label = label.as_np_ndarray().as_in_ctx(pred.ctx).reshape(-1).astype(numpy.float64) pred = pred.as_np_ndarray().reshape(-1).astype(numpy.float64) self.num_inst += 1 @@ -1731,7 +1733,7 @@ def update(self, labels, preds): # update the confusion matrix for label, pred in zip(labels, preds): - label = label.astype('int32', copy=False).as_np_ndarray() + label = label.astype('int32', copy=False).as_np_ndarray().as_in_ctx(pred.ctx) pred = pred.as_np_ndarray() if pred.shape != label.shape: pred = pred.argmax(axis=1).astype(label, copy=False) @@ -1870,7 +1872,7 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for pred, label in zip(preds, labels): - label = label.as_np_ndarray() + label = label.as_np_ndarray().as_in_ctx(pred.ctx) pred = pred.as_np_ndarray() reval = self._feval(label, pred) From c4a3b67635a1fe3ea90569a7c70a95958b82f22e Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Thu, 30 Apr 2020 05:48:17 +0000 Subject: [PATCH 19/24] Disable -DUSE_TVM_OP on GPU builds --- ci/docker/runtime_functions.sh | 49 ++------------------------------- ci/jenkins/Jenkins_steps.groovy | 27 ------------------ ci/jenkins/Jenkinsfile_unix_gpu | 2 -- 3 files changed, 2 insertions(+), 76 deletions(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 12b50133f22b..3808ba0da76f 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -723,7 +723,6 @@ build_ubuntu_gpu_mkldnn() { CC=gcc-7 CXX=g++-7 cmake \ -DCMAKE_BUILD_TYPE="RelWithDebInfo" \ -DUSE_MKL_IF_AVAILABLE=OFF \ - -DUSE_TVM_OP=ON \ -DUSE_CUDA=ON \ -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ -DUSE_CPP_PACKAGE=ON \ @@ -737,7 +736,6 @@ build_ubuntu_gpu_mkldnn_nocudnn() { CC=gcc-7 CXX=g++-7 cmake \ -DCMAKE_BUILD_TYPE="RelWithDebInfo" \ -DUSE_MKL_IF_AVAILABLE=OFF \ - -DUSE_TVM_OP=ON \ -DUSE_CUDA=ON \ -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ -DUSE_CUDNN=OFF \ @@ -752,7 +750,6 @@ build_ubuntu_gpu_cuda101_cudnn7() { CC=gcc-7 CXX=g++-7 cmake \ -DCMAKE_BUILD_TYPE="RelWithDebInfo" \ -DUSE_MKL_IF_AVAILABLE=OFF \ - -DUSE_TVM_OP=ON \ -DUSE_CUDA=ON \ -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ -DUSE_CUDNN=ON \ @@ -775,7 +772,6 @@ build_ubuntu_gpu_cuda101_cudnn7_make() { USE_CUDA=1 \ USE_CUDA_PATH=/usr/local/cuda \ USE_CUDNN=1 \ - USE_TVM_OP=1 \ USE_CPP_PACKAGE=1 \ USE_DIST_KVSTORE=1 \ CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \ @@ -795,7 +791,6 @@ build_ubuntu_gpu_cuda101_cudnn7_mkldnn_cpp_test() { USE_CUDA=1 \ USE_CUDA_PATH=/usr/local/cuda \ USE_CUDNN=1 \ - USE_TVM_OP=0 \ USE_CPP_PACKAGE=1 \ USE_DIST_KVSTORE=1 \ CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \ @@ -805,23 +800,6 @@ build_ubuntu_gpu_cuda101_cudnn7_mkldnn_cpp_test() { make cython 
PYTHON=python3 } -build_ubuntu_gpu_cuda101_cudnn7_no_tvm_op() { - set -ex - cd /work/build - CC=gcc-7 CXX=g++-7 cmake \ - -DCMAKE_BUILD_TYPE="RelWithDebInfo" \ - -DUSE_MKL_IF_AVAILABLE=OFF \ - -DUSE_TVM_OP=OFF \ - -DUSE_CUDA=ON \ - -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ - -DUSE_CUDNN=ON \ - -DUSE_MKLDNN=OFF \ - -DBUILD_CYTHON_MODULES=ON \ - -DUSE_DIST_KVSTORE=ON \ - -G Ninja /work/mxnet - ninja -} - build_ubuntu_amalgamation() { set -ex # Amalgamation can not be run with -j nproc @@ -852,7 +830,6 @@ build_ubuntu_gpu_cmake() { -DUSE_SIGNAL_HANDLER=ON \ -DUSE_CUDA=ON \ -DUSE_CUDNN=ON \ - -DUSE_TVM_OP=ON \ -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_MKLML_MKL=OFF \ -DUSE_MKLDNN=OFF \ @@ -873,7 +850,6 @@ build_ubuntu_gpu_cmake_no_rtc() { -DUSE_SIGNAL_HANDLER=ON \ -DUSE_CUDA=ON \ -DUSE_CUDNN=ON \ - -DUSE_TVM_OP=ON \ -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_MKLML_MKL=OFF \ -DUSE_MKLDNN=ON \ @@ -888,27 +864,6 @@ build_ubuntu_gpu_cmake_no_rtc() { ninja } -build_ubuntu_gpu_cmake_no_tvm_op() { - set -ex - cd /work/build - CC=gcc-7 CXX=g++-7 cmake \ - -DUSE_SIGNAL_HANDLER=ON \ - -DUSE_CUDA=ON \ - -DUSE_CUDNN=ON \ - -DUSE_TVM_OP=OFF \ - -DUSE_MKL_IF_AVAILABLE=OFF \ - -DUSE_MKLML_MKL=OFF \ - -DUSE_MKLDNN=OFF \ - -DUSE_DIST_KVSTORE=ON \ - -DCMAKE_BUILD_TYPE=Release \ - -DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \ - -DBUILD_CYTHON_MODULES=1 \ - -G Ninja \ - /work/mxnet - - ninja -} - build_ubuntu_cpu_large_tensor() { set -ex cd /work/build @@ -931,7 +886,6 @@ build_ubuntu_gpu_large_tensor() { -DUSE_SIGNAL_HANDLER=ON \ -DUSE_CUDA=ON \ -DUSE_CUDNN=ON \ - -DUSE_TVM_OP=ON \ -DUSE_MKL_IF_AVAILABLE=OFF \ -DUSE_MKLML_MKL=OFF \ -DUSE_MKLDNN=OFF \ @@ -989,7 +943,8 @@ cd_unittest_ubuntu() { # Adding these here as CI doesn't test all CUDA environments pytest example/image-classification/test_score.py - integrationtest_ubuntu_gpu_dist_kvstore + # TODO(szha): fix and reenable the hanging issue. 
tracked in #18098 + # integrationtest_ubuntu_gpu_dist_kvstore fi if [[ ${mxnet_variant} = *mkl ]]; then diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy index f129fe1299ab..747ddcf27ce0 100644 --- a/ci/jenkins/Jenkins_steps.groovy +++ b/ci/jenkins/Jenkins_steps.groovy @@ -289,20 +289,6 @@ def compile_unix_full_gpu_mkldnn_cpp_test() { }] } -def compile_unix_full_gpu_no_tvm_op() { - return ['GPU: CUDA10.1+cuDNN7 TVM_OP OFF': { - node(NODE_LINUX_CPU) { - ws('workspace/build-gpu-no-tvm-op') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run('ubuntu_build_cuda', 'build_ubuntu_gpu_cuda101_cudnn7_no_tvm_op', false) - utils.pack_lib('gpu_no_tvm_op', mx_lib_cpp_examples_no_tvm_op) - } - } - } - }] -} - def compile_unix_cmake_gpu() { return ['GPU: CMake': { node(NODE_LINUX_CPU) { @@ -317,19 +303,6 @@ def compile_unix_cmake_gpu() { }] } -def compile_unix_cmake_gpu_no_tvm_op() { - return ['GPU: CMake TVM_OP OFF': { - node(NODE_LINUX_CPU) { - ws('workspace/build-cmake-gpu-no-tvm-op') { - timeout(time: max_time, unit: 'MINUTES') { - utils.init_git() - utils.docker_run('ubuntu_gpu_cu101', 'build_ubuntu_gpu_cmake_no_tvm_op', false) - } - } - } - }] -} - def compile_unix_cmake_gpu_no_rtc() { return ['GPU: CMake CUDA RTC OFF': { node(NODE_LINUX_CPU) { diff --git a/ci/jenkins/Jenkinsfile_unix_gpu b/ci/jenkins/Jenkinsfile_unix_gpu index 7742a654faa1..384f9f2908cd 100644 --- a/ci/jenkins/Jenkinsfile_unix_gpu +++ b/ci/jenkins/Jenkinsfile_unix_gpu @@ -41,8 +41,6 @@ core_logic: { custom_steps.compile_unix_cmake_gpu(), custom_steps.compile_unix_tensorrt_gpu(), custom_steps.compile_unix_int64_gpu(), - custom_steps.compile_unix_full_gpu_no_tvm_op(), - custom_steps.compile_unix_cmake_gpu_no_tvm_op(), custom_steps.compile_unix_cmake_gpu_no_rtc(), custom_steps.compile_unix_full_gpu_mkldnn_cpp_test() ]) From 0456416a05a0f9bf8e65cf64c6315f75ee503522 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Thu, 30 Apr 2020 05:52:10 +0000 Subject: [PATCH 20/24] Fix disable tvm op for gpu runs --- ci/docker/runtime_functions.sh | 3 +-- ci/jenkins/Jenkins_steps.groovy | 16 ---------------- ci/jenkins/Jenkinsfile_unix_gpu | 1 - 3 files changed, 1 insertion(+), 19 deletions(-) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index 3808ba0da76f..9856346dc460 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -943,8 +943,7 @@ cd_unittest_ubuntu() { # Adding these here as CI doesn't test all CUDA environments pytest example/image-classification/test_score.py - # TODO(szha): fix and reenable the hanging issue. 
tracked in #18098 - # integrationtest_ubuntu_gpu_dist_kvstore + integrationtest_ubuntu_gpu_dist_kvstore fi if [[ ${mxnet_variant} = *mkl ]]; then diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy index 747ddcf27ce0..59ad73d58f0f 100644 --- a/ci/jenkins/Jenkins_steps.groovy +++ b/ci/jenkins/Jenkins_steps.groovy @@ -817,22 +817,6 @@ def test_unix_python3_gpu() { }] } -def test_unix_python3_gpu_no_tvm_op() { - return ['Python3: GPU TVM_OP OFF': { - node(NODE_LINUX_GPU) { - ws('workspace/ut-python3-gpu-no-tvm-op') { - try { - utils.unpack_and_init('gpu_no_tvm_op', mx_lib_cpp_examples_no_tvm_op) - python3_gpu_ut_cython('ubuntu_gpu_cu101') - utils.publish_test_coverage() - } finally { - utils.collect_test_results_unix('tests_gpu.xml', 'tests_python3_gpu.xml') - } - } - } - }] -} - def test_unix_python3_quantize_gpu() { return ['Python3: Quantize GPU': { node(NODE_LINUX_GPU_P3) { diff --git a/ci/jenkins/Jenkinsfile_unix_gpu b/ci/jenkins/Jenkinsfile_unix_gpu index 384f9f2908cd..0e2310fc9220 100644 --- a/ci/jenkins/Jenkinsfile_unix_gpu +++ b/ci/jenkins/Jenkinsfile_unix_gpu @@ -59,7 +59,6 @@ core_logic: { custom_steps.test_unix_scala_gpu(), // TODO(szha): fix and reenable the hanging issue. tracked in #18098 // custom_steps.test_unix_distributed_kvstore_gpu(), - custom_steps.test_unix_python3_gpu_no_tvm_op(), custom_steps.test_unix_capi_cpp_package(), ]) } From 8163fbbe8881c0584c8f4f3aeba8b87e3e3006be Mon Sep 17 00:00:00 2001 From: acphile Date: Thu, 7 May 2020 10:37:03 +0000 Subject: [PATCH 21/24] use label.ctx in metric.py; remove gluoncv dependency in test_cvnets --- python/mxnet/gluon/metric.py | 71 +++++++++++++++------------- tests/python/tensorrt/test_cvnets.py | 13 ++--- 2 files changed, 45 insertions(+), 39 deletions(-) diff --git a/python/mxnet/gluon/metric.py b/python/mxnet/gluon/metric.py index dc9d5c957444..8503d80e92ad 100644 --- a/python/mxnet/gluon/metric.py +++ b/python/mxnet/gluon/metric.py @@ -420,8 +420,8 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred_label in zip(labels, preds): - pred_label = pred_label.as_np_ndarray() - label = label.as_np_ndarray().as_in_ctx(pred_label.ctx) + pred_label = pred_label.as_np_ndarray().as_in_ctx(label.ctx) + label = label.as_np_ndarray() if pred_label.shape != label.shape: pred_label = pred_label.argmax(axis=self.axis) pred_label = pred_label.astype('int32') @@ -502,8 +502,8 @@ def update(self, labels, preds): # we do not care about the order of top k elements. It is # much faster, which is important since that computation is # single-threaded due to Python GIL. - pred_label = numpy.argpartition(pred_label.as_np_ndarray().astype('float32'), -self.top_k) - label = label.as_np_ndarray().astype('int32').as_in_ctx(pred_label.ctx) + pred_label = numpy.argpartition(pred_label.as_np_ndarray().astype('float32'), -self.top_k).as_in_ctx(label.ctx) + label = label.as_np_ndarray().astype('int32') check_label_shapes(label, pred_label) num_samples = pred_label.shape[0] num_dims = len(pred_label.shape) @@ -594,10 +594,10 @@ def update_stats(self, label, pred): pred : `NDArray` Predicted values. 
""" - pred = pred.as_np_ndarray() - label = label.as_np_ndarray().astype('int32').as_in_ctx(pred.ctx) + pred = pred.as_np_ndarray().as_in_ctx(label.ctx) + label = label.as_np_ndarray().astype('int32') if self.class_type == "binary": - self._set(1, pred.ctx) + self._set(1, label.ctx) if label.max() > 1: raise ValueError("Wrong label for binary classification.") if pred.shape == label.shape: @@ -611,14 +611,14 @@ def update_stats(self, label, pred): elif self.class_type == "multiclass": num = pred.shape[-1] - self._set(num, pred.ctx) + self._set(num, label.ctx) assert label.max() < num, "pred contains fewer classes than label!" pred_label = one_hot(pred.argmax(axis=-1).reshape(-1), num) label = one_hot(label.reshape(-1), num) elif self.class_type == "multilabel": num = pred.shape[-1] - self._set(num, pred.ctx) + self._set(num, label.ctx) assert pred.shape == label.shape, \ "The shape of label should be same as that of prediction for multilabel classification." pred_label = predict_with_threshold(pred, self.threshold).reshape(-1, num) @@ -920,8 +920,8 @@ def update(self, labels, preds): for label, pred_label in zip(labels, preds): pred_label = predict_with_threshold(pred_label, self.threshold) - pred_label = pred_label.as_np_ndarray().astype('int32') - label = label.as_np_ndarray().astype('int32').as_in_ctx(pred_label.ctx) + pred_label = pred_label.as_np_ndarray().astype('int32').as_in_ctx(label.ctx) + label = label.as_np_ndarray().astype('int32') # flatten before checking shapes to avoid shape miss match label = label.reshape(-1) pred_label = pred_label.reshape(-1) @@ -1080,8 +1080,8 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred in zip(labels, preds): - label = label.as_np_ndarray().as_in_ctx(pred.ctx) - pred = pred.as_np_ndarray() + label = label.as_np_ndarray() + pred = pred.as_np_ndarray().as_in_ctx(label.ctx) num_inst = label.shape[0] mae = numpy.abs(label - pred).reshape(num_inst, -1).mean(axis=-1).sum() @@ -1139,8 +1139,8 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred in zip(labels, preds): - label = label.as_np_ndarray().as_in_ctx(pred.ctx) - pred = pred.as_np_ndarray() + label = label.as_np_ndarray() + pred = pred.as_np_ndarray().as_in_ctx(label.ctx) num_inst = label.shape[0] mse = ((label - pred)**2.0).reshape(num_inst, -1).mean(axis=-1).sum() @@ -1243,8 +1243,8 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred in zip(labels, preds): - label = label.as_np_ndarray().as_in_ctx(pred.ctx) - pred = pred.as_np_ndarray() + label = label.as_np_ndarray() + pred = pred.as_np_ndarray().as_in_ctx(label.ctx) label = label.reshape(label.shape[0], -1) pred = pred.reshape(pred.shape[0], -1) @@ -1310,8 +1310,8 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred in zip(labels, preds): - label = label.as_np_ndarray().as_in_ctx(pred.ctx) - pred = pred.as_np_ndarray() + label = label.as_np_ndarray() + pred = pred.as_np_ndarray().as_in_ctx(label.ctx) if len(label.shape) == 1: label = label.reshape(1, label.shape[0]) @@ -1345,6 +1345,8 @@ class :math:`k`. Parameters ---------- + eps : float, default 1e-12 + Use small constant for the case that predicted value is 0. ignore_label : int or None, default None Index of invalid label to ignore when counting. By default, sets to -1. @@ -1370,12 +1372,13 @@ class :math:`k`. 
>>> print ce.get() ('cross-entropy', 0.57159948348999023) """ - def __init__(self, ignore_label=None, axis=-1, name='cross-entropy', + def __init__(self, eps=1e-12, ignore_label=None, axis=-1, name='cross-entropy', output_names=None, label_names=None): super(CrossEntropy, self).__init__( name, output_names=output_names, label_names=label_names) self.ignore_label = ignore_label self.axis = axis + self.eps = eps def update(self, labels, preds): """Updates the internal evaluation result. @@ -1395,15 +1398,15 @@ def update(self, labels, preds): for label, pred in zip(labels, preds): assert label.size == pred.size/pred.shape[-1], \ "shape mismatch: %s vs. %s"%(label.shape, pred.shape) - label = label.as_in_context(pred.ctx).reshape((label.size,)) - pred = ndarray.pick(pred, label.astype(dtype='int32'), axis=self.axis) + label = label.reshape((label.size,)) + pred = ndarray.pick(pred.as_in_context(label.ctx), label.astype(dtype='int32'), axis=self.axis) label = label.as_np_ndarray() pred = pred.as_np_ndarray() if self.ignore_label is not None: ignore = (label == self.ignore_label).astype(pred.dtype) num -= ignore.sum() pred = pred * (1 - ignore) + ignore - loss -= numpy.log(numpy.maximum(1e-12, pred)).sum() + loss -= numpy.log(numpy.maximum(self.eps, pred)).sum() num += pred.size self.sum_metric += loss self.num_inst += num @@ -1438,6 +1441,8 @@ class Perplexity(CrossEntropy): Parameters ---------- + eps : float, default 1e-12 + Use small constant for the case that predicted value is 0. ignore_label : int or None, default None Index of invalid label to ignore when counting. By default, sets to -1. @@ -1463,10 +1468,10 @@ class Perplexity(CrossEntropy): >>> print perp.get() ('Perplexity', 1.7710976285155853) """ - def __init__(self, ignore_label=None, axis=-1, name='perplexity', + def __init__(self, eps=1e-12, ignore_label=None, axis=-1, name='perplexity', output_names=None, label_names=None): super(Perplexity, self).__init__( - name=name, ignore_label=ignore_label, axis=axis, + name=name, eps=eps, ignore_label=ignore_label, axis=axis, output_names=output_names, label_names=label_names) def get(self): @@ -1535,8 +1540,8 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred in zip(labels, preds): - label = label.as_np_ndarray().as_in_ctx(pred.ctx) - pred = pred.as_np_ndarray() + label = label.as_np_ndarray() + pred = pred.as_np_ndarray().as_in_ctx(label.ctx) label = label.reshape(-1) num_examples = pred.shape[0] @@ -1622,8 +1627,8 @@ def update(self, labels, preds): labels, preds = check_label_shapes(labels, preds, True) for label, pred in zip(labels, preds): check_label_shapes(label, pred, False, True) - label = label.as_np_ndarray().as_in_ctx(pred.ctx).reshape(-1).astype(numpy.float64) - pred = pred.as_np_ndarray().reshape(-1).astype(numpy.float64) + label = label.as_np_ndarray().reshape(-1).astype(numpy.float64) + pred = pred.as_np_ndarray().as_in_ctx(label.ctx).reshape(-1).astype(numpy.float64) self.num_inst += 1 self._label_nums, self._mean_l, self._sse_l = \ @@ -1733,8 +1738,8 @@ def update(self, labels, preds): # update the confusion matrix for label, pred in zip(labels, preds): - label = label.astype('int32', copy=False).as_np_ndarray().as_in_ctx(pred.ctx) - pred = pred.as_np_ndarray() + label = label.astype('int32', copy=False).as_np_ndarray() + pred = pred.as_np_ndarray().as_in_ctx(label.ctx) if pred.shape != label.shape: pred = pred.argmax(axis=1).astype(label, copy=False) else: @@ -1872,8 +1877,8 @@ def update(self, labels, preds): 
labels, preds = check_label_shapes(labels, preds, True) for pred, label in zip(preds, labels): - label = label.as_np_ndarray().as_in_ctx(pred.ctx) - pred = pred.as_np_ndarray() + label = label.as_np_ndarray() + pred = pred.as_np_ndarray().as_in_ctx(label.ctx) reval = self._feval(label, pred) if isinstance(reval, tuple): diff --git a/tests/python/tensorrt/test_cvnets.py b/tests/python/tensorrt/test_cvnets.py index 99312d76dc7a..56cda90a80b1 100644 --- a/tests/python/tensorrt/test_cvnets.py +++ b/tests/python/tensorrt/test_cvnets.py @@ -16,7 +16,6 @@ # under the License. import gc -import gluoncv import mxnet as mx import numpy as np @@ -29,7 +28,12 @@ def get_classif_model(model_name, use_tensorrt, ctx=mx.gpu(0), batch_size=128): mx.contrib.tensorrt.set_use_fp16(False) h, w = 32, 32 - net = gluoncv.model_zoo.get_model(model_name, pretrained=True) + model_url = "https://raw.githubusercontent.com/dmlc/web-data/master/gluoncv/models/" + param_file = "{}-0000.params".format(model_name) + symbol_file = "{}-symbol.json".format(model_name) + mx.test_utils.download("{}/{}".format(model_url, param_file), fname=param_file, overwrite=True) + mx.test_utils.download("{}/{}".format(model_url, symbol_file), fname=symbol_file, overwrite=True) + net = gluon.SymbolBlock.imports(symbol_file, ['data'], param_file) net.hybridize() net.forward(mx.nd.zeros((batch_size, 3, h, w))) net.export(model_name) @@ -130,10 +134,7 @@ def test_tensorrt_on_cifar_resnets(batch_size=32, tolerance=0.1, num_workers=1): 'cifar_resnet20_v2', 'cifar_resnet56_v2', 'cifar_resnet110_v2', - 'cifar_wideresnet16_10', - 'cifar_wideresnet28_10', - 'cifar_wideresnet40_8', - 'cifar_resnext29_16x64d' + 'cifar_wideresnet16_10' ] num_models = len(models) From d53e6ef32217c1bb798882db9134e39d7318ba5e Mon Sep 17 00:00:00 2001 From: acphile Date: Thu, 7 May 2020 12:25:04 +0000 Subject: [PATCH 22/24] fix sanity --- python/mxnet/gluon/metric.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/mxnet/gluon/metric.py b/python/mxnet/gluon/metric.py index 8503d80e92ad..5b081ceac4d8 100644 --- a/python/mxnet/gluon/metric.py +++ b/python/mxnet/gluon/metric.py @@ -502,7 +502,8 @@ def update(self, labels, preds): # we do not care about the order of top k elements. It is # much faster, which is important since that computation is # single-threaded due to Python GIL. - pred_label = numpy.argpartition(pred_label.as_np_ndarray().astype('float32'), -self.top_k).as_in_ctx(label.ctx) + pred_label = pred_label.as_np_ndarray().as_in_ctx(label.ctx).astype('float32') + pred_label = numpy.argpartition(pred_label, -self.top_k) label = label.as_np_ndarray().astype('int32') check_label_shapes(label, pred_label) num_samples = pred_label.shape[0] From a2b0ffe13f8021b5c958206018db270aaf3e528e Mon Sep 17 00:00:00 2001 From: acphile Date: Fri, 8 May 2020 03:18:47 +0000 Subject: [PATCH 23/24] fix importError --- python/mxnet/gluon/contrib/data/vision/dataloader.py | 4 ++-- .../mxnet/gluon/contrib/data/vision/transforms/bbox/bbox.py | 2 +- python/mxnet/gluon/data/dataloader.py | 2 +- tests/python/tensorrt/test_cvnets.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/mxnet/gluon/contrib/data/vision/dataloader.py b/python/mxnet/gluon/contrib/data/vision/dataloader.py index 0c71d90453d8..3213398b2214 100644 --- a/python/mxnet/gluon/contrib/data/vision/dataloader.py +++ b/python/mxnet/gluon/contrib/data/vision/dataloader.py @@ -21,9 +21,9 @@ import logging import numpy as np -from ..... import nd +from ..... 
import ndarray as nd from .....util import is_np_array -from ..... import np as _mx_np # pylint: disable=reimported +from ..... import numpy as _mx_np # pylint: disable=reimported from ....nn import HybridSequential, Sequential, HybridBlock, Block from ....data.vision import transforms from ....data import DataLoader diff --git a/python/mxnet/gluon/contrib/data/vision/transforms/bbox/bbox.py b/python/mxnet/gluon/contrib/data/vision/transforms/bbox/bbox.py index 1629c212957f..65a18aaf80cd 100644 --- a/python/mxnet/gluon/contrib/data/vision/transforms/bbox/bbox.py +++ b/python/mxnet/gluon/contrib/data/vision/transforms/bbox/bbox.py @@ -23,7 +23,7 @@ from .......base import numeric_types from ......block import Block from .......util import is_np_array -from ....... import nd, npx, np +from ....... import ndarray as nd, numpy_extension as npx, numpy as np from .utils import _check_bbox_shape, bbox_crop, bbox_translate from .utils import bbox_resize, bbox_random_crop_with_constraints diff --git a/python/mxnet/gluon/data/dataloader.py b/python/mxnet/gluon/data/dataloader.py index d991bc769ac9..c51981678367 100644 --- a/python/mxnet/gluon/data/dataloader.py +++ b/python/mxnet/gluon/data/dataloader.py @@ -39,7 +39,7 @@ from . import sampler as _sampler from . import batchify as _batchify -from ... import nd, context +from ... import ndarray as nd, context from ...util import is_np_shape, is_np_array, set_np from ... import numpy as _mx_np # pylint: disable=reimported diff --git a/tests/python/tensorrt/test_cvnets.py b/tests/python/tensorrt/test_cvnets.py index 56cda90a80b1..cd090c5e2f5c 100644 --- a/tests/python/tensorrt/test_cvnets.py +++ b/tests/python/tensorrt/test_cvnets.py @@ -28,7 +28,7 @@ def get_classif_model(model_name, use_tensorrt, ctx=mx.gpu(0), batch_size=128): mx.contrib.tensorrt.set_use_fp16(False) h, w = 32, 32 - model_url = "https://raw.githubusercontent.com/dmlc/web-data/master/gluoncv/models/" + model_url = "https://raw.githubusercontent.com/dmlc/web-data/221ce5b7c6d5b0777a1e3471f7f03ff98da90a0a/gluoncv/models" param_file = "{}-0000.params".format(model_name) symbol_file = "{}-symbol.json".format(model_name) mx.test_utils.download("{}/{}".format(model_url, param_file), fname=param_file, overwrite=True) From ef3058adba8353fd8fa81b39264b543d64e0dac7 Mon Sep 17 00:00:00 2001 From: acphile Date: Sat, 9 May 2020 14:59:04 +0800 Subject: [PATCH 24/24] remove nose --- tests/python/unittest/test_metric.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/python/unittest/test_metric.py b/tests/python/unittest/test_metric.py index af81251fa11b..c2e4783de411 100644 --- a/tests/python/unittest/test_metric.py +++ b/tests/python/unittest/test_metric.py @@ -384,6 +384,3 @@ def test_single_array_input(): _, rmse_res = rmse.get() np.testing.assert_almost_equal(rmse_res, 0.1) -if __name__ == '__main__': - import nose - nose.runmodule()
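
For reviewers trying the series end to end, here is a minimal usage sketch of the relocated metric API. It assumes a build that already includes the patches above (metrics moved from mx.metric to mx.gluon.metric); the input arrays and the printed result are illustrative only and are not taken from the test suite.

# Minimal sketch, assuming an MXNet build with the mx.gluon.metric patches applied.
# Factory strings and classes below are the ones exercised in the diffs above.
import mxnet as mx

labels = [mx.nd.array([0, 1, 1])]            # ground-truth class ids (made-up data)
preds  = [mx.nd.array([[0.9, 0.1],           # per-class prediction scores
                       [0.2, 0.8],
                       [0.4, 0.6]])]

acc  = mx.gluon.metric.create('acc')                         # same factory names as before the move
top2 = mx.gluon.metric.create('top_k_accuracy', top_k=2)

composite = mx.gluon.metric.CompositeEvalMetric()
composite.add([acc, top2])
composite.update(labels, preds)
print(composite.get())                        # (metric names, metric values)

Because of the context handling added in patches 18 and 21, update() also accepts labels and predictions that live on different devices, for example labels kept on CPU while predictions come back from a GPU forward pass.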