From 4e4dfd2bfb5a3e89529a54a8b020000a2caee3af Mon Sep 17 00:00:00 2001
From: Dick Carter
Date: Mon, 12 Oct 2020 12:19:11 -0700
Subject: [PATCH] [v1.8.x] Backport TRT test update #19296 (#19298)

* Bypass test_tensorrt.py:test_tensorrt_symbol_int8 on arch < 70

* Adapt test_tensorrt.py:test_tensorrt_symbol for A100

* Fix test_numpy_op.py:test_np_mixed_precision_binary_funcs with portion of (#18660)
---
 tests/python/tensorrt/test_tensorrt.py | 27 ++++++++++++++++++++------
 tests/python/unittest/test_numpy_op.py | 21 ++++++++++++++++++++
 2 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/tests/python/tensorrt/test_tensorrt.py b/tests/python/tensorrt/test_tensorrt.py
index c7e5f01018db..20b84d0ef7c6 100644
--- a/tests/python/tensorrt/test_tensorrt.py
+++ b/tests/python/tensorrt/test_tensorrt.py
@@ -16,17 +16,23 @@
 # under the License.
 
 import os
+import sys
 import ctypes
 import mxnet as mx
 from mxnet.base import SymbolHandle, check_call, _LIB, mx_uint, c_str_array, c_str, mx_real_t
 from mxnet.symbol import Symbol
 import numpy as np
 from mxnet.test_utils import assert_almost_equal
+from mxnet.numpy_extension import get_cuda_compute_capability
 from mxnet import gluon
 from mxnet.gluon import nn
 from mxnet import nd
 from mxnet.gluon.model_zoo import vision
 
+curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+sys.path.insert(0, os.path.join(curr_path, '../unittest'))
+from common import setup_module, with_seed, teardown
+
 ####################################
 ######### FP32/FP16 tests ##########
 ####################################
@@ -60,7 +66,7 @@ def get_baseline(input_data):
     return output
 
 
-def check_tensorrt_symbol(baseline, input_data, fp16_mode, tol):
+def check_tensorrt_symbol(baseline, input_data, fp16_mode, rtol=None, atol=None):
     sym, arg_params, aux_params = get_model(batch_shape=input_data.shape)
     trt_sym = sym.optimize_for('TensorRT', args=arg_params, aux=aux_params, ctx=mx.gpu(0),
                                precision='fp16' if fp16_mode else 'fp32')
@@ -69,17 +75,18 @@ def check_tensorrt_symbol(baseline, input_data, fp16_mode, tol):
                                    grad_req='null', force_rebind=True)
     output = executor.forward(is_train=False, data=input_data)
-    assert_almost_equal(output[0].asnumpy(), baseline[0].asnumpy(), atol=tol[0], rtol=tol[1])
+    assert_almost_equal(output[0], baseline[0], rtol=rtol, atol=atol)
 
+@with_seed()
 def test_tensorrt_symbol():
     batch_shape = (32, 3, 224, 224)
     input_data = mx.nd.random.uniform(shape=(batch_shape), ctx=mx.gpu(0))
     baseline = get_baseline(input_data)
     print("Testing resnet50 with TensorRT backend numerical accuracy...")
     print("FP32")
-    check_tensorrt_symbol(baseline, input_data, fp16_mode=False, tol=(1e-4, 1e-4))
+    check_tensorrt_symbol(baseline, input_data, fp16_mode=False)
     print("FP16")
-    check_tensorrt_symbol(baseline, input_data, fp16_mode=True, tol=(1e-1, 1e-2))
+    check_tensorrt_symbol(baseline, input_data, fp16_mode=True, rtol=1e-2, atol=1e-1)
 
 ##############################
 ######### INT8 tests ##########
 ##############################
@@ -135,17 +142,25 @@ def get_top1(logits):
 
 
 def test_tensorrt_symbol_int8():
+    ctx = mx.gpu(0)
+    cuda_arch = get_cuda_compute_capability(ctx)
+    cuda_arch_min = 70
+    if cuda_arch < cuda_arch_min:
+        print('Bypassing test_tensorrt_symbol_int8 on cuda arch {}, need arch >= {}).'.format(
+            cuda_arch, cuda_arch_min))
+        return
+
     # INT8 engine output are not lossless, so we don't expect numerical uniformity,
     # but we have to compare the TOP1 metric
     batch_shape=(1,3,224,224)
     sym, arg_params, aux_params = get_model(batch_shape=batch_shape)
 
     calibration_iters = 700
-    trt_sym = sym.optimize_for('TensorRT', args=arg_params, aux=aux_params, ctx=mx.gpu(0),
+    trt_sym = sym.optimize_for('TensorRT', args=arg_params, aux=aux_params, ctx=ctx,
                                precision='int8',
                                calibration_iters=calibration_iters)
 
-    executor = trt_sym.simple_bind(ctx=mx.gpu(), data=batch_shape,
+    executor = trt_sym.simple_bind(ctx=ctx, data=batch_shape,
                                    grad_req='null', force_rebind=True)
 
     dali_val_iter = get_dali_iter()
diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py
index 20da12b12f48..4bdaf5203ef1 100644
--- a/tests/python/unittest/test_numpy_op.py
+++ b/tests/python/unittest/test_numpy_op.py
@@ -2528,6 +2528,27 @@ def __init__(self, func):
             def hybrid_forward(self, F, a, b, *args, **kwargs):
                 return getattr(F.np, self._func)(a, b)
 
+        if (func in ['multiply', 'mod', 'equal', 'not_equal', 'greater',
+                     'greater_equal', 'less', 'less_equal']) and \
+            (lshape == () or rshape == ()) :
+            # the behaviors of infer type in dealing with the input shape of '()' are different between np and onp
+            # for example,
+            # mx_test_x1 = np.random.uniform(-2, 2, (2,3)).astype(np.float32)
+            # mx_test_x2 = np.random.uniform(-2, 2, ()).astype(np.float16)
+            # np_out = _np.mod(mx_test_x1.asnumpy(), mx_test_x2.asnumpy()) # float16
+            # mx_out = np.mod(mx_test_x1, mx_test_x2) # float32
+
+            # logcial ops: when two numbers are only different in precision, NumPy also has a weird behavior
+            # for example,
+            # a = np.array([[1.441]], dtype = np.float16)
+            # b = np.array(1.4413278, dtype = np.float32)
+            # c = np.array([1.4413278], dtype = np.float32)
+            # np.greater(a,b), np.greater(a,c) # True True
+            # _np.greater(a.asnumpy(),b.asnumpy()), _np.greater(a.asnumpy(),c.asnumpy()) # False True
+
+            # thus, skip the tests
+            return
+
         np_func = getattr(_np, func)
         mx_func = TestMixedBinary(func)
         np_test_x1 = _np.random.uniform(low, high, lshape).astype(ltype)