diff --git a/src/target/llvm/intrin_rule_llvm.cc b/src/target/llvm/intrin_rule_llvm.cc
index fbf67d080634..6c5a9cd079af 100644
--- a/src/target/llvm/intrin_rule_llvm.cc
+++ b/src/target/llvm/intrin_rule_llvm.cc
@@ -93,31 +93,15 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.llvm.popcount")
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.llvm.tan")
 .set_body([](const TVMArgs& targs, TVMRetValue* rv) {
-  using tir::make_const;
   PrimExpr e = targs[0];
   const tir::CallNode* call = e.as<tir::CallNode>();
   CHECK(call != nullptr);
-  PrimExpr x = call->args[0];
-  PrimExpr y = x;
-  DataType dtype = x.dtype();
-  const char* opName = nullptr;
-
-  if (!dtype.is_float()) {
-    LOG(FATAL) << "tan expects floating input";
-  }
-
-  if (dtype.bits() == 64) {
-    opName = "tan";
-  } else if (dtype.bits() == 32) {
-    opName = "tanf";
-  } else if (dtype.bits() == 16) {
-    opName = "tanf";
-    y = cast(DataType::Float(32, dtype.lanes()), x);
-  } else {
-    LOG(FATAL) << "tan cannot handle float" << dtype.bits();
-  }
-
-  PrimExpr tan_x = tir::CallNode::make(x.dtype(), opName, {y}, tir::CallNode::Extern);
+  const PrimExpr& x = call->args[0];
+  PrimExpr sin_x = tir::CallNode::make(
+      x.dtype(), "sin", {x}, tir::CallNode::PureIntrinsic);
+  PrimExpr cos_x = tir::CallNode::make(
+      x.dtype(), "cos", {x}, tir::CallNode::PureIntrinsic);
+  PrimExpr tan_x = sin_x / cos_x;
   *rv = tan_x;
 });
 
diff --git a/src/target/source/intrin_rule_cuda.cc b/src/target/source/intrin_rule_cuda.cc
index fc62dae7c9f9..849e098d4ad2 100644
--- a/src/target/source/intrin_rule_cuda.cc
+++ b/src/target/source/intrin_rule_cuda.cc
@@ -54,6 +54,22 @@ struct CUDAFastMath : public CUDAMath {
   }
 };
 
+struct CUDAFastMathTan : public CUDAMath {
+  std::string operator()(DataType t, std::string name) const {
+    if (t.lanes() == 1 && t.is_float()) {
+      switch (t.bits()) {
+        case 64: return name;
+        // `__tanf` produces results that deviate too much from numpy's tan,
+        // so use plain `tanf` instead of the fast-math variant.
+        case 32: return name + 'f';
+        case 16: LOG(FATAL) << "cuda tan unsupported for float16";
+        default: return "";
+      }
+    }
+    return "";
+  }
+};
+
 struct CUDAPopcount {
   std::string operator()(DataType t, std::string name) const {
     if (t.lanes() == 1 && t.is_uint()) {
@@ -98,7 +114,7 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.log")
 .set_body(DispatchExtern<CUDAFastMath>);
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.tan")
-.set_body(DispatchExtern<CUDAFastMath>);
+.set_body(DispatchExtern<CUDAFastMathTan>);
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.cuda.cos")
 .set_body(DispatchExtern<CUDAFastMath>);
diff --git a/topi/tests/python/test_topi_math.py b/topi/tests/python/test_topi_math.py
index 51ce3a93fa18..3e58518ed4fe 100644
--- a/topi/tests/python/test_topi_math.py
+++ b/topi/tests/python/test_topi_math.py
@@ -127,58 +127,13 @@ def check_device(device):
     test_apply(topi.sqrt, "sqrt", np.sqrt, 0, 100)
     test_apply(topi.rsqrt, "rsqrt", lambda x: np.ones_like(x) / np.sqrt(x), 0, 100, skip_name_check=True)
     test_apply(topi.cos, "cos", np.cos, -2.0*np.pi, 2.0*np.pi)
-    test_apply(topi.tan, "tan", np.tan, -2.0*np.pi, 2.0*np.pi)
+    test_apply(topi.tan, "tan", np.tan, -2.0*np.pi, 2.0*np.pi, dtype='float32')
+    test_apply(topi.tan, "tan", np.tan, -2.0*np.pi, 2.0*np.pi, dtype='float64')
     test_apply(topi.sin, "sin", np.sin, -2.0*np.pi, 2.0*np.pi)
     test_apply(topi.erf, "erf", scipy.special.erf, -.1, .1, dtype="float32")
     test_isnan(-100, 100)
 
 
-def test_ewise_tan():
-    def test_apply(
-        func,
-        name,
-        f_numpy,
-        low,
-        high,
-        shape=(20, 3),
-        dtype='float32',
-        check_round=False,
-        skip_name_check=False,
-    ):
-        A = te.placeholder(dtype=dtype, name="A", shape=shape)
-
-        B = func(A)
-        assert tuple(B.shape) == tuple(A.shape)
-        if not skip_name_check:
-            assert B.op.body[0].name == name
-        a_np = np.random.uniform(low=low, high=high, size=shape).astype(A.dtype) * 10
-        # avoid round check too close to boundary
-        if check_round:
-            a_np += ((np.abs(np.fmod(a_np, 1)) - 0.5) < 1e-6) * 1e-5
-        b_np = f_numpy(a_np)
-
-        def check_device(device):
-            ctx = tvm.context(device, 0)
-            if not ctx.exist:
-                print("Skip because %s is not enabled" % device)
-                return
-            print("Running on target: %s" % device)
-            with tvm.target.create(device):
-                s = topi.testing.get_injective_schedule(device)(B)
-            foo = tvm.build(s, [A, B], device, name=name)
-            a = tvm.nd.array(a_np, ctx)
-            b = tvm.nd.array(np.zeros_like(b_np), ctx)
-            foo(a, b)
-            tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5)
-
-        for target in get_all_backend():
-            check_device(target)
-
-    test_apply(topi.tan, "tan", np.tan, -2.0*np.pi, 2.0*np.pi, dtype='float64')
-    test_apply(topi.tan, "tan", np.tan, -2.0*np.pi, 2.0*np.pi, dtype='float32')
-    test_apply(topi.tan, "tan", np.tan, -2.0*np.pi, 2.0*np.pi, dtype='float16')
-
-
 def test_cast():
     def verify(from_dtype, to_dtype, low=-100, high=100):
         shape = (5, 4)
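For reference, the change can also be exercised outside the test suite with a short standalone script. The snippet below is an illustrative sketch, not part of the patch: it reuses only the TVM/TOPI calls that already appear in test_topi_math.py above, and the target list, shape, input range, and tolerances are arbitrary choices.

# Illustrative sketch (not part of the patch): run topi.tan on the two targets
# touched by this change and compare against numpy. Shape, input range, and
# tolerances mirror the existing test; only APIs used in test_topi_math.py appear.
import numpy as np
import tvm
from tvm import te
import topi
import topi.testing


def run_tan(target, dtype="float32", shape=(20, 3)):
    ctx = tvm.context(target, 0)
    if not ctx.exist:
        print("Skip because %s is not enabled" % target)
        return
    A = te.placeholder(shape=shape, dtype=dtype, name="A")
    B = topi.tan(A)  # lowered as sin/cos on llvm, tan/tanf on cuda
    with tvm.target.create(target):
        s = topi.testing.get_injective_schedule(target)(B)
    func = tvm.build(s, [A, B], target, name="tan")
    a_np = np.random.uniform(-2.0 * np.pi, 2.0 * np.pi, size=shape).astype(dtype)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros_like(a_np), ctx)
    func(a, b)
    tvm.testing.assert_allclose(b.asnumpy(), np.tan(a_np), rtol=1e-5, atol=1e-5)


for target in ("llvm", "cuda"):
    run_tan(target)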