From d14242f7154159ee7d2d2a1d828931f3f24bec9f Mon Sep 17 00:00:00 2001
From: atamazov <artem.tamazov@gmail.com>
Date: Thu, 5 Oct 2023 01:17:02 +0300
Subject: [PATCH 1/8] rocblas-remove-int8x4-uses(01) Removed support of INT8x4
 type from the library (except miopen.h, some tests and driver)

---
 include/miopen/miopen.h                       | 11 ++--
 src/check_numerics.cpp                        |  2 +-
 src/convolution.cpp                           |  2 +-
 src/gemm_v2.cpp                               | 52 ++++++-------------
 src/hip/batched_transpose_sol.cpp             |  6 ++-
 src/include/miopen/datatype.hpp               | 12 ++---
 src/include/miopen/tensor.hpp                 |  4 +-
 src/include/miopen/visit_float.hpp            |  2 +-
 src/kernels/MIOpenIm2d2Col.cl                 |  6 ---
 src/kernels/MIOpenIm3d2Col.cl                 |  6 ---
 .../MIOpenSubTensorOpWithScalarKernel.cl      |  6 +--
 .../MIOpenSubTensorOpWithSubTensorKernel.cl   |  6 +--
 .../MIOpenSubTensorOpWithTransformKernel.cl   |  6 +--
 src/kernels/MIOpenUtilKernels4.cl             |  6 ---
 src/ocl/convolutionocl.cpp                    |  8 +--
 src/ocl/tensorocl.cpp                         | 42 ++++++---------
 src/ocl/utilocl.cpp                           | 33 +++---------
 src/pooling_api.cpp                           |  2 +-
 src/reducetensor.cpp                          |  7 ++-
 src/reducetensor_api.cpp                      |  2 +-
 src/solver/batchnorm/forward_inference_ck.cpp |  4 +-
 .../conv_ck_igemm_fwd_bias_activ_fused.cpp    |  8 +--
 src/solver/conv_direct_naive_conv.cpp         |  2 +-
 ...ip_implicit_gemm_3d_grouped_bwd_xdlops.cpp |  8 +--
 ...ip_implicit_gemm_3d_grouped_fwd_xdlops.cpp |  8 +--
 ...ip_implicit_gemm_3d_grouped_wrw_xdlops.cpp |  8 +--
 ...conv_hip_implicit_gemm_bwd_data_xdlops.cpp |  8 +--
 .../conv_hip_implicit_gemm_fwd_xdlops.cpp     |  8 +--
 ...v_hip_implicit_gemm_grouped_fwd_xdlops.cpp |  8 +--
 src/solver/gemm.cpp                           | 50 ++++++------------
 src/solver/mlir_common.cpp                    |  2 +-
 src/tensor.cpp                                |  2 +-
 test/conv_common.hpp                          | 33 +++++-------
 test/driver.hpp                               |  4 +-
 test/gpu_reference_kernel.cpp                 |  2 -
 test/gtest/conv_embed_db.cpp                  |  4 +-
 test/gtest/conv_hip_igemm_xdlops.cpp          |  4 +-
 test/gtest/conv_igemm_dynamic.cpp             |  4 +-
 test/gtest/conv_igemm_mlir.cpp                |  4 +-
 test/gtest/conv_igemm_mlir_xdlops.cpp         |  4 +-
 test/gtest/conv_trans.cpp                     |  4 +-
 test/gtest/db_sync.cpp                        |  2 +-
 test/tensor_holder.hpp                        |  7 +--
 test/tensor_transform.cpp                     |  2 +-
 44 files changed, 150 insertions(+), 261 deletions(-)

diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h
index a59181acf3..7a85ec5875 100644
--- a/include/miopen/miopen.h
+++ b/include/miopen/miopen.h
@@ -346,12 +346,11 @@ MIOPEN_DECLARE_OBJECT(miopenReduceTensorDescriptor);
  */
 typedef enum
 {
-    miopenHalf  = 0, /*!< 16-bit floating point (Fully supported) */
-    miopenFloat = 1, /*!< 32-bit floating point (Fully supported) */
-    miopenInt32 = 2, /*!< 32-bit int point (Partially supported) */
-    miopenInt8  = 3, /*!< 8-bit int point (Partially supported) */
-    miopenInt8x4 =
-        4, /*!< Pack of four 8-bit int points in NCHW_VECT_C format (Partially supported) */
+    miopenHalf     = 0, /*!< 16-bit floating point (Fully supported) */
+    miopenFloat    = 1, /*!< 32-bit floating point (Fully supported) */
+    miopenInt32    = 2, /*!< 32-bit int point (Partially supported) */
+    miopenInt8     = 3, /*!< 8-bit int point (Partially supported) */
+    miopenInt8x4   = 4, /*!< Pack of four Int8 in NCHW_VECT_C format (Support discontinued) */
     miopenBFloat16 = 5, /*!< 16-bit binary floating point (8-bit exponent, 7-bit fraction)
                            (Partially supported) */
     miopenDouble  = 6,  /*!< 64-bit floating point (Partially supported) */
diff --git a/src/check_numerics.cpp b/src/check_numerics.cpp
index b0882995e9..50cb0af4af 100644
--- a/src/check_numerics.cpp
+++ b/src/check_numerics.cpp
@@ -64,7 +64,7 @@ std::string GetKernelName(miopenDataType_t data_type)
     case miopenBFloat8: return {"check_numerics_bf8"};
     case miopenInt32:
     case miopenInt8:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenDouble:
     default: return {""};
     }
diff --git a/src/convolution.cpp b/src/convolution.cpp
index 403ff777cd..ac7c28fdc4 100644
--- a/src/convolution.cpp
+++ b/src/convolution.cpp
@@ -358,7 +358,7 @@ ConvolutionDescriptor::GetForwardOutputTensorWithLayout(const TensorDescriptor&
     std::vector<std::size_t> out_strides;
     tensor_layout_to_strides(
         out_lens, default_layout, yLayout, xDesc.GetVectorLength(), out_strides);
-    return {(xDesc.GetType() == miopenInt8 || xDesc.GetType() == miopenInt8x4
+    return {(xDesc.GetType() == miopenInt8
                  ? (yType)
                  : xDesc.GetType()), // TODO: This function overrides the output type with
                                      // essentially the input which is incorrect.
diff --git a/src/gemm_v2.cpp b/src/gemm_v2.cpp
index 804587aac0..35add906a1 100644
--- a/src/gemm_v2.cpp
+++ b/src/gemm_v2.cpp
@@ -62,10 +62,6 @@
 /// "disabled expansion of recursive macro" injected by rocblas headers.
 #define AVOID_ROCBLAS_WRAPPERS_204 (MIOPEN_ROCBLAS_VERSION_FLAT >= 2004000)
 
-/// Maintain API compatibility with various rocBLAS version
-#define USE_GEMM_FLAGS_PACK_INT8X4 \
-    ((MIOPEN_ROCBLAS_VERSION_FLAT >= 2038000) && (MIOPEN_ROCBLAS_VERSION_FLAT < 4000000))
-
 /// Maintain API compatibility for versions not supporting FP16 alternate implementations
 #define USE_GEMM_FLAGS_FP16_ALT_IMPL (MIOPEN_ROCBLAS_VERSION_FLAT >= 2043000)
 /// Some 2.42 versions have rocblas_gemm_flags_fp16_alt_impl, but
@@ -109,7 +105,7 @@ static inline rocblas_datatype rocBlasComputeType(const miopen::GemmDescriptor&
 {
     // Complex compute types are only supported in newer version of the API
     assert(desc.dataType == desc.a_cast_type && desc.dataType == desc.b_cast_type);
-    if(desc.dataType == miopenInt8 || desc.dataType == miopenInt8x4)
+    if(desc.dataType == miopenInt8)
         return rocblas_datatype::rocblas_datatype_i32_r;
     else
         return rocblas_datatype::rocblas_datatype_f32_r;
@@ -439,7 +435,6 @@ miopenStatus_t CallGemm(const Handle& handle,
 
         switch(gemm_desc.dataType)
         {
-        case miopenInt8x4:
         case miopenInt8: {
             assert(gemm_desc.k % 4 == 0);
 
@@ -471,12 +466,7 @@ miopenStatus_t CallGemm(const Handle& handle,
                 rocBlasComputeType(gemm_desc), // rocblas_datatype::rocblas_datatype_i32_r,
                 rocblas_gemm_algo::rocblas_gemm_algo_standard,
                 0,
-#if USE_GEMM_FLAGS_PACK_INT8X4
-                rocblas_gemm_flags_pack_int8x4
-#else
-                0
-#endif
-            );
+                0);
         }
         break;
         case miopenInt32: break;
@@ -620,9 +610,9 @@ miopenStatus_t CallGemm(const Handle& handle,
         };
         break;
 
+        case miopenInt8x4:
         case miopenDouble: {
-            MIOPEN_THROW(miopenStatusBadParm,
-                         "miopenDouble data type not supported by MIOpenGEMM.");
+            MIOPEN_THROW(miopenStatusBadParm, "Unknown or unsupported data type.");
         };
         break;
         }
@@ -692,7 +682,6 @@ miopenStatus_t CallGemmStridedBatched(const Handle& handle,
 
         switch(gemm_desc.dataType)
         {
-        case miopenInt8x4:
         case miopenInt8: {
             assert(gemm_desc.k % 4 == 0);
 
@@ -728,12 +717,7 @@ miopenStatus_t CallGemmStridedBatched(const Handle& handle,
                 rocblas_datatype::rocblas_datatype_i32_r,
                 rocblas_gemm_algo::rocblas_gemm_algo_standard,
                 0,
-#if USE_GEMM_FLAGS_PACK_INT8X4
-                rocblas_gemm_flags_pack_int8x4
-#else
-                0
-#endif
-            );
+                0);
         }
         break;
         case miopenInt32: break;
@@ -892,10 +876,10 @@ miopenStatus_t CallGemmStridedBatched(const Handle& handle,
             break;
         }
 
+        case miopenInt8x4:
         case miopenDouble: {
-            MIOPEN_THROW(miopenStatusBadParm,
-                         "miopenDouble data type not supported by MIOpenGEMM.");
-        }
+            MIOPEN_THROW(miopenStatusBadParm, "Unknown or unsupported data type.");
+        };
         break;
         }
 
@@ -967,7 +951,6 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle,
 
         switch(gemm_desc.dataType)
         {
-        case miopenInt8x4:
         case miopenInt8: {
             assert(gemm_desc.k % 4 == 0);
 
@@ -1001,12 +984,7 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle,
                     rocBlasComputeType(gemm_desc), // rocblas_datatype::rocblas_datatype_i32_r,
                     rocblas_gemm_algo::rocblas_gemm_algo_standard,
                     0,
-#if USE_GEMM_FLAGS_PACK_INT8X4
-                    rocblas_gemm_flags_pack_int8x4
-#else
-                    0
-#endif
-                );
+                    0);
             }
         }
         break;
@@ -1162,10 +1140,10 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle,
             break;
         }
 
+        case miopenInt8x4:
         case miopenDouble: {
-            MIOPEN_THROW(miopenStatusBadParm,
-                         "miopenDouble data type not supported by MIOpenGEMM.");
-        }
+            MIOPEN_THROW(miopenStatusBadParm, "Unknown or unsupported data type.");
+        };
         break;
         }
 
@@ -1195,7 +1173,7 @@ GemmDescriptor CreateGemmDescriptorConvFwd(const TensorDescriptor& wDesc,
 {
 #ifndef NDEBUG
     assert(wDesc.GetType() == xDesc.GetType());
-    if(wDesc.GetType() != miopenInt8 && wDesc.GetType() != miopenInt8x4)
+    if(wDesc.GetType() != miopenInt8)
         assert(wDesc.GetType() == yDesc.GetType());
 #endif
 
@@ -1350,7 +1328,7 @@ GemmDescriptor CreateGemmDescriptorConvCNHWFwd(const TensorDescriptor& wDesc,
 {
 #ifndef NDEBUG
     assert(wDesc.GetType() == xDesc.GetType());
-    if(wDesc.GetType() != miopenInt8 && wDesc.GetType() != miopenInt8x4)
+    if(wDesc.GetType() != miopenInt8)
         assert(wDesc.GetType() == yDesc.GetType());
 #endif
 
@@ -1454,7 +1432,7 @@ GemmDescriptor CreateGemmStridedBatchedDescriptorConv1x1Fwd(const TensorDescript
 {
 #ifndef NDEBUG
     assert(wDesc.GetType() == xDesc.GetType());
-    if(wDesc.GetType() != miopenInt8 && wDesc.GetType() != miopenInt8x4)
+    if(wDesc.GetType() != miopenInt8)
         assert(wDesc.GetType() == yDesc.GetType());
 #else
     (void)yDesc;
diff --git a/src/hip/batched_transpose_sol.cpp b/src/hip/batched_transpose_sol.cpp
index 4ffcf34b77..01349775ca 100644
--- a/src/hip/batched_transpose_sol.cpp
+++ b/src/hip/batched_transpose_sol.cpp
@@ -322,7 +322,11 @@ BatchedTransposeSolution::BatchedTransposeSolution(const ExecutionContext& ctx,
                                                    uint32_t width_)
     : data_type(data_type_), batch(batch_), height(height_), width(width_)
 {
-    if(data_type == miopenInt8x4 || data_type == miopenDouble)
+    if(!(data_type == miopenHalf     //
+         || data_type == miopenFloat //
+         || data_type == miopenInt32 //
+         || data_type == miopenInt8  //
+         || data_type == miopenBFloat16))
         MIOPEN_THROW("These data type are not supported");
     num_cu                 = ctx.GetStream().GetMaxComputeUnits();
     std::size_t data_size  = miopen::GetTypeSize(data_type);
diff --git a/src/include/miopen/datatype.hpp b/src/include/miopen/datatype.hpp
index 485bdb3d67..29057e3de9 100644
--- a/src/include/miopen/datatype.hpp
+++ b/src/include/miopen/datatype.hpp
@@ -53,7 +53,10 @@ inline std::string GetDataType(miopenDataType_t type)
         type_str = "bfloat16";
     }
     break;
-    case miopenInt8x4:
+    case miopenInt8x4: {
+        type_str = "UNSUPPORTED_TYPE";
+    }
+    break;
     case miopenInt8: {
         type_str = "int8_t";
     }
@@ -137,7 +140,6 @@ inline KernelBuildParameters GetDataTypeKBP(miopenDataType_t type)
     int use_fp16x8             = 0;
     int use_fp32               = 0;
     int use_int8               = 0;
-    int use_int8x4             = 0;
     int use_int32              = 0;
     int use_bfp16              = 0;
     int use_fp64               = 0;
@@ -150,15 +152,14 @@ inline KernelBuildParameters GetDataTypeKBP(miopenDataType_t type)
     case miopenHalf: use_fp16 = 1; break;
     case miopenFloat: use_fp32 = 1; break;
     case miopenInt8: use_int8 = 1; break;
-    case miopenInt8x4: use_int8x4 = 1; break;
     case miopenBFloat16: use_bfp16 = 1; break;
     case miopenInt32: use_int32 = 1; break;
     case miopenDouble: use_fp64 = 1; break;
     case miopenFloat8: use_fp8 = 1; break;
     case miopenBFloat8: use_bfp8 = 1; break;
+    case miopenInt8x4: // fallthrough
     default:
-        MIOPEN_THROW(
-            "Only float, half, bfloat16, int8, int8x4, float8, bfloat8 data type is supported.");
+        MIOPEN_THROW("Only float, half, bfloat16, int8, float8, bfloat8 data types are supported.");
         break;
     }
 
@@ -168,7 +169,6 @@ inline KernelBuildParameters GetDataTypeKBP(miopenDataType_t type)
         {"MIOPEN_USE_FP16x8", use_fp16x8},
         {"MIOPEN_USE_FP32", use_fp32},
         {"MIOPEN_USE_INT8", use_int8},
-        {"MIOPEN_USE_INT8x4", use_int8x4},
         {"MIOPEN_USE_BFP16", use_bfp16},
         {"MIOPEN_USE_INT32", use_int32},
         {"MIOPEN_USE_RNE_BFLOAT16", use_rne_bfloat16},
diff --git a/src/include/miopen/tensor.hpp b/src/include/miopen/tensor.hpp
index b8d72df67c..a133ab4adc 100644
--- a/src/include/miopen/tensor.hpp
+++ b/src/include/miopen/tensor.hpp
@@ -101,13 +101,13 @@ inline std::size_t GetTypeSize(miopenDataType_t d)
     case miopenFloat: return 4;
     case miopenHalf:
     case miopenBFloat16: return 2;
-    case miopenInt8x4:
+    case miopenInt8x4: break;
     case miopenInt8:
     case miopenFloat8:
     case miopenBFloat8: return 1;
     case miopenDouble: return 8;
     }
-    MIOPEN_THROW("Unknown data type");
+    MIOPEN_THROW("Unknown or unsupported data type");
 }
 
 template <class X, class Y>
diff --git a/src/include/miopen/visit_float.hpp b/src/include/miopen/visit_float.hpp
index d26afba9dd..35e1ae6ae7 100644
--- a/src/include/miopen/visit_float.hpp
+++ b/src/include/miopen/visit_float.hpp
@@ -79,7 +79,6 @@ void visit_float(miopenDataType_t t, F f)
     }
     case miopenFloat8:
     case miopenBFloat8:
-    case miopenInt8x4:
     case miopenInt8: {
         f(as_float<int8_t>{});
         break;
@@ -92,6 +91,7 @@ void visit_float(miopenDataType_t t, F f)
         f(as_float<double>{});
         break;
     }
+    case miopenInt8x4: MIOPEN_THROW("miopenInt8x4: Support discontinued.");
     }
 }
 
diff --git a/src/kernels/MIOpenIm2d2Col.cl b/src/kernels/MIOpenIm2d2Col.cl
index 7b1522db6f..852ccff955 100644
--- a/src/kernels/MIOpenIm2d2Col.cl
+++ b/src/kernels/MIOpenIm2d2Col.cl
@@ -40,10 +40,6 @@
 #define MIOPEN_USE_INT8 0
 #endif
 
-#ifndef MIOPEN_USE_INT8x4
-#define MIOPEN_USE_INT8x4 0
-#endif
-
 #ifndef MIOPEN_USE_INT32
 #define MIOPEN_USE_INT32 0
 #endif
@@ -58,8 +54,6 @@
 
 #if MIOPEN_USE_INT8 || MIOPEN_USE_FP8 || MIOPEN_USE_BFP8
 typedef char data_t;
-#elif MIOPEN_USE_INT8x4
-typedef uint data_t;
 #elif MIOPEN_USE_INT32
 typedef int data_t;
 #elif(MIOPEN_USE_FP16 || MIOPEN_USE_BFP16)
diff --git a/src/kernels/MIOpenIm3d2Col.cl b/src/kernels/MIOpenIm3d2Col.cl
index 7ccb8363b7..5ee437a068 100644
--- a/src/kernels/MIOpenIm3d2Col.cl
+++ b/src/kernels/MIOpenIm3d2Col.cl
@@ -40,18 +40,12 @@
 #define MIOPEN_USE_INT8 0
 #endif
 
-#ifndef MIOPEN_USE_INT8x4
-#define MIOPEN_USE_INT8x4 0
-#endif
-
 #ifndef MIOPEN_USE_INT32
 #define MIOPEN_USE_INT32 0
 #endif
 
 #if MIOPEN_USE_INT8
 typedef char data_t;
-#elif MIOPEN_USE_INT8x4
-typedef uint data_t;
 #elif MIOPEN_USE_INT32
 typedef int data_t;
 #elif(MIOPEN_USE_FP16 || MIOPEN_USE_BFP16)
diff --git a/src/kernels/MIOpenSubTensorOpWithScalarKernel.cl b/src/kernels/MIOpenSubTensorOpWithScalarKernel.cl
index 87fd82a7fd..dc8c96fc60 100644
--- a/src/kernels/MIOpenSubTensorOpWithScalarKernel.cl
+++ b/src/kernels/MIOpenSubTensorOpWithScalarKernel.cl
@@ -46,17 +46,13 @@
 #define MIOPEN_USE_INT8 0
 #endif
 
-#ifndef MIOPEN_USE_INT8x4
-#define MIOPEN_USE_INT8x4 0
-#endif
-
 #ifndef MIOPEN_USE_INT32
 #define MIOPEN_USE_INT32 0
 #endif
 
 #include "float_types.h"
 
-#if MIOPEN_USE_INT8 == 1 || MIOPEN_USE_INT8x4 == 1
+#if MIOPEN_USE_INT8 == 1
 #define _FLOAT char
 #endif
 
diff --git a/src/kernels/MIOpenSubTensorOpWithSubTensorKernel.cl b/src/kernels/MIOpenSubTensorOpWithSubTensorKernel.cl
index d0d1762a10..1de4843712 100644
--- a/src/kernels/MIOpenSubTensorOpWithSubTensorKernel.cl
+++ b/src/kernels/MIOpenSubTensorOpWithSubTensorKernel.cl
@@ -46,11 +46,7 @@
 #define MIOPEN_USE_INT8 0
 #endif
 
-#ifndef MIOPEN_USE_INT8x4
-#define MIOPEN_USE_INT8x4 0
-#endif
-
-#if MIOPEN_USE_INT8 == 1 || MIOPEN_USE_INT8x4 == 1
+#if MIOPEN_USE_INT8 == 1
 #define _FLOAT char
 #ifndef FLT_MAX
 #define MAX_VAL 127 /* max value */
diff --git a/src/kernels/MIOpenSubTensorOpWithTransformKernel.cl b/src/kernels/MIOpenSubTensorOpWithTransformKernel.cl
index bc17f2ecf4..bd06f89626 100644
--- a/src/kernels/MIOpenSubTensorOpWithTransformKernel.cl
+++ b/src/kernels/MIOpenSubTensorOpWithTransformKernel.cl
@@ -43,11 +43,7 @@
 #define MIOPEN_USE_INT8 0
 #endif
 
-#ifndef MIOPEN_USE_INT8x4
-#define MIOPEN_USE_INT8x4 0
-#endif
-
-#if MIOPEN_USE_INT8 == 1 || MIOPEN_USE_INT8x4 == 1
+#if MIOPEN_USE_INT8 == 1
 #define _FLOAT char
 #ifndef FLT_MAX
 #define MAX_VAL 127 /* max value */
diff --git a/src/kernels/MIOpenUtilKernels4.cl b/src/kernels/MIOpenUtilKernels4.cl
index 861e563012..09e6b73cff 100644
--- a/src/kernels/MIOpenUtilKernels4.cl
+++ b/src/kernels/MIOpenUtilKernels4.cl
@@ -40,10 +40,6 @@
 #define MIOPEN_USE_INT8 0
 #endif
 
-#ifndef MIOPEN_USE_INT8x4
-#define MIOPEN_USE_INT8x4 0
-#endif
-
 #ifndef MIOPEN_USE_INT32
 #define MIOPEN_USE_INT32 0
 #endif
@@ -58,8 +54,6 @@
 
 #if MIOPEN_USE_INT8 || MIOPEN_USE_FP8 || MIOPEN_USE_BFP8
 typedef char data_t;
-#elif MIOPEN_USE_INT8x4
-typedef uint data_t;
 #elif MIOPEN_USE_INT32
 typedef int data_t;
 #elif(MIOPEN_USE_FP16 || MIOPEN_USE_BFP16)
diff --git a/src/ocl/convolutionocl.cpp b/src/ocl/convolutionocl.cpp
index 8c042e3e7b..94b083577d 100644
--- a/src/ocl/convolutionocl.cpp
+++ b/src/ocl/convolutionocl.cpp
@@ -296,8 +296,7 @@ void ValidateConvTensors(const ConvTensors& tensors)
                                           tensors.xDesc.GetSize() != tensors.wDesc.GetSize();
 
     const auto trivial_tensor_types_not_matched =
-        tensors.xDesc.GetType() != tensors.yDesc.GetType() &&
-        tensors.xDesc.GetType() != miopenInt8 && tensors.xDesc.GetType() != miopenInt8x4;
+        tensors.xDesc.GetType() != tensors.yDesc.GetType() && tensors.xDesc.GetType() != miopenInt8;
 
     // if(xDesc.GetLengths()[1] != wDesc.GetLengths()[1]) {
     //    MIOPEN_THROW(miopenStatusBadParm);
@@ -426,11 +425,6 @@ void ConvolutionDescriptor::ConvolutionForward(Handle& handle,
     ValidateConvTensors(tensors);
     ValidateAlphaBeta(alpha, beta);
 
-    if(algo != miopenConvolutionFwdAlgoGEMM && xDesc.GetType() == miopenInt8x4)
-    {
-        MIOPEN_THROW(miopenStatusBadParm);
-    }
-
     ConvForwardCheckNumerics(handle, tensors, [&]() {
         ValidateGroupCount(xDesc, wDesc, *this);
 
diff --git a/src/ocl/tensorocl.cpp b/src/ocl/tensorocl.cpp
index 9c7bff6992..1e1f8b1df9 100644
--- a/src/ocl/tensorocl.cpp
+++ b/src/ocl/tensorocl.cpp
@@ -1597,10 +1597,13 @@ void ScaleTensor(const Handle& handle,
     assert(yDim_flat > 0 && yDim_flat <= 5);
 
     const miopenDataType_t dataType = yDesc_flat.GetType();
-    if(dataType == miopenInt8 || dataType == miopenInt8x4 || dataType == miopenBFloat16)
+
+    if(!(dataType == miopenHalf     //
+         || dataType == miopenFloat //
+         || dataType == miopenInt32 //
+         || dataType == miopenDouble))
     {
-        MIOPEN_THROW(miopenStatusBadParm,
-                     "Tensor scale operation is not supported for int8, int8x4, and bfloat16.");
+        MIOPEN_THROW(miopenStatusBadParm, "ScaleTensor: unsupported data type.");
     }
 
     std::string kernel_name = "SubTensorOpWithScalar" + std::to_string(yDim_flat) + "d";
@@ -1941,8 +1944,7 @@ std::string GetCastTensorBuildOptionFromType(const std::string& buildOption, mio
     case miopenDouble:
         // TODO
         MIOPEN_THROW(miopenStatusBadParm, "miopenDouble data type not supported in cast tensor.");
-    case miopenInt8x4:
-        MIOPEN_THROW(miopenStatusBadParm, "miopenInt8x4 data type not supported in cast tensor.");
+    case miopenInt8x4: // fallthrough
     default: MIOPEN_THROW(miopenStatusBadParm, "Invalid data type in cast tensor desc.");
     }
 }
@@ -2237,24 +2239,6 @@ void TransformTensor(const Handle& handle,
             }
         }
     }
-    else if(xDesc.GetType() == miopenInt8 && yDesc.GetType() == miopenInt8x4 && x_len.size() >= 3)
-    {
-        if(x_len[1] <= (y_len[1] - 4) || y_len[1] % 4 != 0)
-        {
-            MIOPEN_THROW("Invalid y channel size");
-        }
-
-        transpose_NCHW2Vec(handle, x_len, x, y, 4, false, true, alpha, beta);
-    }
-    else if(xDesc.GetType() == miopenInt8x4 && yDesc.GetType() == miopenInt8 && x_len.size() >= 3)
-    {
-        if(y_len[1] <= (x_len[1] - 4) || x_len[1] % 4 != 0)
-        {
-            MIOPEN_THROW("Invalid x channel size");
-        }
-
-        transpose_NCHW2Vec(handle, y_len, x, y, 4, false, false, alpha, beta);
-    }
     else
     {
         auto x_y_len          = boost::combine(x_len, y_len);
@@ -2294,12 +2278,20 @@ void TransformTensor(const Handle& handle,
         const miopenDataType_t dataTypex = xDesc_flat.GetType();
         const miopenDataType_t dataTypey = yDesc_flat.GetType();
 
-        if(dataTypex == miopenInt8 || dataTypex == miopenInt8x4)
+        if(!(dataTypex == miopenHalf        //
+             || dataTypex == miopenFloat    //
+             || dataTypex == miopenInt32    //
+             || dataTypex == miopenBFloat16 //
+             || dataTypex == miopenDouble))
         {
             MIOPEN_THROW("Tensor x is a unsupported data type");
         }
 
-        if(dataTypey == miopenInt8 || dataTypey == miopenInt8x4)
+        if(!(dataTypey == miopenHalf        //
+             || dataTypey == miopenFloat    //
+             || dataTypey == miopenInt32    //
+             || dataTypey == miopenBFloat16 //
+             || dataTypey == miopenDouble))
         {
             MIOPEN_THROW("Tensor y is a unsupported data type");
         }
diff --git a/src/ocl/utilocl.cpp b/src/ocl/utilocl.cpp
index 0a88efb2f1..d536e819e6 100644
--- a/src/ocl/utilocl.cpp
+++ b/src/ocl/utilocl.cpp
@@ -80,8 +80,8 @@ float Im2d2ColGPU(const Handle& handle,
 
     int data_size_bound = c * in_h * in_w;
 
-    int data_size_bound_pack = type == miopenInt8x4 ? data_size_bound * 4 : data_size_bound;
-    int im_offset_pack       = type == miopenInt8x4 ? im_offset / 4 : im_offset;
+    int data_size_bound_pack = data_size_bound;
+    int im_offset_pack       = im_offset;
 
     if(!kernels.empty())
     {
@@ -105,7 +105,7 @@ float Im2d2ColGPU(const Handle& handle,
     }
     else
     {
-        const int c_pack = type == miopenInt8x4 ? c / 4 : c;
+        const int c_pack = c;
 
         std::string params;
         int num_ch_per_wg;
@@ -331,9 +331,8 @@ float Im3d2ColGPU(const Handle& handle,
 
     auto&& kernels = handle.GetKernels("miopenIm3d2Col", network_config);
 
-    // int8x4 vectorize-c format
-    int im_offset_pack = type == miopenInt8x4 ? im_offset / 4 : im_offset;
-    int im_c_pack      = type == miopenInt8x4 ? im_c / 4 : im_c;
+    int im_offset_pack = im_offset;
+    int im_c_pack      = im_c;
 
     if(!kernels.empty())
     {
@@ -772,13 +771,6 @@ float transpose_NCHW2CNHW(const Handle& handle,
 
     std::string params = GetDataTypeKernelParams(type);
 
-    if(type == miopenInt8x4)
-    {
-        c /= 4;
-        in_offset /= 4;
-        out_offset /= 4;
-    }
-
     if(h_stride == 1 && w_stride == 1 && type == miopenFloat)
     {
         kernel_name += "_V1";
@@ -910,13 +902,6 @@ float transpose_CNHW2NCHW(const Handle& handle,
 
     std::string params = GetDataTypeKernelParams(type);
 
-    if(type == miopenInt8x4)
-    {
-        c /= 4;
-        in_offset /= 4;
-        out_offset /= 4;
-    }
-
     if(h_stride == 1 && w_stride == 1 && type == miopenFloat)
     {
         kernel_name += "_V1";
@@ -1170,14 +1155,8 @@ float transpose_packed_MN2NM(const Handle& handle,
     auto&& kernels = handle.GetKernels(kernel_name, network_config);
 
     std::string params = GetDataTypeKernelParams(type);
-    if(type == miopenInt8x4)
-    {
-        m /= 4;
-        in_offset /= 4;
-        out_offset /= 4;
-    }
 
-    if(!(type == miopenInt8x4 || type == miopenInt8))
+    if(type != miopenInt8)
     {
         MIOPEN_THROW("transpose_packed_MN2NM only meant for int8 variants.");
     }
diff --git a/src/pooling_api.cpp b/src/pooling_api.cpp
index ef526804cf..bf318f7b78 100644
--- a/src/pooling_api.cpp
+++ b/src/pooling_api.cpp
@@ -50,7 +50,7 @@ inline void Pooling_logging_cmd(const miopenPoolingDescriptor_t poolDesc,
         case miopenFloat: ss << "pool"; break;
         case miopenInt32:
         case miopenInt8:
-        case miopenInt8x4:
+        case miopenInt8x4: // Support discontinued.
         case miopenBFloat16:
         case miopenDouble:
         case miopenFloat8:
diff --git a/src/reducetensor.cpp b/src/reducetensor.cpp
index 6cf29c2f64..9cabbc03f6 100644
--- a/src/reducetensor.cpp
+++ b/src/reducetensor.cpp
@@ -211,11 +211,10 @@ inline int GetDataTypeSize(miopenDataType_t t)
     case miopenFloat8:
     case miopenBFloat8:
     case miopenInt8: return (1);
-    case miopenInt8x4: return (4);
+    case miopenInt8x4: return (4); // Support discontinued.
     case miopenBFloat16: return (2);
     case miopenInt32: return (4);
-    default:
-        MIOPEN_THROW("Only float, half, double, bfloat16, int8, int8x4 data type is supported.");
+    default: MIOPEN_THROW("Only float, half, double, bfloat16, int8 data types are supported.");
     };
 };
 
@@ -269,7 +268,7 @@ inline int GetDataTypeId(miopenDataType_t t)
     case miopenBFloat16: return (static_cast<int>('B'));
     case miopenDouble: return (static_cast<int>('D'));
     case miopenInt8:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenFloat8:
     case miopenBFloat8:
     case miopenInt32: return (static_cast<int>('O'));
diff --git a/src/reducetensor_api.cpp b/src/reducetensor_api.cpp
index 685b2fb430..10b1ff73bf 100644
--- a/src/reducetensor_api.cpp
+++ b/src/reducetensor_api.cpp
@@ -43,7 +43,7 @@ static void LogCmdRedux(const miopen::ReduceTensorDescriptor reduceTensorDesc,
             ss << "reducefp16";
         else if(aDesc.GetType() == miopenBFloat16)
             ss << "reducebfp16";
-        else if(aDesc.GetType() == miopenInt8 || aDesc.GetType() == miopenInt8x4)
+        else if(aDesc.GetType() == miopenInt8)
             ss << "reduceint8";
         else if(aDesc.GetType() == miopenDouble)
             ss << "reducefp64";
diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp
index 5a7918cc64..ff17432a62 100644
--- a/src/solver/batchnorm/forward_inference_ck.cpp
+++ b/src/solver/batchnorm/forward_inference_ck.cpp
@@ -200,7 +200,7 @@ bool BnCKFwdInference::IsApplicable(const ExecutionContext& context,
         return (CheckCKApplicability<BF16, BF16, F32, BF16, BF16, F32>(bn_problem) != -1);
     case miopenInt32:
     case miopenInt8:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenFloat8:
     case miopenBFloat8:
     default: MIOPEN_THROW("Unsupported datatype");
@@ -244,7 +244,7 @@ BnCKFwdInference::GetSolution(const ExecutionContext& context,
                 break;
             case miopenInt8:
             case miopenInt32:
-            case miopenInt8x4:
+            case miopenInt8x4: // Support discontinued.
             case miopenFloat8:
             case miopenBFloat8:
             default: MIOPEN_THROW("Unsupported datatype");
diff --git a/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp b/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp
index 9520a79a90..e2df6f8097 100644
--- a/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp
+++ b/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp
@@ -287,7 +287,7 @@ void PerformanceConfigConvCKIgemmFwdBiasActivFused::HeuristicInit(
     case miopenInt8:
     case miopenFloat:
     case miopenInt32:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble:
     default: MIOPEN_THROW("Unsupported datatype");
@@ -342,7 +342,7 @@ bool PerformanceConfigConvCKIgemmFwdBiasActivFused::IsValid(
     case miopenInt8:
     case miopenFloat:
     case miopenInt32:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble:
     default: MIOPEN_THROW("Unsupported datatype");
@@ -435,7 +435,7 @@ bool ConvCKIgemmFwdBiasActivFused::IsApplicable(const FusionContext& ctx,
     case miopenInt8:
     case miopenFloat:
     case miopenInt32:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble:
     default: MIOPEN_THROW("Unsupported datatype");
@@ -469,7 +469,7 @@ ConvSolution ConvCKIgemmFwdBiasActivFused::GetSolution(
             case miopenInt8:
             case miopenFloat:
             case miopenInt32:
-            case miopenInt8x4:
+            case miopenInt8x4: // Support discontinued.
             case miopenBFloat16:
             case miopenDouble:
             default: MIOPEN_THROW("Unsupported datatype");
diff --git a/src/solver/conv_direct_naive_conv.cpp b/src/solver/conv_direct_naive_conv.cpp
index 4df8df5874..6d89af7eaf 100644
--- a/src/solver/conv_direct_naive_conv.cpp
+++ b/src/solver/conv_direct_naive_conv.cpp
@@ -48,7 +48,7 @@ bool ConvDirectNaiveConvIsAssemblyKernel(const ExecutionContext& ctx,
 {
     const auto device_name = ctx.GetStream().GetDeviceName();
     return (device_name == "gfx906" || device_name == "gfx908") && ctx.rmv.IsV3() &&
-           problem.IsLayoutDefault() && (!problem.IsInt8());
+           problem.IsLayoutDefault() && (problem.IsFp16() || problem.IsFp32() || problem.IsBfp16());
 }
 
 // Check tensor data type respectively
diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp
index 2602c54320..94983c7f0e 100644
--- a/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp
@@ -210,7 +210,7 @@ void PerformanceConfigHipImplicitGemm3DGroupBwdXdlops::HeuristicInit(
     case miopenFloat: Init<float>(problem); break;
     case miopenInt8: Init<int8_t>(problem); break;
     case miopenInt32:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
@@ -253,7 +253,7 @@ bool PerformanceConfigHipImplicitGemm3DGroupBwdXdlops::IsValid(
     case miopenFloat: return CheckIsSupportCKArgs<float>(problem);
     case miopenInt8: return CheckIsSupportCKArgs<int8_t>(problem);
     case miopenInt32:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
@@ -324,7 +324,7 @@ bool ConvHipImplicitGemm3DGroupBwdXdlops::IsApplicable(
     case miopenFloat: return CheckCKApplicability<float>(problem);
     case miopenInt8: return CheckCKApplicability<int8_t>(problem);
     case miopenInt32:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
@@ -352,7 +352,7 @@ ConvSolution ConvHipImplicitGemm3DGroupBwdXdlops::GetSolution(
         return InitInvokerFactory<DeviceOpGBwdPtrs<float>, CKArgs, conv::DataInvokeParams>(
             problem, config.kernel_id);
     case miopenInt32:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble:
     case miopenFloat8:
diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp
index f0623c642d..a21c9ba300 100644
--- a/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp
@@ -210,7 +210,7 @@ void PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::HeuristicInit(
     case miopenFloat: Init<float>(problem); break;
     case miopenInt8: Init<int8_t>(problem); break;
     case miopenInt32:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenFloat8:
     case miopenBFloat8:
     case miopenBFloat16:
@@ -253,7 +253,7 @@ bool PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::IsValid(
     case miopenFloat: return CheckIsSupportCKArgs<float>(problem);
     case miopenInt8: return CheckIsSupportCKArgs<int8_t>(problem);
     case miopenInt32:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenFloat8:
     case miopenBFloat8:
     case miopenBFloat16:
@@ -322,7 +322,7 @@ bool ConvHipImplicitGemm3DGroupFwdXdlops::IsApplicable(
     case miopenFloat: return CheckCKApplicability<float>(problem);
     case miopenInt8: return CheckCKApplicability<int8_t>(problem);
     case miopenInt32:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenFloat8:
     case miopenBFloat8:
     case miopenBFloat16:
@@ -350,7 +350,7 @@ ConvSolution ConvHipImplicitGemm3DGroupFwdXdlops::GetSolution(
         return InitInvokerFactory<DeviceOpGFwdPtrs<float>, CKArgs, conv::DataInvokeParams>(
             problem, config.kernel_id);
     case miopenInt32:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble:
     case miopenFloat8:
diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp
index 6fce8a80b8..5c92f57142 100644
--- a/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp
@@ -206,7 +206,7 @@ void PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::HeuristicInit(
     case miopenFloat: Init<float>(problem); break;
     case miopenInt8: Init<int8_t>(problem); break;
     case miopenInt32:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
@@ -249,7 +249,7 @@ bool PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::IsValid(
     case miopenFloat: return CheckIsSupportCKArgs<float>(problem);
     case miopenInt8: return CheckIsSupportCKArgs<int8_t>(problem);
     case miopenInt32:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
@@ -320,7 +320,7 @@ bool ConvHipImplicitGemm3DGroupWrwXdlops::IsApplicable(
     case miopenFloat: return CheckCKApplicability<float>(problem);
     case miopenInt8: return CheckCKApplicability<int8_t>(problem);
     case miopenInt32:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
@@ -348,7 +348,7 @@ ConvSolution ConvHipImplicitGemm3DGroupWrwXdlops::GetSolution(
         return InitInvokerFactory<DeviceOpGWrwPtrs<float>, CKArgs, conv::WrWInvokeParams>(
             problem, config.kernel_id);
     case miopenInt32:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp
index 57c63267d2..5aa17e75bf 100644
--- a/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp
@@ -172,7 +172,7 @@ void PerformanceConfigHipImplicitGemmBwdXdlops::HeuristicInit(
     case miopenBFloat8:
     case miopenInt8:
     case miopenInt32:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble: break;
     }
@@ -214,7 +214,7 @@ bool PerformanceConfigHipImplicitGemmBwdXdlops::IsValid(
     case miopenBFloat8:
     case miopenInt8:
     case miopenInt32:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble: break;
     }
@@ -294,7 +294,7 @@ bool ConvHipImplicitGemmBwdXdlops::IsApplicable(
     case miopenBFloat8:
     case miopenInt8:
     case miopenInt32:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble: break;
     }
@@ -318,7 +318,7 @@ ConvSolution ConvHipImplicitGemmBwdXdlops::GetSolution(
             problem, config.kernel_id);
     case miopenInt8:
     case miopenInt32:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble:
     case miopenFloat8:
diff --git a/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp
index 73907eb788..7a9c322bc7 100644
--- a/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp
@@ -172,7 +172,7 @@ void PerformanceConfigHipImplicitGemmFwdXdlops::HeuristicInit(
     case miopenFloat8:
     case miopenBFloat8:
     case miopenInt32:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble: break;
     }
@@ -215,7 +215,7 @@ bool PerformanceConfigHipImplicitGemmFwdXdlops::IsValid(
     case miopenFloat8:
     case miopenBFloat8:
     case miopenInt32:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble: break;
     }
@@ -294,7 +294,7 @@ bool ConvHipImplicitGemmFwdXdlops::IsApplicable(
     case miopenFloat8:
     case miopenBFloat8:
     case miopenInt32:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble: break;
     }
@@ -320,7 +320,7 @@ ConvSolution ConvHipImplicitGemmFwdXdlops::GetSolution(
         return InitInvokerFactory<DeviceOpPtrs<float>, CKArgs, conv::DataInvokeParams>(
             problem, config.kernel_id);
     case miopenInt32:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble:
     case miopenFloat8:
diff --git a/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp
index c2d0b83141..0a8b6eeb03 100644
--- a/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp
@@ -198,7 +198,7 @@ void PerformanceConfigHipImplicitGemmGroupFwdXdlops::HeuristicInit(
     case miopenFloat: Init<float>(problem); break;
     case miopenInt8: Init<int8_t>(problem); break;
     case miopenInt32:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
@@ -240,7 +240,7 @@ bool PerformanceConfigHipImplicitGemmGroupFwdXdlops::IsValid(
     case miopenFloat: return CheckIsSupportCKArgs<float>(problem);
     case miopenInt8: return CheckIsSupportCKArgs<int8_t>(problem);
     case miopenInt32:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
@@ -311,7 +311,7 @@ bool ConvHipImplicitGemmGroupFwdXdlops::IsApplicable(
     case miopenFloat: return CheckCKApplicability<float>(problem);
     case miopenInt8: return CheckCKApplicability<int8_t>(problem);
     case miopenInt32:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
@@ -339,7 +339,7 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlops::GetSolution(
         return InitInvokerFactory<DeviceOpGFwdPtrs<int8_t>, CKArgs, conv::DataInvokeParams>(
             problem, config.kernel_id);
     case miopenInt32:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble:
     case miopenFloat8:
diff --git a/src/solver/gemm.cpp b/src/solver/gemm.cpp
index 7ad238e7fa..6ae4f5dde7 100644
--- a/src/solver/gemm.cpp
+++ b/src/solver/gemm.cpp
@@ -87,13 +87,15 @@ bool GemmFwdBase::IsApplicable(const ExecutionContext& ctx,
     const auto& xDesc = problem.GetIn();
     const auto& wDesc = problem.GetWeights();
     const auto& yDesc = problem.GetOut();
-    if(xDesc.GetType() == miopenInt8x4 || xDesc.GetType() == miopenInt8)
-    {
-        // rocBlas needs the output to be int32 always
-        if(yDesc.GetType() != miopenFloat && yDesc.GetType() != miopenInt32 &&
-           yDesc.GetType() != miopenInt8x4)
-            return false;
-    }
+
+    // rocBlas needs the output to be 32-bit always
+    if(xDesc.GetType() == miopenInt8x4)
+        return false;
+    if(xDesc.GetType() == miopenInt8      //
+       && (yDesc.GetType() != miopenFloat //
+           && yDesc.GetType() != miopenInt32))
+        return false;
+
     const auto rblas_fp8_supported = miopen::StartsWith(ctx.GetStream().GetDeviceName(), "gfx94");
     if(problem.IsTensorsCasted())
     {
@@ -185,8 +187,7 @@ float GemmFwdBase::GetWti(const ExecutionContext&, const conv::ProblemDescriptio
             n_transpose_packed_MN2NM = 1;
         n_gemm_strided_batched = conv.group_count;
         n_transpose_CNHW2NCHW  = 1;
-        if((wDesc.GetType() == miopenInt8 || wDesc.GetType() == miopenInt8x4) &&
-           yDesc.GetType() != miopenInt32)
+        if(wDesc.GetType() == miopenInt8 && yDesc.GetType() != miopenInt32)
             n_CastTensor = 1;
     }
     // 1x1_stride=1 with GEMM and zero workspace
@@ -205,8 +206,7 @@ float GemmFwdBase::GetWti(const ExecutionContext&, const conv::ProblemDescriptio
             n_gemm_strided_batched = conv.group_count;
             n_gemm_runs            = in_n;
         }
-        if((wDesc.GetType() == miopenInt8 || wDesc.GetType() == miopenInt8x4) &&
-           yDesc.GetType() != miopenInt32)
+        if(wDesc.GetType() == miopenInt8 && yDesc.GetType() != miopenInt32)
             n_CastTensor = 1;
     }
     else // not 1x1
@@ -216,8 +216,7 @@ float GemmFwdBase::GetWti(const ExecutionContext&, const conv::ProblemDescriptio
             n_transpose_packed_MN2NM = in_n;
         n_gemm_strided_batched = conv.group_count;
         n_gemm_runs            = in_n;
-        if((wDesc.GetType() == miopenInt8 || wDesc.GetType() == miopenInt8x4) &&
-           yDesc.GetType() != miopenInt32)
+        if(wDesc.GetType() == miopenInt8 && yDesc.GetType() != miopenInt32)
             n_CastTensor = 1;
     }
 
@@ -410,7 +409,7 @@ ConvSolution GemmFwd1x1_0_2::GetSolution(const ExecutionContext& context,
                 x_t_size *= 2;
             }
 
-            if(wDesc.GetType() == miopenInt8 || wDesc.GetType() == miopenInt8x4)
+            if(wDesc.GetType() == miopenInt8)
             {
                 const auto xts = GetTypeSize(xDesc.GetType());
                 if(xts > 0)
@@ -494,8 +493,7 @@ ConvSolution GemmFwd1x1_0_2::GetSolution(const ExecutionContext& context,
             if(handle.IsProfilingEnabled())
                 time_gemm += handle.GetKernelTime();
 
-            if((wDesc.GetType() == miopenInt8 || wDesc.GetType() == miopenInt8x4) &&
-               yDesc.GetType() != miopenInt32)
+            if(wDesc.GetType() == miopenInt8 && yDesc.GetType() != miopenInt32)
             {
                 TensorDescriptor ygemmDesc(miopenInt32, yDesc.GetLengths(), yDesc.GetStrides());
 
@@ -782,7 +780,6 @@ ConvSolution GemmFwd1x1_0_1::GetSolution(const ExecutionContext& context,
     auto solution = ConvSolution{miopenStatusSuccess};
 
     const auto group_count = conv.group_count;
-    const auto lowp_quant  = conv.lowp_quant;
 
     if(group_count > 1)
     {
@@ -881,14 +878,6 @@ ConvSolution GemmFwd1x1_0_1::GetSolution(const ExecutionContext& context,
                     }
                 }
 
-                if(wDesc.GetType() == miopenInt8x4 && yDesc.GetType() != miopenInt32)
-                {
-                    TensorDescriptor ygemmDesc(miopenInt32, yDesc.GetLengths(), yDesc.GetStrides());
-                    CastTensor(handle, &lowp_quant, ygemmDesc, y, yDesc, y, 0, 0);
-                    if(handle.IsProfilingEnabled())
-                        time_gemm += handle.GetKernelTime();
-                }
-
                 if(handle.IsProfilingEnabled())
                 {
                     handle.ResetKernelTime();
@@ -967,14 +956,6 @@ ConvSolution GemmFwd1x1_0_1::GetSolution(const ExecutionContext& context,
                 if(handle.IsProfilingEnabled())
                     time += handle.GetKernelTime();
 
-                if(wDesc.GetType() == miopenInt8x4 && yDesc.GetType() != miopenInt32)
-                {
-                    TensorDescriptor ygemmDesc(miopenInt32, yDesc.GetLengths(), yDesc.GetStrides());
-                    CastTensor(handle, &lowp_quant, ygemmDesc, y, yDesc, y, 0, 0);
-                    if(handle.IsProfilingEnabled())
-                        time += handle.GetKernelTime();
-                }
-
                 if(handle.IsProfilingEnabled())
                 {
                     handle.ResetKernelTime();
@@ -1268,8 +1249,7 @@ ConvSolution GemmFwdRest::GetSolution(const ExecutionContext& context,
                 }
             }
 
-            if((wDesc.GetType() == miopenInt8 || wDesc.GetType() == miopenInt8x4) &&
-               yDesc.GetType() != miopenInt32)
+            if(wDesc.GetType() == miopenInt8 && yDesc.GetType() != miopenInt32)
             {
                 TensorDescriptor ygemmDesc(miopenInt32, yDesc.GetLengths(), yDesc.GetStrides());
 
diff --git a/src/solver/mlir_common.cpp b/src/solver/mlir_common.cpp
index 4101db86f9..4e41b8944a 100644
--- a/src/solver/mlir_common.cpp
+++ b/src/solver/mlir_common.cpp
@@ -57,7 +57,7 @@ static const char* DTypeName(miopenDataType_t ty)
     case miopenBFloat16: return "bf16";
     case miopenInt32: return "i32";
     case miopenInt8: return "i8";
-    case miopenInt8x4: return "i8x4";
+    case miopenInt8x4: return "i8x4"; // Support discontinued.
     case miopenFloat8: return "fp8";
     case miopenBFloat8: return "bfp8";
     }
diff --git a/src/tensor.cpp b/src/tensor.cpp
index ca4f1afc7a..df0d7c2819 100644
--- a/src/tensor.cpp
+++ b/src/tensor.cpp
@@ -50,7 +50,7 @@ bool IsDataTypeSupported(miopenDataType_t t)
     case miopenFloat8:
     case miopenBFloat8:
     case miopenInt8:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble: return true;
     }
diff --git a/test/conv_common.hpp b/test/conv_common.hpp
index 99f964a0f6..e3fa9766e6 100644
--- a/test/conv_common.hpp
+++ b/test/conv_common.hpp
@@ -217,12 +217,7 @@ tensor<Tout> get_output_tensor(const miopen::ConvolutionDescriptor& filter,
             ? input.desc.GetLayout(miopen::tensor_layout_get_default(input.desc.GetSize()))
             : out_layout;
     return tensor<Tout>{filter.GetForwardOutputTensorWithLayout(
-        input.desc,
-        weights.desc,
-        yLayout,
-        weights.desc.GetType() == miopenInt8x4
-            ? (std::is_same<Tout, int>{} ? miopenInt32 : miopenFloat)
-            : miopen_type<Tout>{})};
+        input.desc, weights.desc, yLayout, miopen_type<Tout>{})};
 }
 
 enum class ConvApi
@@ -572,8 +567,7 @@ struct verify_forward_conv : conv_base<T, Tout>
         auto rout = ref_conv_fwd(input, weights, out, filter);
         if(filter.mode != miopenTranspose)
         {
-            bool is_int8 =
-                weights.desc.GetType() == miopenInt8 || weights.desc.GetType() == miopenInt8x4;
+            bool is_int8   = weights.desc.GetType() == miopenInt8;
             bool is_vect_c = weights.desc.GetVectorLength() > 1;
             rout.par_for_each([&](auto... is) {
                 if(is_int8 && !is_vect_c)
@@ -774,7 +768,7 @@ struct verify_forward_conv : conv_base<T, Tout>
             break;
         case ConvApi::Find_1_0:
         case ConvApi::Find_2_0:
-            if(weights.desc.GetType() == miopenInt8 || weights.desc.GetType() == miopenInt8x4)
+            if(weights.desc.GetType() == miopenInt8)
             {
 
                 bool is_transform = (input.desc.GetLengths()[1] % 4 != 0 || is_vect);
@@ -786,10 +780,8 @@ struct verify_forward_conv : conv_base<T, Tout>
                 in_len[1]  = ((in_len[1] + 3) / 4) * 4;
                 wei_len[1] = ((wei_len[1] + 3) / 4) * 4;
 
-                miopen::TensorDescriptor input_vpad_desc(is_vect ? miopenInt8x4 : miopenInt8,
-                                                         in_len);
-                miopen::TensorDescriptor weight_vpad_desc(is_vect ? miopenInt8x4 : miopenInt8,
-                                                          wei_len);
+                miopen::TensorDescriptor input_vpad_desc(miopenInt8, in_len);
+                miopen::TensorDescriptor weight_vpad_desc(miopenInt8, wei_len);
 
                 auto input_vpad   = tensor<T>{in_len};
                 auto weights_vpad = tensor<T>{wei_len};
@@ -1738,8 +1730,8 @@ struct verify_forward_conv_int8 : conv_base<T>
         in_len[1]  = ((in_len[1] + 3) / 4) * 4;
         wei_len[1] = ((wei_len[1] + 3) / 4) * 4;
 
-        miopen::TensorDescriptor input_vpad_desc(is_vect ? miopenInt8x4 : miopenInt8, in_len);
-        miopen::TensorDescriptor weight_vpad_desc(is_vect ? miopenInt8x4 : miopenInt8, wei_len);
+        miopen::TensorDescriptor input_vpad_desc(miopenInt8, in_len);
+        miopen::TensorDescriptor weight_vpad_desc(miopenInt8, wei_len);
 
         auto input_vpad   = tensor<T>{in_len};
         auto weights_vpad = tensor<T>{wei_len};
@@ -2046,7 +2038,7 @@ struct conv_driver : test_driver
             filter.spatialDim = get_spatial_dim();
         else
             filter.spatialDim = filter_dims.size();
-        bool is_int8 = (input.desc.GetType() == miopenInt8 || input.desc.GetType() == miopenInt8x4);
+        bool is_int8 = (input.desc.GetType() == miopenInt8);
 
         filter.mode             = cmode_lookup[miopen::ToUpper(conv_mode)];
         filter.paddingMode      = pmode_lookup[miopen::ToUpper(pad_mode)];
@@ -2360,8 +2352,7 @@ struct conv_driver : test_driver
                 bool skip_backward_weights = is_int8;
 
 #if TEST_DIRECT_SUPPORTED_CONFIG_ONLY
-                if(input.desc.GetType() == miopenInt8 || input.desc.GetType() == miopenInt8x4 ||
-                   input.desc.GetType() == miopenBFloat16)
+                if(input.desc.GetType() == miopenInt8 || input.desc.GetType() == miopenBFloat16)
                 {
                     show_command();
                     std::cout << "Direct path doesn't support Int8 or BFloat16 type." << std::endl;
@@ -2405,7 +2396,8 @@ struct conv_driver : test_driver
                 size_t total_mem;
                 if(is_int8)
                 {
-                    // TODO: Tout here was float which should have been int32
+                    /// \todo Properly construct the `output` tensor descriptor
+                    /// and get rid of this special "int8" stuff.
                     auto output_int8 =
                         get_output_tensor<T, Tout>(filter, input, weights, out_layout);
                     const auto problem        = ConvProblemDescription{input.desc,
@@ -2422,6 +2414,9 @@ struct conv_driver : test_driver
                 }
                 else
                 {
+                    /// \todo Take into account `skip_forward`, `skip_backward_data`,
+                    /// `skip_backward_weights` and use this path to compute `total_mem` for int8
+                    /// variations.
                     const auto fwd_problem = miopen::conv::ProblemDescription{
                         input.desc,
                         weights.desc,
diff --git a/test/driver.hpp b/test/driver.hpp
index 0a8e2d3080..fd83dd1fc5 100644
--- a/test/driver.hpp
+++ b/test/driver.hpp
@@ -274,7 +274,7 @@ struct test_driver
         {
         case miopenHalf: ss << "--half "; break;
         case miopenBFloat16: ss << "--bfloat16 "; break;
-        case miopenInt8x4:
+        case miopenInt8x4: ss << "--UNSUPPORED_TYPE "; break;
         case miopenInt8: ss << "--int8 "; break;
         case miopenInt32: ss << "--int32 "; break;
         case miopenFloat: ss << "--float "; break;
@@ -303,7 +303,7 @@ struct test_driver
         {
         case miopenHalf: ret.emplace_back("--half"); break;
         case miopenBFloat16: ret.emplace_back("--bf16"); break;
-        case miopenInt8x4:
+        case miopenInt8x4: ret.emplace_back("--UNSUPPORTED_TYPE"); break;
         case miopenInt8: ret.emplace_back("--int8"); break;
         case miopenInt32: ret.emplace_back("--int32"); break;
         case miopenFloat: ret.emplace_back("--float"); break;
diff --git a/test/gpu_reference_kernel.cpp b/test/gpu_reference_kernel.cpp
index c3b26a80a9..db62684bec 100644
--- a/test/gpu_reference_kernel.cpp
+++ b/test/gpu_reference_kernel.cpp
@@ -302,8 +302,6 @@ static std::string miopen_type_to_string(miopenDataType_t type)
         return "int32";
     if(type == miopenInt8)
         return "int8";
-    if(type == miopenInt8x4)
-        return "int8x4";
     if(type == miopenBFloat16)
         return "bf16";
     return "n/a";
diff --git a/test/gtest/conv_embed_db.cpp b/test/gtest/conv_embed_db.cpp
index b69fde1b5e..4672bb0404 100644
--- a/test/gtest/conv_embed_db.cpp
+++ b/test/gtest/conv_embed_db.cpp
@@ -73,12 +73,12 @@ void Run2dDriver(miopenDataType_t prec)
     case miopenHalf: params = ConfigWithHalf::GetParam(); break;
     case miopenInt8: params = ConfigWithInt8::GetParam(); break;
     case miopenBFloat16: params = ConfigWithBFloat16::GetParam(); break;
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenInt32:
     case miopenFloat8:
     case miopenBFloat8:
     case miopenDouble:
-        FAIL() << "miopenInt8x4, miopenInt32, miopenFloat8, miopenBFloat8, miopenDouble data type "
+        FAIL() << "miopenInt32, miopenFloat8, miopenBFloat8, miopenDouble data type "
                   "not supported by conv_embed_db test";
 
     default: params = ConfigWithFloat::GetParam();
diff --git a/test/gtest/conv_hip_igemm_xdlops.cpp b/test/gtest/conv_hip_igemm_xdlops.cpp
index 508624c847..3407446557 100644
--- a/test/gtest/conv_hip_igemm_xdlops.cpp
+++ b/test/gtest/conv_hip_igemm_xdlops.cpp
@@ -65,10 +65,10 @@ void Run2dDriver(miopenDataType_t prec)
     case miopenHalf:
     case miopenBFloat16:
     case miopenFloat:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenInt32:
     case miopenDouble:
-        FAIL() << "miopenHalf, miopenBFloat16, miopenFloat, miopenInt8x4, miopenInt32, "
+        FAIL() << "miopenHalf, miopenBFloat16, miopenFloat, miopenInt32, "
                   "miopenDouble data "
                   "type not supported by "
                   "test_conv_hip_igemm_xdlops test";
diff --git a/test/gtest/conv_igemm_dynamic.cpp b/test/gtest/conv_igemm_dynamic.cpp
index 25a4e179c5..59341ac8a0 100644
--- a/test/gtest/conv_igemm_dynamic.cpp
+++ b/test/gtest/conv_igemm_dynamic.cpp
@@ -68,12 +68,12 @@ void Run2dDriver(miopenDataType_t prec)
     case miopenHalf:
     case miopenInt8:
     case miopenBFloat16:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenInt32:
     case miopenDouble:
     case miopenFloat8:
     case miopenBFloat8:
-        FAIL() << "miopenHalf, miopenInt8, miopenBFloat16, miopenInt8x4, miopenInt32, "
+        FAIL() << "miopenHalf, miopenInt8, miopenBFloat16, miopenInt32, "
                   "miopenDouble, miopenFloat8, miopenBFloat8 "
                   "data type not supported by conv_igemm_dynamic test";
 
diff --git a/test/gtest/conv_igemm_mlir.cpp b/test/gtest/conv_igemm_mlir.cpp
index d5fc0d426c..29d240645c 100644
--- a/test/gtest/conv_igemm_mlir.cpp
+++ b/test/gtest/conv_igemm_mlir.cpp
@@ -81,13 +81,13 @@ void Run2dDriver(miopenDataType_t prec)
     case miopenInt8: params = ConfigWithInt8::GetParam(); break;
     case miopenFloat: params = ConfigWithFloat::GetParam(); break;
     case miopenBFloat16:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenInt32:
     case miopenFloat8:
     case miopenBFloat8:
     case miopenDouble:
         MIOPEN_THROW(miopenStatusBadParm,
-                     "miopenBFloat16, miopenInt8x4, miopenInt32, miopenFloat8, miopenBFloat8, "
+                     "miopenBFloat16, miopenInt32, miopenFloat8, miopenBFloat8, "
                      "miopenDouble data type not supported by conv_igemm_mlir test");
 
     default: params = ConfigWithFloat::GetParam();
diff --git a/test/gtest/conv_igemm_mlir_xdlops.cpp b/test/gtest/conv_igemm_mlir_xdlops.cpp
index 19913093c0..e2c5a3ed8f 100644
--- a/test/gtest/conv_igemm_mlir_xdlops.cpp
+++ b/test/gtest/conv_igemm_mlir_xdlops.cpp
@@ -53,13 +53,13 @@ void Run2dDriver(miopenDataType_t prec)
     case miopenInt8: params = ConfigWithInt8::GetParam(); break;
     case miopenBFloat16:
     case miopenFloat:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenInt32:
     case miopenDouble:
     case miopenFloat8:
     case miopenBFloat8:
         MIOPEN_THROW(miopenStatusBadParm,
-                     "miopenBFloat16, miopenFloat, miopenInt8x4, miopenInt32, miopenDouble data "
+                     "miopenBFloat16, miopenFloat, miopenInt32, miopenDouble data "
                      "type not supported by "
                      "conv_igemm_mlir_xdlops test");
 
diff --git a/test/gtest/conv_trans.cpp b/test/gtest/conv_trans.cpp
index 20015336a0..185a459947 100644
--- a/test/gtest/conv_trans.cpp
+++ b/test/gtest/conv_trans.cpp
@@ -55,10 +55,10 @@ void Run2dDriver(miopenDataType_t prec)
     case miopenBFloat8:
     case miopenInt8:
     case miopenBFloat16:
-    case miopenInt8x4:
+    case miopenInt8x4: // Support discontinued.
     case miopenInt32:
     case miopenDouble:
-        FAIL() << "miopenHalf, miopenInt8, miopenBFloat16, miopenInt8x4, miopenInt32, miopenDouble "
+        FAIL() << "miopenHalf, miopenInt8, miopenBFloat16, miopenInt32, miopenDouble "
                   "data type not supported by "
                   "conv_trans test";
 
diff --git a/test/gtest/db_sync.cpp b/test/gtest/db_sync.cpp
index d7e59468a2..13b647ceac 100644
--- a/test/gtest/db_sync.cpp
+++ b/test/gtest/db_sync.cpp
@@ -73,7 +73,7 @@ miopenDataType_t GetDataTypeFromString(const std::string& data_type)
     else if(data_type == "INT8")
         return miopenInt8;
     else if(data_type == "INT8x4")
-        return miopenInt8x4;
+        return miopenInt8x4; // Support discontinued. Maintain compatibility with old databases.
     else if(data_type == "INT32")
         return miopenInt32;
     else if(data_type == "BF16")
diff --git a/test/tensor_holder.hpp b/test/tensor_holder.hpp
index 0b05a1e5e8..3fda3b5cf7 100644
--- a/test/tensor_holder.hpp
+++ b/test/tensor_holder.hpp
@@ -195,9 +195,10 @@ struct tensor
 
     tensor(miopen::TensorDescriptor rhs) : desc(std::move(rhs))
     {
-        assert(desc.GetType() == miopen_type<T>{} ||
-               ((miopen_type<T>{} == miopenInt8 || miopen_type<T>{} == miopenInt8x4) &&
-                (desc.GetType() == miopenFloat || desc.GetType() == miopenInt32)));
+        assert(desc.GetType() == miopen_type<T>{}    //
+               || (miopen_type<T>{} == miopenInt8    //
+                   && (desc.GetType() == miopenFloat //
+                       || desc.GetType() == miopenInt32)));
         data.resize(desc.GetElementSpace());
     }
 
diff --git a/test/tensor_transform.cpp b/test/tensor_transform.cpp
index a0585853a0..e87420bb7a 100644
--- a/test/tensor_transform.cpp
+++ b/test/tensor_transform.cpp
@@ -422,7 +422,7 @@ struct tensor_transform_driver : test_driver
         }
 
         // Test tensor scale addition
-        if(miopen_type<T>{} == miopenInt8 || miopen_type<T>{} == miopenInt8x4)
+        if(miopen_type<T>{} == miopenInt8)
             return;
 
         super_src = tensor<T>{superLens_src}.generate(tensor_elem_gen_integer{max_value});

From 6737d66a66206f1ad90ab9da5871ada2eeb02875 Mon Sep 17 00:00:00 2001
From: atamazov <artem.tamazov@gmail.com>
Date: Fri, 6 Oct 2023 21:36:24 +0300
Subject: [PATCH 2/8] rocblas-remove-int8x4-uses(02) Removed the rest of INT8x4
 support (except the driver)

---
 driver/conv_driver.hpp                           |  4 ++++
 include/miopen/miopen.h                          |  1 -
 src/check_numerics.cpp                           |  1 -
 src/driver_arguments.cpp                         |  4 +---
 src/gemm_v2.cpp                                  | 16 ++++++++--------
 src/hip/general_tensor_reorder_sol.cpp           |  2 --
 src/include/miopen/conv/problem_description.hpp  |  1 -
 src/include/miopen/datatype.hpp                  |  5 -----
 src/include/miopen/tensor.hpp                    |  1 -
 src/include/miopen/visit_float.hpp               |  1 -
 src/ocl/tensorocl.cpp                            |  6 ------
 src/pooling_api.cpp                              |  1 -
 src/reducetensor.cpp                             |  3 ---
 src/solver/batchnorm/forward_inference_ck.cpp    |  2 --
 .../conv_ck_igemm_fwd_bias_activ_fused.cpp       |  4 ----
 ...v_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp |  4 ----
 ...v_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp |  4 ----
 ...v_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp |  4 ----
 .../conv_hip_implicit_gemm_bwd_data_xdlops.cpp   |  4 ----
 src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp |  4 ----
 ...conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp |  4 ----
 src/solver/gemm.cpp                              |  2 --
 src/solver/mlir_common.cpp                       |  1 -
 src/tensor.cpp                                   |  1 -
 test/driver.hpp                                  |  2 --
 test/gtest/conv_embed_db.cpp                     |  1 -
 test/gtest/conv_hip_igemm_xdlops.cpp             |  1 -
 test/gtest/conv_igemm_dynamic.cpp                |  1 -
 test/gtest/conv_igemm_mlir.cpp                   |  1 -
 test/gtest/conv_igemm_mlir_xdlops.cpp            |  1 -
 test/gtest/conv_trans.cpp                        |  1 -
 test/gtest/db_sync.cpp                           |  2 --
 32 files changed, 13 insertions(+), 77 deletions(-)

diff --git a/driver/conv_driver.hpp b/driver/conv_driver.hpp
index 5f67b83588..3c28859776 100644
--- a/driver/conv_driver.hpp
+++ b/driver/conv_driver.hpp
@@ -83,6 +83,10 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DRIVER_PAD_BUFFERS_2M)
 MIOPEN_DECLARE_ENV_VAR(MIOPEN_DRIVER_USE_GPU_REFERENCE)
 MIOPEN_DECLARE_ENV_VAR(MIOPEN_DRIVER_SUBNORM_PERCENTAGE)
 
+// Support in the library discontinued, but left in the driver
+// for reference in the future.
+#define miopenInt8x4 (static_cast<miopenDataType_t>(4))
+
 #if MIOPEN_BACKEND_OPENCL
 #define STATUS_SUCCESS CL_SUCCESS
 typedef cl_int status_t;
diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h
index 7a85ec5875..fd978c6200 100644
--- a/include/miopen/miopen.h
+++ b/include/miopen/miopen.h
@@ -350,7 +350,6 @@ typedef enum
     miopenFloat    = 1, /*!< 32-bit floating point (Fully supported) */
     miopenInt32    = 2, /*!< 32-bit int point (Partially supported) */
     miopenInt8     = 3, /*!< 8-bit int point (Partially supported) */
-    miopenInt8x4   = 4, /*!< Pack of four Int8 in NCHW_VECT_C format (Support discontinued) */
     miopenBFloat16 = 5, /*!< 16-bit binary floating point (8-bit exponent, 7-bit fraction)
                            (Partially supported) */
     miopenDouble  = 6,  /*!< 64-bit floating point (Partially supported) */
diff --git a/src/check_numerics.cpp b/src/check_numerics.cpp
index 50cb0af4af..12210069df 100644
--- a/src/check_numerics.cpp
+++ b/src/check_numerics.cpp
@@ -64,7 +64,6 @@ std::string GetKernelName(miopenDataType_t data_type)
     case miopenBFloat8: return {"check_numerics_bf8"};
     case miopenInt32:
     case miopenInt8:
-    case miopenInt8x4: // Support discontinued.
     case miopenDouble:
     default: return {""};
     }
diff --git a/src/driver_arguments.cpp b/src/driver_arguments.cpp
index 53aa6b41ab..d9e82d270a 100644
--- a/src/driver_arguments.cpp
+++ b/src/driver_arguments.cpp
@@ -50,7 +50,7 @@ void ConvDataType(std::stringstream& ss, const miopen::TensorDescriptor& desc)
     {
         ss << "convbfp16";
     }
-    else if(desc.GetType() == miopenInt8 || desc.GetType() == miopenInt8x4)
+    else if(desc.GetType() == miopenInt8)
     {
         ss << "convint8";
     }
@@ -190,8 +190,6 @@ std::string ConvArgsForMIOpenDriver(const miopen::TensorDescriptor& xDesc,
     ss << " -g " << convDesc.group_count;
     if(print_for_conv_driver)
         ss << " -F " << std::to_string(static_cast<int>(conv_dir)) << " -t 1"; // clang-format on
-    if(xDesc.GetType() == miopenInt8x4)
-        ss << " -Z 1";
     if(immediate_mode_solver_id.has_value())
     {
         ss << " -S " << *immediate_mode_solver_id;
diff --git a/src/gemm_v2.cpp b/src/gemm_v2.cpp
index 35add906a1..a44daf962d 100644
--- a/src/gemm_v2.cpp
+++ b/src/gemm_v2.cpp
@@ -610,9 +610,9 @@ miopenStatus_t CallGemm(const Handle& handle,
         };
         break;
 
-        case miopenInt8x4:
         case miopenDouble: {
-            MIOPEN_THROW(miopenStatusBadParm, "Unknown or unsupported data type.");
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "miopenDouble data type not supported by MIOpenGEMM.");
         };
         break;
         }
@@ -876,10 +876,10 @@ miopenStatus_t CallGemmStridedBatched(const Handle& handle,
             break;
         }
 
-        case miopenInt8x4:
         case miopenDouble: {
-            MIOPEN_THROW(miopenStatusBadParm, "Unknown or unsupported data type.");
-        };
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "miopenDouble data type not supported by MIOpenGEMM.");
+        }
         break;
         }
 
@@ -1140,10 +1140,10 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle,
             break;
         }
 
-        case miopenInt8x4:
         case miopenDouble: {
-            MIOPEN_THROW(miopenStatusBadParm, "Unknown or unsupported data type.");
-        };
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "miopenDouble data type not supported by MIOpenGEMM.");
+        }
         break;
         }
 
diff --git a/src/hip/general_tensor_reorder_sol.cpp b/src/hip/general_tensor_reorder_sol.cpp
index f78aa358f8..ae85048f63 100644
--- a/src/hip/general_tensor_reorder_sol.cpp
+++ b/src/hip/general_tensor_reorder_sol.cpp
@@ -137,8 +137,6 @@ GenericReorderSolutionImpl::GenericReorderSolutionImpl(miopenDataType_t data_typ
       order_2(order_2_),
       order_3(order_3_)
 {
-    if(data_type == miopenInt8x4)
-        MIOPEN_THROW("These data type are not supported");
     std::size_t data_size  = miopen::GetTypeSize(data_type);
     kernel_param_heuristic = tensor_reorder::HeuristicGet(data_size, dim_0, dim_1, dim_2, dim_3);
 }
diff --git a/src/include/miopen/conv/problem_description.hpp b/src/include/miopen/conv/problem_description.hpp
index 438e1d306a..12d6f08811 100644
--- a/src/include/miopen/conv/problem_description.hpp
+++ b/src/include/miopen/conv/problem_description.hpp
@@ -51,7 +51,6 @@ inline std::string GetDataTypeName(miopenDataType_t data_type)
     case miopenFloat: return "FP32";
     case miopenHalf: return "FP16";
     case miopenInt8: return "INT8";
-    case miopenInt8x4: return "INT8x4";
     case miopenInt32: return "INT32";
     case miopenBFloat16: return "BF16";
     case miopenDouble: return "FP64";
diff --git a/src/include/miopen/datatype.hpp b/src/include/miopen/datatype.hpp
index 29057e3de9..2260cb5b4f 100644
--- a/src/include/miopen/datatype.hpp
+++ b/src/include/miopen/datatype.hpp
@@ -53,10 +53,6 @@ inline std::string GetDataType(miopenDataType_t type)
         type_str = "bfloat16";
     }
     break;
-    case miopenInt8x4: {
-        type_str = "UNSUPPORTED_TYPE";
-    }
-    break;
     case miopenInt8: {
         type_str = "int8_t";
     }
@@ -157,7 +153,6 @@ inline KernelBuildParameters GetDataTypeKBP(miopenDataType_t type)
     case miopenDouble: use_fp64 = 1; break;
     case miopenFloat8: use_fp8 = 1; break;
     case miopenBFloat8: use_bfp8 = 1; break;
-    case miopenInt8x4: // fallthrough
     default:
         MIOPEN_THROW("Only float, half, bfloat16, int8, float8, bfloat8 data types are supported.");
         break;
diff --git a/src/include/miopen/tensor.hpp b/src/include/miopen/tensor.hpp
index a133ab4adc..1bf2bcbaae 100644
--- a/src/include/miopen/tensor.hpp
+++ b/src/include/miopen/tensor.hpp
@@ -101,7 +101,6 @@ inline std::size_t GetTypeSize(miopenDataType_t d)
     case miopenFloat: return 4;
     case miopenHalf:
     case miopenBFloat16: return 2;
-    case miopenInt8x4: break;
     case miopenInt8:
     case miopenFloat8:
     case miopenBFloat8: return 1;
diff --git a/src/include/miopen/visit_float.hpp b/src/include/miopen/visit_float.hpp
index 35e1ae6ae7..ae84b7f0a9 100644
--- a/src/include/miopen/visit_float.hpp
+++ b/src/include/miopen/visit_float.hpp
@@ -91,7 +91,6 @@ void visit_float(miopenDataType_t t, F f)
         f(as_float<double>{});
         break;
     }
-    case miopenInt8x4: MIOPEN_THROW("miopenInt8x4: Support discontinued.");
     }
 }
 
diff --git a/src/ocl/tensorocl.cpp b/src/ocl/tensorocl.cpp
index 1e1f8b1df9..6329d33f69 100644
--- a/src/ocl/tensorocl.cpp
+++ b/src/ocl/tensorocl.cpp
@@ -1944,7 +1944,6 @@ std::string GetCastTensorBuildOptionFromType(const std::string& buildOption, mio
     case miopenDouble:
         // TODO
         MIOPEN_THROW(miopenStatusBadParm, "miopenDouble data type not supported in cast tensor.");
-    case miopenInt8x4: // fallthrough
     default: MIOPEN_THROW(miopenStatusBadParm, "Invalid data type in cast tensor desc.");
     }
 }
@@ -1968,11 +1967,6 @@ void CastTensor(const Handle& handle,
         MIOPEN_THROW(miopenStatusBadParm, "Tensor dimension lengths do not match.");
     }
 
-    if(srcDesc.GetType() == miopenInt8x4 || dstDesc.GetType() == miopenInt8x4)
-    {
-        MIOPEN_THROW(miopenStatusBadParm, "Tensor cast operation is not supported for int8x4.");
-    }
-
     auto flat_descriptors = GetConsistentFlattenedTensorDescriptors(srcDesc, dstDesc);
     const TensorDescriptor& srcDesc_flat = std::get<0>(flat_descriptors);
     const TensorDescriptor& dstDesc_flat = std::get<1>(flat_descriptors);
diff --git a/src/pooling_api.cpp b/src/pooling_api.cpp
index bf318f7b78..a2c792f8bb 100644
--- a/src/pooling_api.cpp
+++ b/src/pooling_api.cpp
@@ -50,7 +50,6 @@ inline void Pooling_logging_cmd(const miopenPoolingDescriptor_t poolDesc,
         case miopenFloat: ss << "pool"; break;
         case miopenInt32:
         case miopenInt8:
-        case miopenInt8x4: // Support discontinued.
         case miopenBFloat16:
         case miopenDouble:
         case miopenFloat8:
diff --git a/src/reducetensor.cpp b/src/reducetensor.cpp
index 9cabbc03f6..6ca1067fb8 100644
--- a/src/reducetensor.cpp
+++ b/src/reducetensor.cpp
@@ -211,7 +211,6 @@ inline int GetDataTypeSize(miopenDataType_t t)
     case miopenFloat8:
     case miopenBFloat8:
     case miopenInt8: return (1);
-    case miopenInt8x4: return (4); // Support discontinued.
     case miopenBFloat16: return (2);
     case miopenInt32: return (4);
     default: MIOPEN_THROW("Only float, half, double, bfloat16, int8 data types are supported.");
@@ -268,7 +267,6 @@ inline int GetDataTypeId(miopenDataType_t t)
     case miopenBFloat16: return (static_cast<int>('B'));
     case miopenDouble: return (static_cast<int>('D'));
     case miopenInt8:
-    case miopenInt8x4: // Support discontinued.
     case miopenFloat8:
     case miopenBFloat8:
     case miopenInt32: return (static_cast<int>('O'));
@@ -308,7 +306,6 @@ static ck::DataTypeEnum_t mapDataTypeId(miopenDataType_t t)
     case miopenBFloat16: return DataTypeEnum_t::BFloat16;
     case miopenDouble: return DataTypeEnum_t::Double;
     case miopenInt8: return DataTypeEnum_t::Int8;
-    case miopenInt8x4: return DataTypeEnum_t::Int8x4;
     case miopenInt32: return DataTypeEnum_t::Int32;
     case miopenFloat8:
     case miopenBFloat8:
diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp
index ff17432a62..d1d67ddea6 100644
--- a/src/solver/batchnorm/forward_inference_ck.cpp
+++ b/src/solver/batchnorm/forward_inference_ck.cpp
@@ -200,7 +200,6 @@ bool BnCKFwdInference::IsApplicable(const ExecutionContext& context,
         return (CheckCKApplicability<BF16, BF16, F32, BF16, BF16, F32>(bn_problem) != -1);
     case miopenInt32:
     case miopenInt8:
-    case miopenInt8x4: // Support discontinued.
     case miopenFloat8:
     case miopenBFloat8:
     default: MIOPEN_THROW("Unsupported datatype");
@@ -244,7 +243,6 @@ BnCKFwdInference::GetSolution(const ExecutionContext& context,
                 break;
             case miopenInt8:
             case miopenInt32:
-            case miopenInt8x4: // Support discontinued.
             case miopenFloat8:
             case miopenBFloat8:
             default: MIOPEN_THROW("Unsupported datatype");
diff --git a/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp b/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp
index e2df6f8097..d9c0410d98 100644
--- a/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp
+++ b/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp
@@ -287,7 +287,6 @@ void PerformanceConfigConvCKIgemmFwdBiasActivFused::HeuristicInit(
     case miopenInt8:
     case miopenFloat:
     case miopenInt32:
-    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble:
     default: MIOPEN_THROW("Unsupported datatype");
@@ -342,7 +341,6 @@ bool PerformanceConfigConvCKIgemmFwdBiasActivFused::IsValid(
     case miopenInt8:
     case miopenFloat:
     case miopenInt32:
-    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble:
     default: MIOPEN_THROW("Unsupported datatype");
@@ -435,7 +433,6 @@ bool ConvCKIgemmFwdBiasActivFused::IsApplicable(const FusionContext& ctx,
     case miopenInt8:
     case miopenFloat:
     case miopenInt32:
-    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble:
     default: MIOPEN_THROW("Unsupported datatype");
@@ -469,7 +466,6 @@ ConvSolution ConvCKIgemmFwdBiasActivFused::GetSolution(
             case miopenInt8:
             case miopenFloat:
             case miopenInt32:
-            case miopenInt8x4: // Support discontinued.
             case miopenBFloat16:
             case miopenDouble:
             default: MIOPEN_THROW("Unsupported datatype");
diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp
index 94983c7f0e..cd95c85cf6 100644
--- a/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp
@@ -210,7 +210,6 @@ void PerformanceConfigHipImplicitGemm3DGroupBwdXdlops::HeuristicInit(
     case miopenFloat: Init<float>(problem); break;
     case miopenInt8: Init<int8_t>(problem); break;
     case miopenInt32:
-    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
@@ -253,7 +252,6 @@ bool PerformanceConfigHipImplicitGemm3DGroupBwdXdlops::IsValid(
     case miopenFloat: return CheckIsSupportCKArgs<float>(problem);
     case miopenInt8: return CheckIsSupportCKArgs<int8_t>(problem);
     case miopenInt32:
-    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
@@ -324,7 +322,6 @@ bool ConvHipImplicitGemm3DGroupBwdXdlops::IsApplicable(
     case miopenFloat: return CheckCKApplicability<float>(problem);
     case miopenInt8: return CheckCKApplicability<int8_t>(problem);
     case miopenInt32:
-    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
@@ -352,7 +349,6 @@ ConvSolution ConvHipImplicitGemm3DGroupBwdXdlops::GetSolution(
         return InitInvokerFactory<DeviceOpGBwdPtrs<float>, CKArgs, conv::DataInvokeParams>(
             problem, config.kernel_id);
     case miopenInt32:
-    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble:
     case miopenFloat8:
diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp
index a21c9ba300..4a5ecc8a3f 100644
--- a/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp
@@ -210,7 +210,6 @@ void PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::HeuristicInit(
     case miopenFloat: Init<float>(problem); break;
     case miopenInt8: Init<int8_t>(problem); break;
     case miopenInt32:
-    case miopenInt8x4: // Support discontinued.
     case miopenFloat8:
     case miopenBFloat8:
     case miopenBFloat16:
@@ -253,7 +252,6 @@ bool PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::IsValid(
     case miopenFloat: return CheckIsSupportCKArgs<float>(problem);
     case miopenInt8: return CheckIsSupportCKArgs<int8_t>(problem);
     case miopenInt32:
-    case miopenInt8x4: // Support discontinued.
     case miopenFloat8:
     case miopenBFloat8:
     case miopenBFloat16:
@@ -322,7 +320,6 @@ bool ConvHipImplicitGemm3DGroupFwdXdlops::IsApplicable(
     case miopenFloat: return CheckCKApplicability<float>(problem);
     case miopenInt8: return CheckCKApplicability<int8_t>(problem);
     case miopenInt32:
-    case miopenInt8x4: // Support discontinued.
     case miopenFloat8:
     case miopenBFloat8:
     case miopenBFloat16:
@@ -350,7 +347,6 @@ ConvSolution ConvHipImplicitGemm3DGroupFwdXdlops::GetSolution(
         return InitInvokerFactory<DeviceOpGFwdPtrs<float>, CKArgs, conv::DataInvokeParams>(
             problem, config.kernel_id);
     case miopenInt32:
-    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble:
     case miopenFloat8:
diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp
index 5c92f57142..f09fca29fd 100644
--- a/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp
@@ -206,7 +206,6 @@ void PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::HeuristicInit(
     case miopenFloat: Init<float>(problem); break;
     case miopenInt8: Init<int8_t>(problem); break;
     case miopenInt32:
-    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
@@ -249,7 +248,6 @@ bool PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::IsValid(
     case miopenFloat: return CheckIsSupportCKArgs<float>(problem);
     case miopenInt8: return CheckIsSupportCKArgs<int8_t>(problem);
     case miopenInt32:
-    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
@@ -320,7 +318,6 @@ bool ConvHipImplicitGemm3DGroupWrwXdlops::IsApplicable(
     case miopenFloat: return CheckCKApplicability<float>(problem);
     case miopenInt8: return CheckCKApplicability<int8_t>(problem);
     case miopenInt32:
-    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
@@ -348,7 +345,6 @@ ConvSolution ConvHipImplicitGemm3DGroupWrwXdlops::GetSolution(
         return InitInvokerFactory<DeviceOpGWrwPtrs<float>, CKArgs, conv::WrWInvokeParams>(
             problem, config.kernel_id);
     case miopenInt32:
-    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp
index 5aa17e75bf..d55ded78de 100644
--- a/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp
@@ -172,7 +172,6 @@ void PerformanceConfigHipImplicitGemmBwdXdlops::HeuristicInit(
     case miopenBFloat8:
     case miopenInt8:
     case miopenInt32:
-    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble: break;
     }
@@ -214,7 +213,6 @@ bool PerformanceConfigHipImplicitGemmBwdXdlops::IsValid(
     case miopenBFloat8:
     case miopenInt8:
     case miopenInt32:
-    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble: break;
     }
@@ -294,7 +292,6 @@ bool ConvHipImplicitGemmBwdXdlops::IsApplicable(
     case miopenBFloat8:
     case miopenInt8:
     case miopenInt32:
-    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble: break;
     }
@@ -318,7 +315,6 @@ ConvSolution ConvHipImplicitGemmBwdXdlops::GetSolution(
             problem, config.kernel_id);
     case miopenInt8:
     case miopenInt32:
-    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble:
     case miopenFloat8:
diff --git a/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp
index 7a9c322bc7..697ab33f70 100644
--- a/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp
@@ -172,7 +172,6 @@ void PerformanceConfigHipImplicitGemmFwdXdlops::HeuristicInit(
     case miopenFloat8:
     case miopenBFloat8:
     case miopenInt32:
-    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble: break;
     }
@@ -215,7 +214,6 @@ bool PerformanceConfigHipImplicitGemmFwdXdlops::IsValid(
     case miopenFloat8:
     case miopenBFloat8:
     case miopenInt32:
-    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble: break;
     }
@@ -294,7 +292,6 @@ bool ConvHipImplicitGemmFwdXdlops::IsApplicable(
     case miopenFloat8:
     case miopenBFloat8:
     case miopenInt32:
-    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble: break;
     }
@@ -320,7 +317,6 @@ ConvSolution ConvHipImplicitGemmFwdXdlops::GetSolution(
         return InitInvokerFactory<DeviceOpPtrs<float>, CKArgs, conv::DataInvokeParams>(
             problem, config.kernel_id);
     case miopenInt32:
-    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble:
     case miopenFloat8:
diff --git a/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp
index 0a8b6eeb03..0c5a91b627 100644
--- a/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp
@@ -198,7 +198,6 @@ void PerformanceConfigHipImplicitGemmGroupFwdXdlops::HeuristicInit(
     case miopenFloat: Init<float>(problem); break;
     case miopenInt8: Init<int8_t>(problem); break;
     case miopenInt32:
-    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
@@ -240,7 +239,6 @@ bool PerformanceConfigHipImplicitGemmGroupFwdXdlops::IsValid(
     case miopenFloat: return CheckIsSupportCKArgs<float>(problem);
     case miopenInt8: return CheckIsSupportCKArgs<int8_t>(problem);
     case miopenInt32:
-    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
@@ -311,7 +309,6 @@ bool ConvHipImplicitGemmGroupFwdXdlops::IsApplicable(
     case miopenFloat: return CheckCKApplicability<float>(problem);
     case miopenInt8: return CheckCKApplicability<int8_t>(problem);
     case miopenInt32:
-    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
@@ -339,7 +336,6 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlops::GetSolution(
         return InitInvokerFactory<DeviceOpGFwdPtrs<int8_t>, CKArgs, conv::DataInvokeParams>(
             problem, config.kernel_id);
     case miopenInt32:
-    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble:
     case miopenFloat8:
diff --git a/src/solver/gemm.cpp b/src/solver/gemm.cpp
index 6ae4f5dde7..2cfbd257dc 100644
--- a/src/solver/gemm.cpp
+++ b/src/solver/gemm.cpp
@@ -89,8 +89,6 @@ bool GemmFwdBase::IsApplicable(const ExecutionContext& ctx,
     const auto& yDesc = problem.GetOut();
 
     // rocBlas needs the output to be 32-bit always
-    if(xDesc.GetType() == miopenInt8x4)
-        return false;
     if(xDesc.GetType() == miopenInt8      //
        && (yDesc.GetType() != miopenFloat //
            && yDesc.GetType() != miopenInt32))
diff --git a/src/solver/mlir_common.cpp b/src/solver/mlir_common.cpp
index 4e41b8944a..e54da1ac89 100644
--- a/src/solver/mlir_common.cpp
+++ b/src/solver/mlir_common.cpp
@@ -57,7 +57,6 @@ static const char* DTypeName(miopenDataType_t ty)
     case miopenBFloat16: return "bf16";
     case miopenInt32: return "i32";
     case miopenInt8: return "i8";
-    case miopenInt8x4: return "i8x4"; // Support discontinued.
     case miopenFloat8: return "fp8";
     case miopenBFloat8: return "bfp8";
     }
diff --git a/src/tensor.cpp b/src/tensor.cpp
index df0d7c2819..ba1b2ba106 100644
--- a/src/tensor.cpp
+++ b/src/tensor.cpp
@@ -50,7 +50,6 @@ bool IsDataTypeSupported(miopenDataType_t t)
     case miopenFloat8:
     case miopenBFloat8:
     case miopenInt8:
-    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble: return true;
     }
diff --git a/test/driver.hpp b/test/driver.hpp
index fd83dd1fc5..bcbfe7d676 100644
--- a/test/driver.hpp
+++ b/test/driver.hpp
@@ -274,7 +274,6 @@ struct test_driver
         {
         case miopenHalf: ss << "--half "; break;
         case miopenBFloat16: ss << "--bfloat16 "; break;
-        case miopenInt8x4: ss << "--UNSUPPORED_TYPE "; break;
         case miopenInt8: ss << "--int8 "; break;
         case miopenInt32: ss << "--int32 "; break;
         case miopenFloat: ss << "--float "; break;
@@ -303,7 +302,6 @@ struct test_driver
         {
         case miopenHalf: ret.emplace_back("--half"); break;
         case miopenBFloat16: ret.emplace_back("--bf16"); break;
-        case miopenInt8x4: ret.emplace_back("--UNSUPPORTED_TYPE"); break;
         case miopenInt8: ret.emplace_back("--int8"); break;
         case miopenInt32: ret.emplace_back("--int32"); break;
         case miopenFloat: ret.emplace_back("--float"); break;
diff --git a/test/gtest/conv_embed_db.cpp b/test/gtest/conv_embed_db.cpp
index 4672bb0404..36d1e28123 100644
--- a/test/gtest/conv_embed_db.cpp
+++ b/test/gtest/conv_embed_db.cpp
@@ -73,7 +73,6 @@ void Run2dDriver(miopenDataType_t prec)
     case miopenHalf: params = ConfigWithHalf::GetParam(); break;
     case miopenInt8: params = ConfigWithInt8::GetParam(); break;
     case miopenBFloat16: params = ConfigWithBFloat16::GetParam(); break;
-    case miopenInt8x4: // Support discontinued.
     case miopenInt32:
     case miopenFloat8:
     case miopenBFloat8:
diff --git a/test/gtest/conv_hip_igemm_xdlops.cpp b/test/gtest/conv_hip_igemm_xdlops.cpp
index 3407446557..14e1738a35 100644
--- a/test/gtest/conv_hip_igemm_xdlops.cpp
+++ b/test/gtest/conv_hip_igemm_xdlops.cpp
@@ -65,7 +65,6 @@ void Run2dDriver(miopenDataType_t prec)
     case miopenHalf:
     case miopenBFloat16:
     case miopenFloat:
-    case miopenInt8x4: // Support discontinued.
     case miopenInt32:
     case miopenDouble:
         FAIL() << "miopenHalf, miopenBFloat16, miopenFloat, miopenInt32, "
diff --git a/test/gtest/conv_igemm_dynamic.cpp b/test/gtest/conv_igemm_dynamic.cpp
index 59341ac8a0..6e2863eb8f 100644
--- a/test/gtest/conv_igemm_dynamic.cpp
+++ b/test/gtest/conv_igemm_dynamic.cpp
@@ -68,7 +68,6 @@ void Run2dDriver(miopenDataType_t prec)
     case miopenHalf:
     case miopenInt8:
     case miopenBFloat16:
-    case miopenInt8x4: // Support discontinued.
     case miopenInt32:
     case miopenDouble:
     case miopenFloat8:
diff --git a/test/gtest/conv_igemm_mlir.cpp b/test/gtest/conv_igemm_mlir.cpp
index 29d240645c..c99d0a03c4 100644
--- a/test/gtest/conv_igemm_mlir.cpp
+++ b/test/gtest/conv_igemm_mlir.cpp
@@ -81,7 +81,6 @@ void Run2dDriver(miopenDataType_t prec)
     case miopenInt8: params = ConfigWithInt8::GetParam(); break;
     case miopenFloat: params = ConfigWithFloat::GetParam(); break;
     case miopenBFloat16:
-    case miopenInt8x4: // Support discontinued.
     case miopenInt32:
     case miopenFloat8:
     case miopenBFloat8:
diff --git a/test/gtest/conv_igemm_mlir_xdlops.cpp b/test/gtest/conv_igemm_mlir_xdlops.cpp
index e2c5a3ed8f..ea14e3b68e 100644
--- a/test/gtest/conv_igemm_mlir_xdlops.cpp
+++ b/test/gtest/conv_igemm_mlir_xdlops.cpp
@@ -53,7 +53,6 @@ void Run2dDriver(miopenDataType_t prec)
     case miopenInt8: params = ConfigWithInt8::GetParam(); break;
     case miopenBFloat16:
     case miopenFloat:
-    case miopenInt8x4: // Support discontinued.
     case miopenInt32:
     case miopenDouble:
     case miopenFloat8:
diff --git a/test/gtest/conv_trans.cpp b/test/gtest/conv_trans.cpp
index 185a459947..a5266d9f9c 100644
--- a/test/gtest/conv_trans.cpp
+++ b/test/gtest/conv_trans.cpp
@@ -55,7 +55,6 @@ void Run2dDriver(miopenDataType_t prec)
     case miopenBFloat8:
     case miopenInt8:
     case miopenBFloat16:
-    case miopenInt8x4: // Support discontinued.
     case miopenInt32:
     case miopenDouble:
         FAIL() << "miopenHalf, miopenInt8, miopenBFloat16, miopenInt32, miopenDouble "
diff --git a/test/gtest/db_sync.cpp b/test/gtest/db_sync.cpp
index 13b647ceac..13f0297724 100644
--- a/test/gtest/db_sync.cpp
+++ b/test/gtest/db_sync.cpp
@@ -72,8 +72,6 @@ miopenDataType_t GetDataTypeFromString(const std::string& data_type)
         return miopenHalf;
     else if(data_type == "INT8")
         return miopenInt8;
-    else if(data_type == "INT8x4")
-        return miopenInt8x4; // Support discontinued. Maintain compatibility with old databases.
     else if(data_type == "INT32")
         return miopenInt32;
     else if(data_type == "BF16")

From b6908bcf27490ea16637135bcbec092c75076aae Mon Sep 17 00:00:00 2001
From: atamazov <artem.tamazov@gmail.com>
Date: Fri, 6 Oct 2023 21:44:39 +0300
Subject: [PATCH 3/8] rocblas-remove-int8x4-uses(04) Fix merge error

---
 src/include/miopen/hip_f8_impl.hpp | 362 +---------------
 src/include/miopen/hip_float8.hpp  | 652 +----------------------------
 2 files changed, 2 insertions(+), 1012 deletions(-)

diff --git a/src/include/miopen/hip_f8_impl.hpp b/src/include/miopen/hip_f8_impl.hpp
index c7a62f9f72..22052778a0 120000
--- a/src/include/miopen/hip_f8_impl.hpp
+++ b/src/include/miopen/hip_f8_impl.hpp
@@ -1,361 +1 @@
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2023 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- *all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-// #include <miopen/bfloat16.hpp>
-// #include <half.hpp>
-namespace miopen_hip_f8_impl {
-
-#ifndef __HIP_PLATFORM_HCC__
-using hip_bfloat16 = bfloat16;
-using half         = half_float::half;
-#endif
-
-template <int wm, int we, typename T>
-MIOPEN_HIP_HOST_DEVICE uint8_t cast_to_f8_no_range_reduce(T _x,
-                                                          bool stoch   = false,
-                                                          uint32_t rng = 0)
-{
-    static_assert(we == 5, "we==5");
-    static_assert(sizeof(T) == 2, "no_range_reduce only works for float16");
-
-    uint32_t x = *(reinterpret_cast<uint16_t*>(&_x));
-
-    uint32_t head, mantissa, exponent;
-    uint32_t sign;
-
-    const int mfmt      = 10;
-    head                = x & 0xFC00;
-    mantissa            = x & 0x3FF;
-    exponent            = (head >> 10) & 0x1F;
-    sign                = head >> 15;
-    uint32_t signed_inf = (sign << 7) + (((1 << we) - 1) << wm);
-
-    if((x & 0x7FFF) == 0x7C00)
-        return signed_inf;
-    if((x & 0x7C00) == 0x7C00)
-        return signed_inf + 1;
-    if(x == 0)
-        return 0;
-    if(x == 0x8000)
-        return 0x80;
-
-    //  uint32_t nextbit = 1<<(mfmt-wm-1);
-    uint32_t drop_mask = (1 << (mfmt - wm)) - 1;
-
-    // const int max_exp = (1<<we)-(negative_zero_nan ? 1 : 2);
-    mantissa += (stoch ? rng : mantissa) & drop_mask;
-    if(exponent != 0)
-        mantissa += 1 << mfmt;
-    if(mantissa >= (2 << mfmt))
-    {
-        mantissa >>= 1;
-        exponent++;
-    }
-    else if(mantissa >= (1 << mfmt) && exponent == 0)
-    {
-        exponent++;
-    }
-    mantissa >>= (mfmt - wm);
-    mantissa &= (1 << wm) - 1;
-    if(exponent == 31)
-        return (sign << 7) | 0x7B;
-    return (sign << 7) | (exponent << wm) | mantissa;
-}
-
-template <int wm, int we, typename T, bool negative_zero_nan, bool clip>
-MIOPEN_HIP_HOST_DEVICE uint8_t cast_to_f8(T _x, bool stoch, uint32_t rng)
-{
-    constexpr bool is_half  = std::is_same<T, half>::value;
-    constexpr bool is_float = std::is_same<T, float>::value;
-    static_assert(wm + we == 7, "wm+we==7");
-    static_assert(is_half || is_float, "Only half and float can be cast to f8");
-
-    if(sizeof(T) == 2 && we == 5 && !negative_zero_nan)
-        return cast_to_f8_no_range_reduce<2, 5, half>(static_cast<half>(_x), stoch, rng);
-
-    const int mfmt = (sizeof(T) == 4) ? 23 : 10;
-    uint32_t x;
-    if(sizeof(T) == 4)
-        x = *(reinterpret_cast<uint32_t*>(&_x)); // cppcheck-suppress invalidPointerCast
-    else
-        x = *(reinterpret_cast<uint16_t*>(&_x)); // cppcheck-suppress invalidPointerCast
-
-    uint32_t head, mantissa;
-    int exponent, bias;
-    uint32_t sign;
-
-    if(sizeof(T) == 4)
-    {
-        head     = x & 0xFF800000;
-        mantissa = x & 0x7FFFFF;
-        exponent = (head >> 23) & 0xFF;
-        sign     = head >> 31;
-        bias     = 127;
-    }
-    else
-    {
-        head     = x & 0xFC00;
-        mantissa = x & 0x3FF;
-        exponent = (head >> 10) & 0x1F;
-        sign     = head >> 15;
-        bias     = 15;
-    }
-
-    uint32_t signed_inf = (sign << 7) + (((1 << we) - 1) << wm);
-
-    if(negative_zero_nan)
-    {
-        if(sizeof(T) == 4)
-        {
-            if((x & 0x7F800000) == 0x7F800000)
-                return 0x80;
-        }
-        else
-        {
-            // if(__hisinf(x) || __hisnan(x))
-            if((x & 0x7C00) == 0x7C00)
-                return 0x80;
-        }
-    }
-    else
-    {
-        if(sizeof(T) == 4)
-        {
-            if((x & 0x7F800000) == 0x7F800000)
-                return signed_inf + (mantissa != 0 ? 1 : 0);
-        }
-        else
-        {
-            if((x & 0x7C00) == 0x7C00)
-                return signed_inf + (mantissa != 0 ? 1 : 0);
-        }
-    }
-    if(x == 0)
-        return 0;
-    // First need to check if it is normal or denorm as there is a difference of implict 1
-    // Then need to adjust the exponent to align with the F8 exponent, in the meanwhile, shift
-    // The mantissa. Then for stochastic rounding, add rng to mantissa and truncate. And for
-    // RNE, no need to add rng. Then probably need to check whether there is carry and adjust
-    // exponent and mantissa again
-
-    // For IEEE bias mode, the bias is 2^(k-1) -1 where k is the width of exponent bits
-    const int f8_bias                  = (1 << (we - 1)) - 1 + (negative_zero_nan ? 1 : 0);
-    const int f8_denormal_act_exponent = 1 - f8_bias; // actual exponent of f8 denormal
-    // act_exponent is the actual exponent of fp32/fp16 (after subtracting bias)
-    // f8_exponent is the converted f8 exponent with bias encoding
-    // exponent_diff is the diff between fp32/fp16 exponent and f8 exponent,
-    // the difference needs to be adjusted and mantissa shifted
-    int act_exponent, f8_exponent, exponent_diff;
-
-    if(exponent == 0)
-    { // fp32/fp16 is in denormal.
-        /* fp32 denormal is below 2^-127 so it is usually not a concern here, we mostly concern fp16
-           here. In this case, f8 is usually in denormal. But there could be exceptions. fp16
-           denormal has exponent bias 15 while bf8 with NANOO has exponent bias 16. It means that
-           there are some numbers in fp16 denormal but they are bf8 (NANOO) normals - smallest bf8
-           (NANOO) normal is 2^-15. fp16 numbers where exponent==0 (actual exponent -14) and highest
-           bit of mantissa is 1 are bf8 (NANOO) normal. In this case, the fp16 mantissa should be
-           shift left by 1  */
-        act_exponent  = exponent - bias + 1;
-        exponent_diff = f8_denormal_act_exponent -
-                        act_exponent; // actual exponent is exponent-bias+1 as it is denormal
-    }
-    else
-    { // fp32/fp16 is normal with implicit 1
-        act_exponent = exponent - bias;
-        if(act_exponent <= f8_denormal_act_exponent)
-        {
-            /* This is the case where fp32/fp16 is normal but it is in f8 denormal range.
-               For example fp8 nanoo mode, denormal exponent is -7, but if the fp32/fp16
-               actual exponent is -7, it is actually larger due to the implict 1,
-               Therefore it needs to be adjust to -6 and mantissa shift right by 1.
-               So for fp32/fp16, exponent -8 is the cut point to convert to fp8 nanoo */
-            exponent_diff = f8_denormal_act_exponent - act_exponent;
-        }
-        else
-        { // both fp32/fp16 and f8 are in normal range
-            exponent_diff =
-                0; // exponent_diff=0 does not mean there is no difference for this case,
-                   // act_exponent could be larger. Just that it does not need shift mantissa
-        }
-        mantissa += (1 << mfmt); // Add the implicit 1 into mantissa
-    }
-    const long tmp = (mfmt - wm + exponent_diff);
-    if(tmp == 33)
-        printf("Gotcha");
-
-    bool midpoint = (mantissa & ((static_cast<uint32_t>(1) << (mfmt - wm + exponent_diff)) - 1)) ==
-                    (static_cast<uint32_t>(1) << (mfmt - wm + exponent_diff - 1));
-    /* This part is a bit tricky. The judgment of whether it is a tie needs to be done before we
-       shift right as shift right could rip off some residual part and make something not midpoint
-       look like midpoint. For example, the fp16 number 0x1002 (0 00100 0000000010), it is larger
-       than midpoint, but after shift right by 4 bits, it would look like midpoint.
-       */
-
-    if(exponent_diff > 0)
-        mantissa >>= exponent_diff;
-    else if(exponent_diff == -1)
-        mantissa <<= -exponent_diff;
-    bool implicit_one = mantissa & (1 << mfmt);
-    // if there is no implict 1, it  means the f8 is denormal and need to adjust to denorm exponent
-    f8_exponent =
-        (act_exponent + exponent_diff) /*actual f8 exponent*/ + f8_bias - (implicit_one ? 0 : 1);
-
-    // Now we have the exponent and mantissa adjusted
-    uint32_t drop_mask = (1 << (mfmt - wm)) - 1;
-    bool odd =
-        mantissa & (1 << (mfmt - wm)); // if the least significant bit that is not truncated is 1
-    mantissa += (stoch ? rng : (midpoint ? (odd ? mantissa : mantissa - 1) : mantissa)) & drop_mask;
-
-    // Now we deal with overflow
-    if(f8_exponent == 0)
-    {
-        if((1 << mfmt) & mantissa)
-        {
-            f8_exponent = 1; // denormal overflow to become normal, promote exponent
-        }
-    }
-    else
-    {
-        if((1 << (mfmt + 1)) & mantissa)
-        {
-            mantissa >>= 1;
-            f8_exponent++;
-        }
-    }
-
-    mantissa >>= (mfmt - wm);
-
-    // above range: quantize to maximum possible float of the same sign
-    const int max_exp = (1 << we) - (negative_zero_nan ? 1 : 2);
-    if(f8_exponent > max_exp)
-    {
-        if(clip)
-        {
-            mantissa    = (1 << wm) - 1;
-            f8_exponent = max_exp;
-        }
-        else
-        {
-            return signed_inf;
-        }
-    }
-
-    if(f8_exponent == 0 && mantissa == 0)
-        return negative_zero_nan ? 0 : (sign << 7);
-    mantissa &= (1 << wm) - 1;
-    return (sign << 7) | (f8_exponent << wm) | mantissa;
-}
-
-template <int wm, int we, typename T, bool negative_zero_nan>
-MIOPEN_HIP_HOST_DEVICE T cast_from_f8(uint8_t x)
-{
-    constexpr bool is_half  = std::is_same<T, half>::value;
-    constexpr bool is_float = std::is_same<T, float>::value;
-    static_assert(is_half || is_float, "only half and float are supported");
-
-    constexpr int weo = is_half ? 5 : 8;
-    constexpr int wmo = is_half ? 10 : (is_float ? 23 : 7);
-
-    T fInf, fNegInf, fNaN, fNeg0;
-    if(is_half)
-    {
-        const uint16_t ihInf    = 0x7C00;
-        const uint16_t ihNegInf = 0xFC00;
-        const uint16_t ihNaN    = 0x7C01;
-        const uint16_t ihNeg0   = 0x8000;
-        fInf                    = *(reinterpret_cast<const half*>(&ihInf));
-        fNegInf                 = *(reinterpret_cast<const half*>(&ihNegInf));
-        fNaN                    = *(reinterpret_cast<const half*>(&ihNaN));
-        fNeg0                   = *(reinterpret_cast<const half*>(&ihNeg0));
-    }
-    else if(is_float)
-    {
-        const uint32_t ifInf    = 0x7F800000;
-        const uint32_t ifNegInf = 0xFF800000;
-        const uint32_t ifNaN    = 0x7F800001;
-        const uint32_t ifNeg0   = 0x80000000;
-        fInf = *(reinterpret_cast<const float*>(&ifInf)); // cppcheck-suppress invalidPointerCast
-        fNegInf =
-            *(reinterpret_cast<const float*>(&ifNegInf));   // cppcheck-suppress invalidPointerCast
-        fNaN  = *(reinterpret_cast<const float*>(&ifNaN));  // cppcheck-suppress invalidPointerCast
-        fNeg0 = *(reinterpret_cast<const float*>(&ifNeg0)); // cppcheck-suppress invalidPointerCast
-    }
-
-    if(x == 0)
-        return static_cast<T>(0);
-
-    uint32_t sign     = x >> 7;
-    uint32_t mantissa = x & ((1 << wm) - 1);
-    int exponent      = (x & 0x7F) >> wm;
-    if(negative_zero_nan)
-    {
-        if(x == 0x80)
-            return fNaN;
-    }
-    else
-    {
-        if(x == 0x80)
-            return fNeg0;
-        if(exponent == ((1 << we) - 1))
-            return (mantissa == 0) ? (sign ? fNegInf : fInf) : fNaN;
-    }
-    typename std::conditional<sizeof(T) == 2, uint16_t, uint32_t>::type retval;
-    if(we == 5 && is_half && !negative_zero_nan)
-    {
-        retval = x << 8;
-        return *(reinterpret_cast<const T*>(&retval));
-    }
-
-    const int exp_low_cutoff = (1 << (weo - 1)) - (1 << (we - 1)) + 1 - (negative_zero_nan ? 1 : 0);
-
-    // subnormal input
-    if(exponent == 0)
-    {
-        // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above
-        int sh = 1 + __builtin_clz(mantissa) - (32 - wm);
-        mantissa <<= sh;
-        exponent += 1 - sh;
-        mantissa &= ((1 << wm) - 1);
-    }
-    exponent += exp_low_cutoff - 1;
-    mantissa <<= wmo - wm;
-
-    // subnormal output (occurs when T=half, we=5, negative_zero_nan=true)
-    if(exponent <= 0)
-    {
-        mantissa |= 1 << wmo;
-        mantissa >>= 1 - exponent;
-        exponent = 0;
-    }
-
-    if(sizeof(T) == 2)
-        retval = (sign << 15) | (exponent << 10) | mantissa;
-    else
-        retval = (sign << 31) | (exponent << 23) | mantissa;
-    return *(reinterpret_cast<const T*>(&retval));
-}
-
-} // namespace miopen_hip_f8_impl
+../../kernels/hip_f8_impl.hpp
\ No newline at end of file
diff --git a/src/include/miopen/hip_float8.hpp b/src/include/miopen/hip_float8.hpp
index dd57c9ca5b..5e16a70c91 120000
--- a/src/include/miopen/hip_float8.hpp
+++ b/src/include/miopen/hip_float8.hpp
@@ -1,651 +1 @@
-/*******************************************************************************
- *
- * MIT License
- *
- * Copyright (c) 2023 Advanced Micro Devices, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- *all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- *******************************************************************************/
-#pragma once
-#ifndef MIOPEN_ENABLE_F8_DEVICE_CODE
-#define MIOPEN_ENABLE_F8_DEVICE_CODE 0
-#endif
-
-// FP8 header version 0.4, 2021/05/11
-#if defined __HIP_PLATFORM_HCC__ && MIOPEN_ENABLE_F8_DEVICE_CODE
-// MIOpen by default does not have device code in the regular compilation paths,
-// therefore, when this file is used from the host side, compilation takes much
-// longer. By guarding the __device__ directive we can control that such compilation
-// only happens for kernels which include this file.
-#define MIOPEN_HIP_HOST_DEVICE __host__ __device__
-#else
-#define MIOPEN_HIP_HOST_DEVICE
-#endif
-
-#define USE_SIMPLER_HIP_F8x8 0
-
-#ifndef MIOPEN_FP8_CLIPPING
-#define MIOPEN_FP8_CLIPPING 1
-#endif
-
-#ifndef MIOPEN_FP8_IEEE_EXPONENT_BIAS
-#define MIOPEN_FP8_IEEE_EXPONENT_BIAS 1
-#endif
-
-namespace miopen_hip_f8_impl {
-
-template <int wm, int we, typename T, bool negative_zero_nan, bool clip>
-MIOPEN_HIP_HOST_DEVICE uint8_t cast_to_f8(T _x, bool stoch = false, uint32_t rng = 0);
-
-template <int wm, int we, typename T, bool negative_zero_nan>
-MIOPEN_HIP_HOST_DEVICE T cast_from_f8(uint8_t x);
-
-} // namespace miopen_hip_f8_impl
-
-#include "hip_f8_impl.hpp"
-
-namespace miopen_f8 {
-enum class hip_f8_type
-{
-    bf8 = 0, // 1:5:2
-    fp8 = 1  // 1:4:3
-};
-
-enum class hip_f8_rounding_mode
-{
-    standard,
-    stochastic
-};
-
-inline MIOPEN_HIP_HOST_DEVICE bool get_hip_f8_bias_mode()
-{
-#if MIOPEN_FP8_IEEE_EXPONENT_BIAS
-    return false;
-#else
-    return true;
-#endif
-}
-
-template <hip_f8_type T>
-struct hip_f8
-{
-    uint8_t data;
-
-    // default constructor
-    MIOPEN_HIP_HOST_DEVICE hip_f8() = default;
-
-    MIOPEN_HIP_HOST_DEVICE hip_f8(hip_f8<T> const&) = default;
-
-    // constructor from bits
-    explicit MIOPEN_HIP_HOST_DEVICE hip_f8(uint8_t v) { data = v; }
-
-    // constructor from in
-    explicit MIOPEN_HIP_HOST_DEVICE hip_f8(int v) : hip_f8(static_cast<float>(v)) {}
-
-    explicit MIOPEN_HIP_HOST_DEVICE hip_f8(double v) : hip_f8(static_cast<float>(v)) {}
-
-    // constructor from float
-    explicit MIOPEN_HIP_HOST_DEVICE
-    hip_f8(float v,
-           miopen_f8::hip_f8_rounding_mode rm = miopen_f8::hip_f8_rounding_mode::standard,
-           uint32_t rng                       = 0)
-    {
-        if(T == hip_f8_type::bf8)
-        {
-            if(get_hip_f8_bias_mode())
-            {
-                data = miopen_hip_f8_impl::cast_to_f8<2,
-                                                      5,
-                                                      float,
-                                                      true /*negative_zero_nan*/,
-                                                      MIOPEN_FP8_CLIPPING /*clip*/>(
-                    v, (rm == miopen_f8::hip_f8_rounding_mode::stochastic), rng);
-            }
-            else
-            {
-                data = miopen_hip_f8_impl::cast_to_f8<2,
-                                                      5,
-                                                      float,
-                                                      false /*negative_zero_nan*/,
-                                                      MIOPEN_FP8_CLIPPING /*clip*/>(
-                    v, (rm == miopen_f8::hip_f8_rounding_mode::stochastic), rng);
-            }
-        }
-        else /* fp8*/
-        {
-            if(get_hip_f8_bias_mode())
-            {
-                data = miopen_hip_f8_impl::cast_to_f8<3,
-                                                      4,
-                                                      float,
-                                                      true /*negative_zero_nan*/,
-                                                      MIOPEN_FP8_CLIPPING /*clip*/>(
-                    v, (rm == miopen_f8::hip_f8_rounding_mode::stochastic), rng);
-            }
-            else
-            {
-                data = miopen_hip_f8_impl::cast_to_f8<3,
-                                                      4,
-                                                      float,
-                                                      false /*negative_zero_nan*/,
-                                                      MIOPEN_FP8_CLIPPING /*clip*/>(
-                    v, (rm == miopen_f8::hip_f8_rounding_mode::stochastic), rng);
-            }
-        }
-    }
-
-    // constructor from half
-    explicit MIOPEN_HIP_HOST_DEVICE
-    hip_f8(half v,
-           miopen_f8::hip_f8_rounding_mode rm = miopen_f8::hip_f8_rounding_mode::standard,
-           uint32_t rng                       = 0)
-    {
-        if(T == hip_f8_type::bf8)
-        {
-            if(get_hip_f8_bias_mode())
-            {
-                data = miopen_hip_f8_impl::cast_to_f8<2,
-                                                      5,
-                                                      half,
-                                                      true /*negative_zero_nan*/,
-                                                      MIOPEN_FP8_CLIPPING /*clip*/>(
-                    v, (rm == miopen_f8::hip_f8_rounding_mode::stochastic), rng);
-            }
-            else
-            {
-                data = miopen_hip_f8_impl::cast_to_f8<2,
-                                                      5,
-                                                      half,
-                                                      false /*negative_zero_nan*/,
-                                                      MIOPEN_FP8_CLIPPING /*clip*/>(
-                    v, (rm == miopen_f8::hip_f8_rounding_mode::stochastic), rng);
-            }
-        }
-        else /* fp8*/
-        {
-            if(get_hip_f8_bias_mode())
-            {
-                data = miopen_hip_f8_impl::cast_to_f8<3,
-                                                      4,
-                                                      half,
-                                                      true /*negative_zero_nan*/,
-                                                      MIOPEN_FP8_CLIPPING /*clip*/>(
-                    v, (rm == miopen_f8::hip_f8_rounding_mode::stochastic), rng);
-            }
-            else
-            {
-                data = miopen_hip_f8_impl::cast_to_f8<3,
-                                                      4,
-                                                      half,
-                                                      false /*negative_zero_nan*/,
-                                                      MIOPEN_FP8_CLIPPING /*clip*/>(
-                    v, (rm == miopen_f8::hip_f8_rounding_mode::stochastic), rng);
-            }
-        }
-    }
-    template <hip_f8_type U>
-    explicit MIOPEN_HIP_HOST_DEVICE
-    hip_f8(hip_f8<U> v,
-           miopen_f8::hip_f8_rounding_mode rm = miopen_f8::hip_f8_rounding_mode::standard,
-           uint32_t rng                       = 0)
-    {
-        if(T == U)
-        {
-            data = v.data;
-        }
-        else
-        {
-            const auto tmp  = static_cast<float>(v);
-            const auto tmp2 = hip_f8<U>(tmp, rm, rng);
-            data            = tmp2.data;
-        }
-    }
-
-    explicit MIOPEN_HIP_HOST_DEVICE hip_f8(hip_f8<T> v, hip_f8_rounding_mode, uint32_t)
-    {
-        this->data = v.data;
-    }
-
-    // constructor from hip_bfloat16
-    explicit MIOPEN_HIP_HOST_DEVICE
-    hip_f8(hip_bfloat16 v,
-           hip_f8_rounding_mode r = miopen_f8::hip_f8_rounding_mode::standard,
-           uint32_t rng           = 0);
-
-    MIOPEN_HIP_HOST_DEVICE
-    hip_f8& operator*=(const hip_f8& rhs)
-    {
-        const auto tmp = static_cast<float>(*this) * static_cast<float>(rhs);
-        *this          = static_cast<hip_f8>(tmp);
-        return *this;
-    }
-
-    MIOPEN_HIP_HOST_DEVICE
-    hip_f8& operator+=(const hip_f8& rhs)
-    {
-        const auto tmp = static_cast<float>(*this) + static_cast<float>(rhs);
-        *this          = static_cast<hip_f8>(tmp);
-        return *this;
-    }
-
-    MIOPEN_HIP_HOST_DEVICE
-    hip_f8& operator-=(const hip_f8& rhs)
-    {
-        const auto tmp = static_cast<float>(*this) - static_cast<float>(rhs);
-        *this          = static_cast<hip_f8>(tmp);
-        return *this;
-    }
-
-    inline MIOPEN_HIP_HOST_DEVICE hip_f8& operator=(const hip_f8& rhs)
-    {
-        if(&rhs != this)
-            this->data = rhs.data;
-        return *this;
-    }
-
-    inline MIOPEN_HIP_HOST_DEVICE bool operator==(const hip_f8& rhs) const
-    {
-        if((rhs.is_zero() && this->is_zero()) ||
-           (fabs(rhs - *this) < std::numeric_limits<hip_f8<T>>::epsilon()))
-            return true;
-        else if(rhs.is_nan() || rhs.is_inf() || this->is_nan() || this->is_inf())
-            return false;
-
-        return false;
-    }
-
-    inline MIOPEN_HIP_HOST_DEVICE bool operator<(const hip_f8& rhs) const
-    {
-        const auto we   = static_cast<float>(*this);
-        const auto them = static_cast<float>(rhs);
-        return we < them;
-    }
-
-    inline MIOPEN_HIP_HOST_DEVICE bool operator>(const hip_f8& rhs) const
-    {
-        const auto we   = static_cast<float>(*this);
-        const auto them = static_cast<float>(rhs);
-        return we > them;
-    }
-#if 0
-    /*explicit*/ inline MIOPEN_HIP_HOST_DEVICE operator double()
-    {
-        // float tmp = static_cast<float>(*this);
-        // return tmp;
-    }
-
-    /*explicit*/ inline MIOPEN_HIP_HOST_DEVICE operator double() const
-    {
-        // float tmp = static_cast<float>(*this);
-        // return tmp;
-    }
-#endif
-    // convert to float
-    /*explicit*/ inline MIOPEN_HIP_HOST_DEVICE operator float() const
-    {
-        if(T == hip_f8_type::bf8)
-        {
-            if(get_hip_f8_bias_mode())
-            {
-                return miopen_hip_f8_impl::cast_from_f8<2, 5, float, true /*negative_zero_nan*/>(
-                    data);
-            }
-            else
-            {
-                return miopen_hip_f8_impl::cast_from_f8<2, 5, float, false /*negative_zero_nan*/>(
-                    data);
-            }
-        }
-        else /* fp8*/
-        {
-            if(get_hip_f8_bias_mode())
-            {
-                return miopen_hip_f8_impl::cast_from_f8<3, 4, float, true /*negative_zero_nan*/>(
-                    data);
-            }
-            else
-            {
-                return miopen_hip_f8_impl::cast_from_f8<3, 4, float, false /*negative_zero_nan*/>(
-                    data);
-            }
-        }
-    }
-
-    // convert to half
-    /*explicit*/ inline MIOPEN_HIP_HOST_DEVICE operator half() const
-    {
-        if(T == hip_f8_type::bf8)
-        {
-            if(get_hip_f8_bias_mode())
-            {
-                return miopen_hip_f8_impl::cast_from_f8<2, 5, half, true /*negative_zero_nan*/>(
-                    data);
-            }
-            else
-            {
-                return miopen_hip_f8_impl::cast_from_f8<2, 5, half, false /*negative_zero_nan*/>(
-                    data);
-            }
-        }
-        else /* fp8*/
-        {
-            if(get_hip_f8_bias_mode())
-            {
-                return miopen_hip_f8_impl::cast_from_f8<3, 4, half, true /*negative_zero_nan*/>(
-                    data);
-            }
-            else
-            {
-                return miopen_hip_f8_impl::cast_from_f8<3, 4, half, false /*negative_zero_nan*/>(
-                    data);
-            }
-        }
-    }
-
-    // convert to hip_bfloat16
-    /*explicit*/ inline MIOPEN_HIP_HOST_DEVICE operator hip_bfloat16() const;
-
-    // check for zero
-    inline MIOPEN_HIP_HOST_DEVICE bool is_zero() const
-    {
-        if(get_hip_f8_bias_mode())
-        {
-            return data == 0x00;
-        }
-        else
-        {
-            return (data == 0x00) || (data == 0x80);
-        }
-    }
-
-    // check for nan
-    inline MIOPEN_HIP_HOST_DEVICE bool is_nan() const
-    {
-        if(get_hip_f8_bias_mode())
-        {
-            return data == 0x80;
-        }
-        else
-        {
-            if(T == hip_f8_type::bf8)
-            {
-                return (data == 0x7d) || (data == 0x7e) || (data == 0x7f) || (data == 0xfd) ||
-                       (data == 0xfe) || (data == 0xff);
-            }
-            else
-            {
-                return (data == 0x79) || (data == 0x7a) || (data == 0x7b) || (data == 0x7c) ||
-                       (data == 0x7d) || (data == 0x7e) || (data == 0x7f) || (data == 0xf9) ||
-                       (data == 0xfa) || (data == 0xfb) || (data == 0xfc) || (data == 0xfd) ||
-                       (data == 0xfe) || (data == 0xff);
-            }
-        }
-    }
-
-    // check for inf
-    inline MIOPEN_HIP_HOST_DEVICE bool is_inf() const
-    {
-        if(get_hip_f8_bias_mode())
-        {
-            return data == 0x80;
-        }
-        else
-        {
-            if(T == hip_f8_type::bf8)
-            {
-                return (data == 0x7c) || (data == 0xfc);
-            }
-            else
-            {
-                return (data == 0x78) || (data == 0xf8);
-            }
-        }
-    }
-}; // end of class hip_f8
-
-template <miopen_f8::hip_f8_type T>
-inline MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<T> operator*(miopen_f8::hip_f8<T> lhs,
-                                                             const miopen_f8::hip_f8<T>& rhs)
-{
-    lhs *= rhs;
-    return lhs;
-}
-
-template <miopen_f8::hip_f8_type T>
-inline MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<T> operator+(miopen_f8::hip_f8<T> lhs,
-                                                             const miopen_f8::hip_f8<T>& rhs)
-{
-    lhs += rhs;
-    return lhs;
-}
-
-template <miopen_f8::hip_f8_type T>
-inline MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<T> operator-(miopen_f8::hip_f8<T> lhs,
-                                                             const miopen_f8::hip_f8<T>& rhs)
-{
-    lhs -= rhs;
-    return lhs;
-}
-
-template <miopen_f8::hip_f8_type T, typename U>
-inline MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<T> operator-(U lhs, const miopen_f8::hip_f8<T>& rhs)
-{
-    const auto tmp = static_cast<U>(rhs);
-    return static_cast<miopen_f8::hip_f8<T>>(lhs - tmp);
-}
-
-template <miopen_f8::hip_f8_type T>
-inline MIOPEN_HIP_HOST_DEVICE bool operator<(const miopen_f8::hip_f8<T>& lhs,
-                                             const miopen_f8::hip_f8<T>& rhs)
-{
-    return static_cast<float>(lhs) < static_cast<float>(rhs);
-}
-
-template <miopen_f8::hip_f8_type T>
-inline MIOPEN_HIP_HOST_DEVICE bool operator>(const miopen_f8::hip_f8<T>& lhs,
-                                             const miopen_f8::hip_f8<T>& rhs)
-{
-    return static_cast<float>(lhs) > static_cast<float>(rhs);
-}
-
-template <miopen_f8::hip_f8_type T>
-inline MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<T> fabs(miopen_f8::hip_f8<T> v)
-{
-    v.data = v.data & 0x7f;
-    return v;
-}
-template <class T>
-MIOPEN_HIP_HOST_DEVICE T F8_Max()
-{
-    union
-    {
-        uint8_t bits;
-        T value;
-    } x;
-
-    x.bits = 0x7F;
-    return x.value;
-}
-} // namespace miopen_f8
-
-// define numeric limits for the new data type
-namespace std {
-inline bool isfinite(miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8> x) // NOLINT
-{
-    return x.is_inf();
-}
-
-inline bool isfinite(miopen_f8::hip_f8<miopen_f8::hip_f8_type::bf8> x) // NOLINT
-{
-    return x.is_inf();
-}
-
-template <>
-class numeric_limits<miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8>>
-{
-public:
-    static MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8> epsilon()
-    {
-        return static_cast<miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8>>(float(0.0625));
-    }
-
-    static MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8> quiet_NaN()
-    {
-        return static_cast<miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8>>(
-            static_cast<uint8_t>(miopen_f8::get_hip_f8_bias_mode() ? 0X80 : 0x79));
-    }
-
-    static MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8> max()
-    {
-        return miopen_f8::F8_Max<miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8>>();
-    }
-
-    static MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8> min()
-    {
-        return static_cast<miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8>>(-1.0f) *
-               miopen_f8::F8_Max<miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8>>();
-    }
-};
-
-template <>
-class numeric_limits<miopen_f8::hip_f8<miopen_f8::hip_f8_type::bf8>>
-{
-public:
-    static MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<miopen_f8::hip_f8_type::bf8> epsilon()
-    {
-        return static_cast<miopen_f8::hip_f8<miopen_f8::hip_f8_type::bf8>>(float(0.125));
-    }
-
-    static MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<miopen_f8::hip_f8_type::bf8> quiet_NaN()
-    {
-        return static_cast<miopen_f8::hip_f8<miopen_f8::hip_f8_type::bf8>>(
-            static_cast<uint8_t>(miopen_f8::get_hip_f8_bias_mode() ? 0X80 : 0x7d));
-    }
-
-    static MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<miopen_f8::hip_f8_type::bf8> max()
-    {
-        return static_cast<miopen_f8::hip_f8<miopen_f8::hip_f8_type::bf8>>(
-            miopen_f8::F8_Max<miopen_f8::hip_f8<miopen_f8::hip_f8_type::bf8>>());
-    }
-    static MIOPEN_HIP_HOST_DEVICE miopen_f8::hip_f8<miopen_f8::hip_f8_type::bf8> min()
-    {
-        return static_cast<miopen_f8::hip_f8<miopen_f8::hip_f8_type::bf8>>(-1.0f) *
-               miopen_f8::F8_Max<miopen_f8::hip_f8<miopen_f8::hip_f8_type::bf8>>();
-    }
-};
-
-} // namespace std
-
-template <miopen_f8::hip_f8_type T>
-struct hip_f8x4
-{
-    // define some convenience types
-    using float32x2 = float __attribute__((ext_vector_type(2)));
-    using float32x4 = float __attribute__((ext_vector_type(4)));
-
-    using halfx2 = _Float16 __attribute__((ext_vector_type(2)));
-    using halfx4 = _Float16 __attribute__((ext_vector_type(4)));
-
-    using hip_bfloat16x2 = uint16_t __attribute__((ext_vector_type(2)));
-    using hip_bfloat16x4 = uint16_t __attribute__((ext_vector_type(4)));
-
-    uint32_t data;
-
-    // default constructor
-    MIOPEN_HIP_HOST_DEVICE hip_f8x4() = default;
-
-    // constructor from bits
-    MIOPEN_HIP_HOST_DEVICE hip_f8x4(uint32_t v);
-
-    // constructor from float
-    MIOPEN_HIP_HOST_DEVICE
-    hip_f8x4(float v0,
-             float v1                           = 0,
-             float v2                           = 0,
-             float v3                           = 0,
-             miopen_f8::hip_f8_rounding_mode rm = miopen_f8::hip_f8_rounding_mode::standard,
-             uint32_t rng                       = 0);
-    MIOPEN_HIP_HOST_DEVICE
-    hip_f8x4(float32x2 v,
-             miopen_f8::hip_f8_rounding_mode rm = miopen_f8::hip_f8_rounding_mode::standard,
-             uint32_t rng                       = 0);
-    MIOPEN_HIP_HOST_DEVICE
-    hip_f8x4(float32x4 v,
-             miopen_f8::hip_f8_rounding_mode rm = miopen_f8::hip_f8_rounding_mode::standard,
-             uint32_t rng                       = 0);
-
-    // constructor from half
-    MIOPEN_HIP_HOST_DEVICE
-    hip_f8x4(half v0,
-             half v1                            = {},
-             half v2                            = {},
-             half v3                            = {},
-             miopen_f8::hip_f8_rounding_mode rm = miopen_f8::hip_f8_rounding_mode::standard,
-             uint32_t rng                       = 0);
-    MIOPEN_HIP_HOST_DEVICE
-    hip_f8x4(halfx2 v,
-             miopen_f8::hip_f8_rounding_mode rm = miopen_f8::hip_f8_rounding_mode::standard,
-             uint32_t rng                       = 0);
-    MIOPEN_HIP_HOST_DEVICE
-    hip_f8x4(halfx4 v,
-             miopen_f8::hip_f8_rounding_mode rm = miopen_f8::hip_f8_rounding_mode::standard,
-             uint32_t rng                       = 0);
-
-    // convert to float32x4
-    inline MIOPEN_HIP_HOST_DEVICE operator float32x4() const;
-
-    // convert to halfx4
-    inline MIOPEN_HIP_HOST_DEVICE operator halfx4() const;
-};
-
-template <miopen_f8::hip_f8_type T>
-struct hip_f8x8
-{
-    // define some convenience types
-    using f8x8 = hip_f8x4<T> __attribute__((ext_vector_type(2)));
-
-    f8x8 data;
-
-    // default constructor
-    MIOPEN_HIP_HOST_DEVICE hip_f8x8() = default;
-
-    // do we need to define other constructors or any conversion routines here?
-};
-
-// If we do not end up needing either any constructors or conversion routines for the above type,
-// then we can simplify the above type to the following
-#if USE_SIMPLER_HIP_F8x8
-template <hip_f8_type T>
-using hip_f8x8 = hip_f8x4<T> __attribute__((ext_vector_type(2)));
-#endif
-
-using hip_float32x4  = float __attribute__((ext_vector_type(4)));
-using hip_float32x16 = float __attribute__((ext_vector_type(16)));
-
-// these are device-specific and we don't expect them to exist unless we're compiling with hip-clang
-// for MI300.
-template <miopen_f8::hip_f8_type T_A, miopen_f8::hip_f8_type T_B>
-__device__ hip_float32x4 mfma_f32_16x16x32(hip_f8x8<T_A> a, hip_f8x8<T_B> b, hip_float32x4 c);
-
-template <miopen_f8::hip_f8_type T_A, miopen_f8::hip_f8_type T_B>
-__device__ hip_float32x16 mfma_f32_32x32x16(hip_f8x8<T_A> a, hip_f8x8<T_B> b, hip_float32x16 c);
-
-using float8  = miopen_f8::hip_f8<miopen_f8::hip_f8_type::fp8>;
-using bfloat8 = miopen_f8::hip_f8<miopen_f8::hip_f8_type::bf8>;
+../../kernels/hip_float8.hpp
\ No newline at end of file

From 7efc89cae9250aeceba4fba534996d7d4fd3f709 Mon Sep 17 00:00:00 2001
From: atamazov <artem.tamazov@gmail.com>
Date: Sat, 7 Oct 2023 01:02:08 +0300
Subject: [PATCH 4/8] rocblas-remove-int8x4-uses(05) Doc update + review
 comments

---
 docs/datatypes.md       | 3 +--
 include/miopen/miopen.h | 9 +++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/datatypes.md b/docs/datatypes.md
index 1a46e5fd62..f5d8f28931 100644
--- a/docs/datatypes.md
+++ b/docs/datatypes.md
@@ -10,7 +10,7 @@ typedef enum {
     miopenFloat    = 1,
     miopenInt32    = 2,
     miopenInt8     = 3,
-    miopenInt8x4   = 4,
+    /* Value 4 is reserved. */
     miopenBFloat16 = 5,
 } miopenDataType_t;
 ```
@@ -22,7 +22,6 @@ Type descriptions:
  * `miopenFloat` - 32-bit floating point
  * `miopenInt32` - 32-bit integer, used primarily for `int8` convolution outputs
  * `miopenInt8` - 8-bit integer, currently only supported by `int8` convolution forward path, tensor set, tensor copy, tensor cast, tensor transform, tensor transpose, and im2col.
- * `miopenInt8x4` - 8-bit 4 element vector type used primarily with `int8` convolutions forward path.
  * `miopenBFloat16` - brain float fp-16 (8-bit exponent, 7-bit fraction), currently only supported by convolutions, tensor set, and tensor copy.
 
 
diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h
index fd978c6200..7121ada6bf 100644
--- a/include/miopen/miopen.h
+++ b/include/miopen/miopen.h
@@ -346,10 +346,11 @@ MIOPEN_DECLARE_OBJECT(miopenReduceTensorDescriptor);
  */
 typedef enum
 {
-    miopenHalf     = 0, /*!< 16-bit floating point (Fully supported) */
-    miopenFloat    = 1, /*!< 32-bit floating point (Fully supported) */
-    miopenInt32    = 2, /*!< 32-bit int point (Partially supported) */
-    miopenInt8     = 3, /*!< 8-bit int point (Partially supported) */
+    miopenHalf  = 0, /*!< 16-bit floating point (Fully supported) */
+    miopenFloat = 1, /*!< 32-bit floating point (Fully supported) */
+    miopenInt32 = 2, /*!< 32-bit int point (Partially supported) */
+    miopenInt8  = 3, /*!< 8-bit int point (Partially supported) */
+    // miopenInt8x4   = 4, /*!< Pack of 4x Int8 in NCHW_VECT_C format (Support discontinued) */
     miopenBFloat16 = 5, /*!< 16-bit binary floating point (8-bit exponent, 7-bit fraction)
                            (Partially supported) */
     miopenDouble  = 6,  /*!< 64-bit floating point (Partially supported) */

From e984c54ba848cdf6db72e615f3287258211d4d8a Mon Sep 17 00:00:00 2001
From: atamazov <artem.tamazov@gmail.com>
Date: Sat, 7 Oct 2023 01:38:14 +0300
Subject: [PATCH 5/8] rocblas-remove-int8x4-uses(07) Fix build errors after
 merge of #2385

---
 src/solver/batchnorm/backward_ck.cpp         | 2 --
 src/solver/batchnorm/forward_training_ck.cpp | 2 --
 2 files changed, 4 deletions(-)

diff --git a/src/solver/batchnorm/backward_ck.cpp b/src/solver/batchnorm/backward_ck.cpp
index fba8724990..35fb8adbad 100644
--- a/src/solver/batchnorm/backward_ck.cpp
+++ b/src/solver/batchnorm/backward_ck.cpp
@@ -182,7 +182,6 @@ bool BnCKBwdBackward::IsApplicable(const ExecutionContext& ctx,
         return CheckCKApplicability<BF16, F32, F32, F32, BF16, F32, F32>(bn_problem);
     case miopenInt32:
     case miopenInt8:
-    case miopenInt8x4:
     case miopenBFloat8:
     case miopenFloat8:
     default: MIOPEN_THROW("Unsupported datatype");
@@ -236,7 +235,6 @@ ConvSolution BnCKBwdBackward::GetSolution(
         return MakeAnyInvokerFactory<BF16, F32, F32, F32, BF16, F32, F32>(bn_problem);
     case miopenInt8:
     case miopenInt32:
-    case miopenInt8x4:
     case miopenBFloat8:
     case miopenFloat8:
     default:
diff --git a/src/solver/batchnorm/forward_training_ck.cpp b/src/solver/batchnorm/forward_training_ck.cpp
index a65cec14a9..17191a4c9e 100644
--- a/src/solver/batchnorm/forward_training_ck.cpp
+++ b/src/solver/batchnorm/forward_training_ck.cpp
@@ -174,7 +174,6 @@ bool BnCKFwdTraining::IsApplicable(const ExecutionContext& context,
     case miopenBFloat16: return CheckCKApplicability<BF16, BF16, F32, BF16, BF16, F32>(bn_problem);
     case miopenInt32:
     case miopenInt8:
-    case miopenInt8x4:
     case miopenBFloat8:
     case miopenFloat8:
     default: MIOPEN_THROW("BnCKFwdTraining operation does not supprot this data type");
@@ -224,7 +223,6 @@ ConvSolution BnCKFwdTraining::GetSolution(
     case miopenBFloat16: return MakeAnyInvokerFactory<BF16, BF16, F32, BF16, BF16, F32>(bn_problem);
     case miopenInt8:
     case miopenInt32:
-    case miopenInt8x4:
     case miopenBFloat8:
     case miopenFloat8:
     default:

From 27f146a5c971ca13bd3d611f5f99f487ad8896e7 Mon Sep 17 00:00:00 2001
From: atamazov <artem.tamazov@gmail.com>
Date: Tue, 10 Oct 2023 00:23:56 +0300
Subject: [PATCH 6/8] rocblas-remove-int8x4-uses(08) Revert
 "rocblas-remove-int8x4-uses(02) Removed the rest of INT8x4 support (except
 the driver)"

This reverts commit 6737d66a66206f1ad90ab9da5871ada2eeb02875.

# RESOLVED Conflicts:
#	include/miopen/miopen.h
---
 driver/conv_driver.hpp                           |  4 ----
 include/miopen/miopen.h                          |  2 +-
 src/check_numerics.cpp                           |  1 +
 src/driver_arguments.cpp                         |  4 +++-
 src/gemm_v2.cpp                                  | 16 ++++++++--------
 src/hip/general_tensor_reorder_sol.cpp           |  2 ++
 src/include/miopen/conv/problem_description.hpp  |  1 +
 src/include/miopen/datatype.hpp                  |  5 +++++
 src/include/miopen/tensor.hpp                    |  1 +
 src/include/miopen/visit_float.hpp               |  1 +
 src/ocl/tensorocl.cpp                            |  6 ++++++
 src/pooling_api.cpp                              |  1 +
 src/reducetensor.cpp                             |  3 +++
 src/solver/batchnorm/forward_inference_ck.cpp    |  2 ++
 .../conv_ck_igemm_fwd_bias_activ_fused.cpp       |  4 ++++
 ...v_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp |  4 ++++
 ...v_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp |  4 ++++
 ...v_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp |  4 ++++
 .../conv_hip_implicit_gemm_bwd_data_xdlops.cpp   |  4 ++++
 src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp |  4 ++++
 ...conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp |  4 ++++
 src/solver/gemm.cpp                              |  2 ++
 src/solver/mlir_common.cpp                       |  1 +
 src/tensor.cpp                                   |  1 +
 test/driver.hpp                                  |  2 ++
 test/gtest/conv_embed_db.cpp                     |  1 +
 test/gtest/conv_hip_igemm_xdlops.cpp             |  1 +
 test/gtest/conv_igemm_dynamic.cpp                |  1 +
 test/gtest/conv_igemm_mlir.cpp                   |  1 +
 test/gtest/conv_igemm_mlir_xdlops.cpp            |  1 +
 test/gtest/conv_trans.cpp                        |  1 +
 test/gtest/db_sync.cpp                           |  2 ++
 32 files changed, 77 insertions(+), 14 deletions(-)

diff --git a/driver/conv_driver.hpp b/driver/conv_driver.hpp
index 3c28859776..5f67b83588 100644
--- a/driver/conv_driver.hpp
+++ b/driver/conv_driver.hpp
@@ -83,10 +83,6 @@ MIOPEN_DECLARE_ENV_VAR(MIOPEN_DRIVER_PAD_BUFFERS_2M)
 MIOPEN_DECLARE_ENV_VAR(MIOPEN_DRIVER_USE_GPU_REFERENCE)
 MIOPEN_DECLARE_ENV_VAR(MIOPEN_DRIVER_SUBNORM_PERCENTAGE)
 
-// Support in the library discontinued, but left in the driver
-// for reference in the future.
-#define miopenInt8x4 (static_cast<miopenDataType_t>(4))
-
 #if MIOPEN_BACKEND_OPENCL
 #define STATUS_SUCCESS CL_SUCCESS
 typedef cl_int status_t;
diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h
index 7121ada6bf..600484e392 100644
--- a/include/miopen/miopen.h
+++ b/include/miopen/miopen.h
@@ -350,7 +350,7 @@ typedef enum
     miopenFloat = 1, /*!< 32-bit floating point (Fully supported) */
     miopenInt32 = 2, /*!< 32-bit int point (Partially supported) */
     miopenInt8  = 3, /*!< 8-bit int point (Partially supported) */
-    // miopenInt8x4   = 4, /*!< Pack of 4x Int8 in NCHW_VECT_C format (Support discontinued) */
+    miopenInt8x4   = 4, /*!< Pack of four Int8 in NCHW_VECT_C format (Support discontinued) */
     miopenBFloat16 = 5, /*!< 16-bit binary floating point (8-bit exponent, 7-bit fraction)
                            (Partially supported) */
     miopenDouble  = 6,  /*!< 64-bit floating point (Partially supported) */
diff --git a/src/check_numerics.cpp b/src/check_numerics.cpp
index 12210069df..50cb0af4af 100644
--- a/src/check_numerics.cpp
+++ b/src/check_numerics.cpp
@@ -64,6 +64,7 @@ std::string GetKernelName(miopenDataType_t data_type)
     case miopenBFloat8: return {"check_numerics_bf8"};
     case miopenInt32:
     case miopenInt8:
+    case miopenInt8x4: // Support discontinued.
     case miopenDouble:
     default: return {""};
     }
diff --git a/src/driver_arguments.cpp b/src/driver_arguments.cpp
index d9e82d270a..53aa6b41ab 100644
--- a/src/driver_arguments.cpp
+++ b/src/driver_arguments.cpp
@@ -50,7 +50,7 @@ void ConvDataType(std::stringstream& ss, const miopen::TensorDescriptor& desc)
     {
         ss << "convbfp16";
     }
-    else if(desc.GetType() == miopenInt8)
+    else if(desc.GetType() == miopenInt8 || desc.GetType() == miopenInt8x4)
     {
         ss << "convint8";
     }
@@ -190,6 +190,8 @@ std::string ConvArgsForMIOpenDriver(const miopen::TensorDescriptor& xDesc,
     ss << " -g " << convDesc.group_count;
     if(print_for_conv_driver)
         ss << " -F " << std::to_string(static_cast<int>(conv_dir)) << " -t 1"; // clang-format on
+    if(xDesc.GetType() == miopenInt8x4)
+        ss << " -Z 1";
     if(immediate_mode_solver_id.has_value())
     {
         ss << " -S " << *immediate_mode_solver_id;
diff --git a/src/gemm_v2.cpp b/src/gemm_v2.cpp
index c4c6057687..bd2f916092 100644
--- a/src/gemm_v2.cpp
+++ b/src/gemm_v2.cpp
@@ -611,9 +611,9 @@ miopenStatus_t CallGemm(const Handle& handle,
         };
         break;
 
+        case miopenInt8x4:
         case miopenDouble: {
-            MIOPEN_THROW(miopenStatusBadParm,
-                         "miopenDouble data type not supported by MIOpenGEMM.");
+            MIOPEN_THROW(miopenStatusBadParm, "Unknown or unsupported data type.");
         };
         break;
         }
@@ -877,10 +877,10 @@ miopenStatus_t CallGemmStridedBatched(const Handle& handle,
             break;
         }
 
+        case miopenInt8x4:
         case miopenDouble: {
-            MIOPEN_THROW(miopenStatusBadParm,
-                         "miopenDouble data type not supported by MIOpenGEMM.");
-        }
+            MIOPEN_THROW(miopenStatusBadParm, "Unknown or unsupported data type.");
+        };
         break;
         }
 
@@ -1141,10 +1141,10 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle,
             break;
         }
 
+        case miopenInt8x4:
         case miopenDouble: {
-            MIOPEN_THROW(miopenStatusBadParm,
-                         "miopenDouble data type not supported by MIOpenGEMM.");
-        }
+            MIOPEN_THROW(miopenStatusBadParm, "Unknown or unsupported data type.");
+        };
         break;
         }
 
diff --git a/src/hip/general_tensor_reorder_sol.cpp b/src/hip/general_tensor_reorder_sol.cpp
index ae85048f63..f78aa358f8 100644
--- a/src/hip/general_tensor_reorder_sol.cpp
+++ b/src/hip/general_tensor_reorder_sol.cpp
@@ -137,6 +137,8 @@ GenericReorderSolutionImpl::GenericReorderSolutionImpl(miopenDataType_t data_typ
       order_2(order_2_),
       order_3(order_3_)
 {
+    if(data_type == miopenInt8x4)
+        MIOPEN_THROW("These data type are not supported");
     std::size_t data_size  = miopen::GetTypeSize(data_type);
     kernel_param_heuristic = tensor_reorder::HeuristicGet(data_size, dim_0, dim_1, dim_2, dim_3);
 }
diff --git a/src/include/miopen/conv/problem_description.hpp b/src/include/miopen/conv/problem_description.hpp
index 12d6f08811..438e1d306a 100644
--- a/src/include/miopen/conv/problem_description.hpp
+++ b/src/include/miopen/conv/problem_description.hpp
@@ -51,6 +51,7 @@ inline std::string GetDataTypeName(miopenDataType_t data_type)
     case miopenFloat: return "FP32";
     case miopenHalf: return "FP16";
     case miopenInt8: return "INT8";
+    case miopenInt8x4: return "INT8x4";
     case miopenInt32: return "INT32";
     case miopenBFloat16: return "BF16";
     case miopenDouble: return "FP64";
diff --git a/src/include/miopen/datatype.hpp b/src/include/miopen/datatype.hpp
index 2260cb5b4f..29057e3de9 100644
--- a/src/include/miopen/datatype.hpp
+++ b/src/include/miopen/datatype.hpp
@@ -53,6 +53,10 @@ inline std::string GetDataType(miopenDataType_t type)
         type_str = "bfloat16";
     }
     break;
+    case miopenInt8x4: {
+        type_str = "UNSUPPORTED_TYPE";
+    }
+    break;
     case miopenInt8: {
         type_str = "int8_t";
     }
@@ -153,6 +157,7 @@ inline KernelBuildParameters GetDataTypeKBP(miopenDataType_t type)
     case miopenDouble: use_fp64 = 1; break;
     case miopenFloat8: use_fp8 = 1; break;
     case miopenBFloat8: use_bfp8 = 1; break;
+    case miopenInt8x4: // fallthrough
     default:
         MIOPEN_THROW("Only float, half, bfloat16, int8, float8, bfloat8 data types are supported.");
         break;
diff --git a/src/include/miopen/tensor.hpp b/src/include/miopen/tensor.hpp
index 1bf2bcbaae..a133ab4adc 100644
--- a/src/include/miopen/tensor.hpp
+++ b/src/include/miopen/tensor.hpp
@@ -101,6 +101,7 @@ inline std::size_t GetTypeSize(miopenDataType_t d)
     case miopenFloat: return 4;
     case miopenHalf:
     case miopenBFloat16: return 2;
+    case miopenInt8x4: break;
     case miopenInt8:
     case miopenFloat8:
     case miopenBFloat8: return 1;
diff --git a/src/include/miopen/visit_float.hpp b/src/include/miopen/visit_float.hpp
index ae84b7f0a9..35e1ae6ae7 100644
--- a/src/include/miopen/visit_float.hpp
+++ b/src/include/miopen/visit_float.hpp
@@ -91,6 +91,7 @@ void visit_float(miopenDataType_t t, F f)
         f(as_float<double>{});
         break;
     }
+    case miopenInt8x4: MIOPEN_THROW("miopenInt8x4: Support discontinued.");
     }
 }
 
diff --git a/src/ocl/tensorocl.cpp b/src/ocl/tensorocl.cpp
index 6329d33f69..1e1f8b1df9 100644
--- a/src/ocl/tensorocl.cpp
+++ b/src/ocl/tensorocl.cpp
@@ -1944,6 +1944,7 @@ std::string GetCastTensorBuildOptionFromType(const std::string& buildOption, mio
     case miopenDouble:
         // TODO
         MIOPEN_THROW(miopenStatusBadParm, "miopenDouble data type not supported in cast tensor.");
+    case miopenInt8x4: // fallthrough
     default: MIOPEN_THROW(miopenStatusBadParm, "Invalid data type in cast tensor desc.");
     }
 }
@@ -1967,6 +1968,11 @@ void CastTensor(const Handle& handle,
         MIOPEN_THROW(miopenStatusBadParm, "Tensor dimension lengths do not match.");
     }
 
+    if(srcDesc.GetType() == miopenInt8x4 || dstDesc.GetType() == miopenInt8x4)
+    {
+        MIOPEN_THROW(miopenStatusBadParm, "Tensor cast operation is not supported for int8x4.");
+    }
+
     auto flat_descriptors = GetConsistentFlattenedTensorDescriptors(srcDesc, dstDesc);
     const TensorDescriptor& srcDesc_flat = std::get<0>(flat_descriptors);
     const TensorDescriptor& dstDesc_flat = std::get<1>(flat_descriptors);
diff --git a/src/pooling_api.cpp b/src/pooling_api.cpp
index a2c792f8bb..bf318f7b78 100644
--- a/src/pooling_api.cpp
+++ b/src/pooling_api.cpp
@@ -50,6 +50,7 @@ inline void Pooling_logging_cmd(const miopenPoolingDescriptor_t poolDesc,
         case miopenFloat: ss << "pool"; break;
         case miopenInt32:
         case miopenInt8:
+        case miopenInt8x4: // Support discontinued.
         case miopenBFloat16:
         case miopenDouble:
         case miopenFloat8:
diff --git a/src/reducetensor.cpp b/src/reducetensor.cpp
index 6ca1067fb8..9cabbc03f6 100644
--- a/src/reducetensor.cpp
+++ b/src/reducetensor.cpp
@@ -211,6 +211,7 @@ inline int GetDataTypeSize(miopenDataType_t t)
     case miopenFloat8:
     case miopenBFloat8:
     case miopenInt8: return (1);
+    case miopenInt8x4: return (4); // Support discontinued.
     case miopenBFloat16: return (2);
     case miopenInt32: return (4);
     default: MIOPEN_THROW("Only float, half, double, bfloat16, int8 data types are supported.");
@@ -267,6 +268,7 @@ inline int GetDataTypeId(miopenDataType_t t)
     case miopenBFloat16: return (static_cast<int>('B'));
     case miopenDouble: return (static_cast<int>('D'));
     case miopenInt8:
+    case miopenInt8x4: // Support discontinued.
     case miopenFloat8:
     case miopenBFloat8:
     case miopenInt32: return (static_cast<int>('O'));
@@ -306,6 +308,7 @@ static ck::DataTypeEnum_t mapDataTypeId(miopenDataType_t t)
     case miopenBFloat16: return DataTypeEnum_t::BFloat16;
     case miopenDouble: return DataTypeEnum_t::Double;
     case miopenInt8: return DataTypeEnum_t::Int8;
+    case miopenInt8x4: return DataTypeEnum_t::Int8x4;
     case miopenInt32: return DataTypeEnum_t::Int32;
     case miopenFloat8:
     case miopenBFloat8:
diff --git a/src/solver/batchnorm/forward_inference_ck.cpp b/src/solver/batchnorm/forward_inference_ck.cpp
index d1d67ddea6..ff17432a62 100644
--- a/src/solver/batchnorm/forward_inference_ck.cpp
+++ b/src/solver/batchnorm/forward_inference_ck.cpp
@@ -200,6 +200,7 @@ bool BnCKFwdInference::IsApplicable(const ExecutionContext& context,
         return (CheckCKApplicability<BF16, BF16, F32, BF16, BF16, F32>(bn_problem) != -1);
     case miopenInt32:
     case miopenInt8:
+    case miopenInt8x4: // Support discontinued.
     case miopenFloat8:
     case miopenBFloat8:
     default: MIOPEN_THROW("Unsupported datatype");
@@ -243,6 +244,7 @@ BnCKFwdInference::GetSolution(const ExecutionContext& context,
                 break;
             case miopenInt8:
             case miopenInt32:
+            case miopenInt8x4: // Support discontinued.
             case miopenFloat8:
             case miopenBFloat8:
             default: MIOPEN_THROW("Unsupported datatype");
diff --git a/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp b/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp
index d9c0410d98..e2df6f8097 100644
--- a/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp
+++ b/src/solver/conv_ck_igemm_fwd_bias_activ_fused.cpp
@@ -287,6 +287,7 @@ void PerformanceConfigConvCKIgemmFwdBiasActivFused::HeuristicInit(
     case miopenInt8:
     case miopenFloat:
     case miopenInt32:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble:
     default: MIOPEN_THROW("Unsupported datatype");
@@ -341,6 +342,7 @@ bool PerformanceConfigConvCKIgemmFwdBiasActivFused::IsValid(
     case miopenInt8:
     case miopenFloat:
     case miopenInt32:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble:
     default: MIOPEN_THROW("Unsupported datatype");
@@ -433,6 +435,7 @@ bool ConvCKIgemmFwdBiasActivFused::IsApplicable(const FusionContext& ctx,
     case miopenInt8:
     case miopenFloat:
     case miopenInt32:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble:
     default: MIOPEN_THROW("Unsupported datatype");
@@ -466,6 +469,7 @@ ConvSolution ConvCKIgemmFwdBiasActivFused::GetSolution(
             case miopenInt8:
             case miopenFloat:
             case miopenInt32:
+            case miopenInt8x4: // Support discontinued.
             case miopenBFloat16:
             case miopenDouble:
             default: MIOPEN_THROW("Unsupported datatype");
diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp
index cd95c85cf6..94983c7f0e 100644
--- a/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_bwd_xdlops.cpp
@@ -210,6 +210,7 @@ void PerformanceConfigHipImplicitGemm3DGroupBwdXdlops::HeuristicInit(
     case miopenFloat: Init<float>(problem); break;
     case miopenInt8: Init<int8_t>(problem); break;
     case miopenInt32:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
@@ -252,6 +253,7 @@ bool PerformanceConfigHipImplicitGemm3DGroupBwdXdlops::IsValid(
     case miopenFloat: return CheckIsSupportCKArgs<float>(problem);
     case miopenInt8: return CheckIsSupportCKArgs<int8_t>(problem);
     case miopenInt32:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
@@ -322,6 +324,7 @@ bool ConvHipImplicitGemm3DGroupBwdXdlops::IsApplicable(
     case miopenFloat: return CheckCKApplicability<float>(problem);
     case miopenInt8: return CheckCKApplicability<int8_t>(problem);
     case miopenInt32:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
@@ -349,6 +352,7 @@ ConvSolution ConvHipImplicitGemm3DGroupBwdXdlops::GetSolution(
         return InitInvokerFactory<DeviceOpGBwdPtrs<float>, CKArgs, conv::DataInvokeParams>(
             problem, config.kernel_id);
     case miopenInt32:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble:
     case miopenFloat8:
diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp
index 4a5ecc8a3f..a21c9ba300 100644
--- a/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp
@@ -210,6 +210,7 @@ void PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::HeuristicInit(
     case miopenFloat: Init<float>(problem); break;
     case miopenInt8: Init<int8_t>(problem); break;
     case miopenInt32:
+    case miopenInt8x4: // Support discontinued.
     case miopenFloat8:
     case miopenBFloat8:
     case miopenBFloat16:
@@ -252,6 +253,7 @@ bool PerformanceConfigHipImplicitGemm3DGroupFwdXdlops::IsValid(
     case miopenFloat: return CheckIsSupportCKArgs<float>(problem);
     case miopenInt8: return CheckIsSupportCKArgs<int8_t>(problem);
     case miopenInt32:
+    case miopenInt8x4: // Support discontinued.
     case miopenFloat8:
     case miopenBFloat8:
     case miopenBFloat16:
@@ -320,6 +322,7 @@ bool ConvHipImplicitGemm3DGroupFwdXdlops::IsApplicable(
     case miopenFloat: return CheckCKApplicability<float>(problem);
     case miopenInt8: return CheckCKApplicability<int8_t>(problem);
     case miopenInt32:
+    case miopenInt8x4: // Support discontinued.
     case miopenFloat8:
     case miopenBFloat8:
     case miopenBFloat16:
@@ -347,6 +350,7 @@ ConvSolution ConvHipImplicitGemm3DGroupFwdXdlops::GetSolution(
         return InitInvokerFactory<DeviceOpGFwdPtrs<float>, CKArgs, conv::DataInvokeParams>(
             problem, config.kernel_id);
     case miopenInt32:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble:
     case miopenFloat8:
diff --git a/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp
index d472c4181c..6225410599 100644
--- a/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_3d_grouped_wrw_xdlops.cpp
@@ -206,6 +206,7 @@ void PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::HeuristicInit(
     case miopenFloat: Init<float>(problem); break;
     case miopenInt8: Init<int8_t>(problem); break;
     case miopenInt32:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
@@ -248,6 +249,7 @@ bool PerformanceConfigHipImplicitGemm3DGroupWrwXdlops::IsValid(
     case miopenFloat: return CheckIsSupportCKArgs<float>(problem);
     case miopenInt8: return CheckIsSupportCKArgs<int8_t>(problem);
     case miopenInt32:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
@@ -318,6 +320,7 @@ bool ConvHipImplicitGemm3DGroupWrwXdlops::IsApplicable(
     case miopenFloat: return CheckCKApplicability<float>(problem);
     case miopenInt8: return CheckCKApplicability<int8_t>(problem);
     case miopenInt32:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
@@ -345,6 +348,7 @@ ConvSolution ConvHipImplicitGemm3DGroupWrwXdlops::GetSolution(
         return InitInvokerFactory<DeviceOpGWrwPtrs<float>, CKArgs, conv::WrWInvokeParams>(
             problem, config.kernel_id);
     case miopenInt32:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
diff --git a/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp
index d55ded78de..5aa17e75bf 100644
--- a/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_bwd_data_xdlops.cpp
@@ -172,6 +172,7 @@ void PerformanceConfigHipImplicitGemmBwdXdlops::HeuristicInit(
     case miopenBFloat8:
     case miopenInt8:
     case miopenInt32:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble: break;
     }
@@ -213,6 +214,7 @@ bool PerformanceConfigHipImplicitGemmBwdXdlops::IsValid(
     case miopenBFloat8:
     case miopenInt8:
     case miopenInt32:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble: break;
     }
@@ -292,6 +294,7 @@ bool ConvHipImplicitGemmBwdXdlops::IsApplicable(
     case miopenBFloat8:
     case miopenInt8:
     case miopenInt32:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble: break;
     }
@@ -315,6 +318,7 @@ ConvSolution ConvHipImplicitGemmBwdXdlops::GetSolution(
             problem, config.kernel_id);
     case miopenInt8:
     case miopenInt32:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble:
     case miopenFloat8:
diff --git a/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp
index 697ab33f70..7a9c322bc7 100644
--- a/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_fwd_xdlops.cpp
@@ -172,6 +172,7 @@ void PerformanceConfigHipImplicitGemmFwdXdlops::HeuristicInit(
     case miopenFloat8:
     case miopenBFloat8:
     case miopenInt32:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble: break;
     }
@@ -214,6 +215,7 @@ bool PerformanceConfigHipImplicitGemmFwdXdlops::IsValid(
     case miopenFloat8:
     case miopenBFloat8:
     case miopenInt32:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble: break;
     }
@@ -292,6 +294,7 @@ bool ConvHipImplicitGemmFwdXdlops::IsApplicable(
     case miopenFloat8:
     case miopenBFloat8:
     case miopenInt32:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble: break;
     }
@@ -317,6 +320,7 @@ ConvSolution ConvHipImplicitGemmFwdXdlops::GetSolution(
         return InitInvokerFactory<DeviceOpPtrs<float>, CKArgs, conv::DataInvokeParams>(
             problem, config.kernel_id);
     case miopenInt32:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble:
     case miopenFloat8:
diff --git a/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp b/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp
index 0c5a91b627..0a8b6eeb03 100644
--- a/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp
+++ b/src/solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp
@@ -198,6 +198,7 @@ void PerformanceConfigHipImplicitGemmGroupFwdXdlops::HeuristicInit(
     case miopenFloat: Init<float>(problem); break;
     case miopenInt8: Init<int8_t>(problem); break;
     case miopenInt32:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
@@ -239,6 +240,7 @@ bool PerformanceConfigHipImplicitGemmGroupFwdXdlops::IsValid(
     case miopenFloat: return CheckIsSupportCKArgs<float>(problem);
     case miopenInt8: return CheckIsSupportCKArgs<int8_t>(problem);
     case miopenInt32:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
@@ -309,6 +311,7 @@ bool ConvHipImplicitGemmGroupFwdXdlops::IsApplicable(
     case miopenFloat: return CheckCKApplicability<float>(problem);
     case miopenInt8: return CheckCKApplicability<int8_t>(problem);
     case miopenInt32:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenFloat8:
     case miopenBFloat8:
@@ -336,6 +339,7 @@ ConvSolution ConvHipImplicitGemmGroupFwdXdlops::GetSolution(
         return InitInvokerFactory<DeviceOpGFwdPtrs<int8_t>, CKArgs, conv::DataInvokeParams>(
             problem, config.kernel_id);
     case miopenInt32:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble:
     case miopenFloat8:
diff --git a/src/solver/gemm.cpp b/src/solver/gemm.cpp
index 2cfbd257dc..6ae4f5dde7 100644
--- a/src/solver/gemm.cpp
+++ b/src/solver/gemm.cpp
@@ -89,6 +89,8 @@ bool GemmFwdBase::IsApplicable(const ExecutionContext& ctx,
     const auto& yDesc = problem.GetOut();
 
     // rocBlas needs the output to be 32-bit always
+    if(xDesc.GetType() == miopenInt8x4)
+        return false;
     if(xDesc.GetType() == miopenInt8      //
        && (yDesc.GetType() != miopenFloat //
            && yDesc.GetType() != miopenInt32))
diff --git a/src/solver/mlir_common.cpp b/src/solver/mlir_common.cpp
index e54da1ac89..4e41b8944a 100644
--- a/src/solver/mlir_common.cpp
+++ b/src/solver/mlir_common.cpp
@@ -57,6 +57,7 @@ static const char* DTypeName(miopenDataType_t ty)
     case miopenBFloat16: return "bf16";
     case miopenInt32: return "i32";
     case miopenInt8: return "i8";
+    case miopenInt8x4: return "i8x4"; // Support discontinued.
     case miopenFloat8: return "fp8";
     case miopenBFloat8: return "bfp8";
     }
diff --git a/src/tensor.cpp b/src/tensor.cpp
index ba1b2ba106..df0d7c2819 100644
--- a/src/tensor.cpp
+++ b/src/tensor.cpp
@@ -50,6 +50,7 @@ bool IsDataTypeSupported(miopenDataType_t t)
     case miopenFloat8:
     case miopenBFloat8:
     case miopenInt8:
+    case miopenInt8x4: // Support discontinued.
     case miopenBFloat16:
     case miopenDouble: return true;
     }
diff --git a/test/driver.hpp b/test/driver.hpp
index bcbfe7d676..fd83dd1fc5 100644
--- a/test/driver.hpp
+++ b/test/driver.hpp
@@ -274,6 +274,7 @@ struct test_driver
         {
         case miopenHalf: ss << "--half "; break;
         case miopenBFloat16: ss << "--bfloat16 "; break;
+        case miopenInt8x4: ss << "--UNSUPPORED_TYPE "; break;
         case miopenInt8: ss << "--int8 "; break;
         case miopenInt32: ss << "--int32 "; break;
         case miopenFloat: ss << "--float "; break;
@@ -302,6 +303,7 @@ struct test_driver
         {
         case miopenHalf: ret.emplace_back("--half"); break;
         case miopenBFloat16: ret.emplace_back("--bf16"); break;
+        case miopenInt8x4: ret.emplace_back("--UNSUPPORTED_TYPE"); break;
         case miopenInt8: ret.emplace_back("--int8"); break;
         case miopenInt32: ret.emplace_back("--int32"); break;
         case miopenFloat: ret.emplace_back("--float"); break;
diff --git a/test/gtest/conv_embed_db.cpp b/test/gtest/conv_embed_db.cpp
index 36d1e28123..4672bb0404 100644
--- a/test/gtest/conv_embed_db.cpp
+++ b/test/gtest/conv_embed_db.cpp
@@ -73,6 +73,7 @@ void Run2dDriver(miopenDataType_t prec)
     case miopenHalf: params = ConfigWithHalf::GetParam(); break;
     case miopenInt8: params = ConfigWithInt8::GetParam(); break;
     case miopenBFloat16: params = ConfigWithBFloat16::GetParam(); break;
+    case miopenInt8x4: // Support discontinued.
     case miopenInt32:
     case miopenFloat8:
     case miopenBFloat8:
diff --git a/test/gtest/conv_hip_igemm_xdlops.cpp b/test/gtest/conv_hip_igemm_xdlops.cpp
index 14e1738a35..3407446557 100644
--- a/test/gtest/conv_hip_igemm_xdlops.cpp
+++ b/test/gtest/conv_hip_igemm_xdlops.cpp
@@ -65,6 +65,7 @@ void Run2dDriver(miopenDataType_t prec)
     case miopenHalf:
     case miopenBFloat16:
     case miopenFloat:
+    case miopenInt8x4: // Support discontinued.
     case miopenInt32:
     case miopenDouble:
         FAIL() << "miopenHalf, miopenBFloat16, miopenFloat, miopenInt32, "
diff --git a/test/gtest/conv_igemm_dynamic.cpp b/test/gtest/conv_igemm_dynamic.cpp
index 6e2863eb8f..59341ac8a0 100644
--- a/test/gtest/conv_igemm_dynamic.cpp
+++ b/test/gtest/conv_igemm_dynamic.cpp
@@ -68,6 +68,7 @@ void Run2dDriver(miopenDataType_t prec)
     case miopenHalf:
     case miopenInt8:
     case miopenBFloat16:
+    case miopenInt8x4: // Support discontinued.
     case miopenInt32:
     case miopenDouble:
     case miopenFloat8:
diff --git a/test/gtest/conv_igemm_mlir.cpp b/test/gtest/conv_igemm_mlir.cpp
index c99d0a03c4..29d240645c 100644
--- a/test/gtest/conv_igemm_mlir.cpp
+++ b/test/gtest/conv_igemm_mlir.cpp
@@ -81,6 +81,7 @@ void Run2dDriver(miopenDataType_t prec)
     case miopenInt8: params = ConfigWithInt8::GetParam(); break;
     case miopenFloat: params = ConfigWithFloat::GetParam(); break;
     case miopenBFloat16:
+    case miopenInt8x4: // Support discontinued.
     case miopenInt32:
     case miopenFloat8:
     case miopenBFloat8:
diff --git a/test/gtest/conv_igemm_mlir_xdlops.cpp b/test/gtest/conv_igemm_mlir_xdlops.cpp
index ea14e3b68e..e2c5a3ed8f 100644
--- a/test/gtest/conv_igemm_mlir_xdlops.cpp
+++ b/test/gtest/conv_igemm_mlir_xdlops.cpp
@@ -53,6 +53,7 @@ void Run2dDriver(miopenDataType_t prec)
     case miopenInt8: params = ConfigWithInt8::GetParam(); break;
     case miopenBFloat16:
     case miopenFloat:
+    case miopenInt8x4: // Support discontinued.
     case miopenInt32:
     case miopenDouble:
     case miopenFloat8:
diff --git a/test/gtest/conv_trans.cpp b/test/gtest/conv_trans.cpp
index a5266d9f9c..185a459947 100644
--- a/test/gtest/conv_trans.cpp
+++ b/test/gtest/conv_trans.cpp
@@ -55,6 +55,7 @@ void Run2dDriver(miopenDataType_t prec)
     case miopenBFloat8:
     case miopenInt8:
     case miopenBFloat16:
+    case miopenInt8x4: // Support discontinued.
     case miopenInt32:
     case miopenDouble:
         FAIL() << "miopenHalf, miopenInt8, miopenBFloat16, miopenInt32, miopenDouble "
diff --git a/test/gtest/db_sync.cpp b/test/gtest/db_sync.cpp
index 13f0297724..13b647ceac 100644
--- a/test/gtest/db_sync.cpp
+++ b/test/gtest/db_sync.cpp
@@ -72,6 +72,8 @@ miopenDataType_t GetDataTypeFromString(const std::string& data_type)
         return miopenHalf;
     else if(data_type == "INT8")
         return miopenInt8;
+    else if(data_type == "INT8x4")
+        return miopenInt8x4; // Support discontinued. Maintain compatibility with old databases.
     else if(data_type == "INT32")
         return miopenInt32;
     else if(data_type == "BF16")

From 483009288b0dff3f851f5409dd16166331bcefd1 Mon Sep 17 00:00:00 2001
From: atamazov <artem.tamazov@gmail.com>
Date: Tue, 10 Oct 2023 01:22:17 +0300
Subject: [PATCH 7/8] rocblas-remove-int8x4-uses(10) tidy fixes

---
 src/solver/batchnorm/backward_ck.cpp         | 2 ++
 src/solver/batchnorm/forward_training_ck.cpp | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/src/solver/batchnorm/backward_ck.cpp b/src/solver/batchnorm/backward_ck.cpp
index 35fb8adbad..fba8724990 100644
--- a/src/solver/batchnorm/backward_ck.cpp
+++ b/src/solver/batchnorm/backward_ck.cpp
@@ -182,6 +182,7 @@ bool BnCKBwdBackward::IsApplicable(const ExecutionContext& ctx,
         return CheckCKApplicability<BF16, F32, F32, F32, BF16, F32, F32>(bn_problem);
     case miopenInt32:
     case miopenInt8:
+    case miopenInt8x4:
     case miopenBFloat8:
     case miopenFloat8:
     default: MIOPEN_THROW("Unsupported datatype");
@@ -235,6 +236,7 @@ ConvSolution BnCKBwdBackward::GetSolution(
         return MakeAnyInvokerFactory<BF16, F32, F32, F32, BF16, F32, F32>(bn_problem);
     case miopenInt8:
     case miopenInt32:
+    case miopenInt8x4:
     case miopenBFloat8:
     case miopenFloat8:
     default:
diff --git a/src/solver/batchnorm/forward_training_ck.cpp b/src/solver/batchnorm/forward_training_ck.cpp
index 17191a4c9e..a65cec14a9 100644
--- a/src/solver/batchnorm/forward_training_ck.cpp
+++ b/src/solver/batchnorm/forward_training_ck.cpp
@@ -174,6 +174,7 @@ bool BnCKFwdTraining::IsApplicable(const ExecutionContext& context,
     case miopenBFloat16: return CheckCKApplicability<BF16, BF16, F32, BF16, BF16, F32>(bn_problem);
     case miopenInt32:
     case miopenInt8:
+    case miopenInt8x4:
     case miopenBFloat8:
     case miopenFloat8:
     default: MIOPEN_THROW("BnCKFwdTraining operation does not supprot this data type");
@@ -223,6 +224,7 @@ ConvSolution BnCKFwdTraining::GetSolution(
     case miopenBFloat16: return MakeAnyInvokerFactory<BF16, BF16, F32, BF16, BF16, F32>(bn_problem);
     case miopenInt8:
     case miopenInt32:
+    case miopenInt8x4:
     case miopenBFloat8:
     case miopenFloat8:
     default:

From 676da096edec3cc26e6ac8f8c3e49cd48280e9b1 Mon Sep 17 00:00:00 2001
From: atamazov <artem.tamazov@gmail.com>
Date: Tue, 10 Oct 2023 15:28:18 +0300
Subject: [PATCH 8/8] rocblas-remove-int8x4-uses(11) format

---
 include/miopen/miopen.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h
index 600484e392..7a85ec5875 100644
--- a/include/miopen/miopen.h
+++ b/include/miopen/miopen.h
@@ -346,10 +346,10 @@ MIOPEN_DECLARE_OBJECT(miopenReduceTensorDescriptor);
  */
 typedef enum
 {
-    miopenHalf  = 0, /*!< 16-bit floating point (Fully supported) */
-    miopenFloat = 1, /*!< 32-bit floating point (Fully supported) */
-    miopenInt32 = 2, /*!< 32-bit int point (Partially supported) */
-    miopenInt8  = 3, /*!< 8-bit int point (Partially supported) */
+    miopenHalf     = 0, /*!< 16-bit floating point (Fully supported) */
+    miopenFloat    = 1, /*!< 32-bit floating point (Fully supported) */
+    miopenInt32    = 2, /*!< 32-bit int point (Partially supported) */
+    miopenInt8     = 3, /*!< 8-bit int point (Partially supported) */
     miopenInt8x4   = 4, /*!< Pack of four Int8 in NCHW_VECT_C format (Support discontinued) */
     miopenBFloat16 = 5, /*!< 16-bit binary floating point (8-bit exponent, 7-bit fraction)
                            (Partially supported) */