From d666d727d6a3625399e3ea1ea6f222656b063536 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 17 May 2021 07:11:59 +0000
Subject: [PATCH 01/20] Add miopenDouble data type ID

---
 include/miopen/miopen.h | 1 +
 1 file changed, 1 insertion(+)
diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h
index 7741162457..9c44779362 100644
--- a/include/miopen/miopen.h
+++ b/include/miopen/miopen.h
@@ -334,6 +334,7 @@ typedef enum {
         4, /*!< Pack of four 8-bit int points in NCHW_VECT_C format (Partially supported) */
     miopenBFloat16 = 5, /*!< 16-bit binary floating point (8-bit exponent, 7-bit fraction)
                            (Partially supported) */
+    miopenDouble = 6,   /*!< 64-bit floating point (Partially supported) */
 } miopenDataType_t;
 
 /*! @ingroup pooling

From 031d47c4dcec07530fe659619c55fa6679c8cf8c Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 17 May 2021 07:14:59 +0000
Subject: [PATCH 02/20] Update the miopenReduceTensor() host C++ interface
 implementation to support Reduction on Double

---
 src/reducetensor.cpp | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/src/reducetensor.cpp b/src/reducetensor.cpp
index c268069bfd..6721a493aa 100644
--- a/src/reducetensor.cpp
+++ b/src/reducetensor.cpp
@@ -224,12 +224,13 @@ inline int GetDataTypeSize(miopenDataType_t t)
     {
     case miopenHalf: return (2);
     case miopenFloat: return (4);
+    case miopenDouble: return (8);
     case miopenInt8: return (1);
     case miopenInt8x4: return (4);
     case miopenBFloat16: return (2);
     case miopenInt32: return (4);
     default:
-        MIOPEN_THROW("Only float, half, bfloat16, int8, int8x4 data type is supported.");
+        MIOPEN_THROW("Only float, half, double, bfloat16, int8, int8x4 data type is supported.");
         break;
     };
 };
@@ -241,6 +242,7 @@ inline int GetDataTypeId(miopenDataType_t t)
     case miopenHalf: return (static_cast<int>('H'));
     case miopenFloat: return (static_cast<int>('F'));
     case miopenBFloat16: return (static_cast<int>('B'));
+    case miopenDouble: return (static_cast<int>('D'));
     case miopenInt8:
     case miopenInt8x4:
     case miopenInt32: return (static_cast<int>('O'));
@@ -568,6 +570,13 @@ void ReduceTensorDescriptor::ReduceTensor(const Handle& handle,
 #if WORKAROUND_MIOPEN_ISSUE_557
     if(StartsWith(handle.GetDeviceName(), "gfx10"))
         param += " -DCK_USE_AMD_BUFFER_ADDRESSING=0 ";
+    else if(srcDataType == miopenDouble)
+        // TODO: support from composable kernel utility for using AMD Buffer Addressing for double
+        param += " -DCK_USE_AMD_BUFFER_ADDRESSING=0 ";
+#else
+    if(srcDataType == miopenDouble)
+        // TODO: support from composable kernel utility for using AMD Buffer Addressing for double
+        param += " -DCK_USE_AMD_BUFFER_ADDRESSING=0 ";
 #endif
 
     std::string program_name = "gridwise_generic_reduction.cpp";
@@ -590,8 +599,12 @@ void ReduceTensorDescriptor::ReduceTensor(const Handle& handle,
     const std::vector<size_t> vgd_1 = {
         static_cast<size_t>(gridSize * blockSize), size_t{1}, size_t{1}};
 
-    float alphaVal = *reinterpret_cast<const float*>(alpha);
-    float betaVal  = *reinterpret_cast<const float*>(beta);
+    float alphaVal = (srcDataType == miopenDouble)
+                         ? static_cast<float>(*reinterpret_cast<const double*>(alpha))
+                         : *reinterpret_cast<const float*>(alpha);
+    float betaVal = (srcDataType == miopenDouble)
+                        ? static_cast<float>(*reinterpret_cast<const double*>(beta))
+                        : *reinterpret_cast<const float*>(beta);
 
     handle.AddKernel(algo_name, network_config, program_name, kernel_name1, vld_1, vgd_1, param)(
         alphaVal, A, betaVal, C, ws_buf1_global, ws_buf2_bytes_offset, indices);

From 0a1fbff1eb2e9d0e43135e0344d94aa21efbe946 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 17 May 2021 07:17:36 +0000
Subject: [PATCH 03/20] Tiny update in reduction kernel layer to use built-in
 shuffle for Double

---
 .../include/kernel_algorithm/reduction_functions.hpp           | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/kernels/composable_kernel/include/kernel_algorithm/reduction_functions.hpp b/src/kernels/composable_kernel/include/kernel_algorithm/reduction_functions.hpp
index af4a5ea356..4ed6bb6fd2 100644
--- a/src/kernels/composable_kernel/include/kernel_algorithm/reduction_functions.hpp
+++ b/src/kernels/composable_kernel/include/kernel_algorithm/reduction_functions.hpp
@@ -167,7 +167,8 @@ struct WarpReduce
 {
     using compType = typename opReduce::dataType;
     using binop    = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;
-    constexpr static bool have_builtin_shuffle = std::is_same<compType, float>::value;
+    constexpr static bool have_builtin_shuffle =
+        std::is_same<compType, float>::value || std::is_same<compType, double>::value;
 
     // This interface does not accumulate on indices
     __device__ static void Reduce(const DataType* p_thread_buffer, compType& accuData)

From fbc28c4e35b85cdab83d2a45062d869c3a1c25f4 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 17 May 2021 07:36:27 +0000
Subject: [PATCH 04/20] Update in reduce_driver to support Reduction on Double

---
 driver/driver.hpp           |  4 +--
 driver/main.cpp             |  4 +++
 driver/miopen_Reduction.hpp |  5 ++--
 driver/reduce_driver.hpp    | 56 +++++++++++++++++++++++++++----------
 4 files changed, 50 insertions(+), 19 deletions(-)

diff --git a/driver/driver.hpp b/driver/driver.hpp
index 35ebcc7a80..793b9bf013 100644
--- a/driver/driver.hpp
+++ b/driver/driver.hpp
@@ -130,7 +130,7 @@ void PadBufferSize(size_t& sz, int datatype_sz)
     printf(
         "Supported Base Arguments: conv[fp16|int8|bfp16], CBAInfer[fp16], pool[fp16], lrn[fp16], "
         "activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm, ctc, dropout[fp16], "
-        "tensorop[fp16], reduce[fp16]\n");
+        "tensorop[fp16], reduce[fp16,fp64]\n");
     exit(0);
 }
 
@@ -150,7 +150,7 @@ std::string ParseBaseArg(int argc, char* argv[])
        arg != "softmax" && arg != "softmaxfp16" && arg != "bnorm" && arg != "bnormfp16" &&
        arg != "rnn" && arg != "rnnfp16" && arg != "gemm" /*&& arg != "gemmfp16"*/ && arg != "ctc" &&
        arg != "dropout" && arg != "dropoutfp16" && arg != "tensorop" && arg != "tensoropfp16" &&
-       arg != "reduce" && arg != "reducefp16" && arg != "--version")
+       arg != "reduce" && arg != "reducefp16" && arg != "reducefp64" && arg != "--version")
     {
         printf("Invalid Base Input Argument\n");
         Usage();
diff --git a/driver/main.cpp b/driver/main.cpp
index 5d876d0f6d..7859c210e5 100644
--- a/driver/main.cpp
+++ b/driver/main.cpp
@@ -174,6 +174,10 @@ int main(int argc, char* argv[])
     {
         drv = new ReduceDriver<float16, float>();
     }
+    else if(base_arg == "reducefp64")
+    {
+        drv = new ReduceDriver<double, double>();
+    }
     else
     {
         printf("Incorrect BaseArg\n");
diff --git a/driver/miopen_Reduction.hpp b/driver/miopen_Reduction.hpp
index 65b84cae10..8fbf9859c8 100644
--- a/driver/miopen_Reduction.hpp
+++ b/driver/miopen_Reduction.hpp
@@ -90,8 +90,9 @@ class miopenReductionHost
                 RunImpl<Tref>(alpha, in_data, beta, out_data, indices);
             else
                 RunImpl<float16>(alpha, in_data, beta, out_data, indices);
-        };
-
+        }
+        else if(compTypeVal == miopenDouble)
+            RunImpl<double>(alpha, in_data, beta, out_data, indices);
         return;
     };
 
diff --git a/driver/reduce_driver.hpp b/driver/reduce_driver.hpp
index 55ad168fb5..d28ae8fec7 100644
--- a/driver/reduce_driver.hpp
+++ b/driver/reduce_driver.hpp
@@ -38,11 +38,12 @@
 #include <miopen/miopen.h>
 #include <miopen/reduce_common.hpp>
 #include <miopen/tensor.hpp>
-#include <miopen/bfloat16.hpp>
 #include <numeric>
 #include <vector>
 #include <string>
 #include <cassert>
+#include <type_traits>
+#include <half.hpp>
 #include "random.hpp"
 
 #include "miopen_Reduction.hpp"
@@ -58,7 +59,10 @@ class ReduceDriver : public Driver
 
         miopenCreateReduceTensorDescriptor(&reduceDesc);
 
-        data_type = (sizeof(Tgpu) == 4) ? miopenFloat : miopenHalf;
+        if(std::is_same<Tgpu, double>::value)
+            data_type = miopenDouble;
+        else
+            data_type = (sizeof(Tgpu) == 4) ? miopenFloat : miopenHalf;
     }
 
     int AddCmdLineArgs() override;
@@ -289,6 +293,9 @@ int ReduceDriver<Tgpu, Tref>::SetReduceTensorDescriptorFromCmdLineArgs()
         (reduceOp == MIOPEN_REDUCE_TENSOR_MIN || reduceOp == MIOPEN_REDUCE_TENSOR_MAX ||
          reduceOp == MIOPEN_REDUCE_TENSOR_AMAX);
 
+    if(std::is_same<Tgpu, double>::value)
+        compType = miopenDouble;
+
     return (miopenSetReduceTensorDescriptor(
         reduceDesc, reduceOp, compType, nanOpt, indicesOpt, indicesType));
 }
@@ -373,16 +380,32 @@ int ReduceDriver<Tgpu, Tref>::RunForwardGPU()
 
     bool output_accumulate = !(reduce::float_equal_one(alpha) && reduce::float_equal_zero(beta));
 
+    double alphaData, betaData;
+
+    void* alphaPara = reinterpret_cast<void*>(&alphaData);
+    void* betaPara  = reinterpret_cast<void*>(&betaData);
+
+    if(std::is_same<Tgpu, double>::value)
+    {
+        *reinterpret_cast<double*>(alphaPara) = static_cast<double>(alpha);
+        *reinterpret_cast<double*>(betaPara)  = static_cast<double>(beta);
+    }
+    else
+    {
+        *reinterpret_cast<float*>(alphaPara) = alpha;
+        *reinterpret_cast<float*>(betaPara)  = beta;
+    };
+
     miopenReduceTensor(GetHandle(),
                        reduceDesc,
                        this->need_indices ? indices_dev->GetMem() : nullptr, // indices
                        this->need_indices ? indices_sizeInBytes : 0,    // indices size in bytes
                        ws_sizeInBytes > 0 ? ws_dev->GetMem() : nullptr, // workspace
                        ws_sizeInBytes,                                  // workspace size in bytes
-                       &alpha,
+                       const_cast<const void*>(alphaPara),
                        inputTensor,
                        in_dev->GetMem(),
-                       &beta,
+                       const_cast<const void*>(betaPara),
                        outputTensor,
                        out_dev->GetMem());
 
@@ -404,10 +427,10 @@ int ReduceDriver<Tgpu, Tref>::RunForwardGPU()
                            this->need_indices ? indices_sizeInBytes : 0,    // indices size in bytes
                            ws_sizeInBytes > 0 ? ws_dev->GetMem() : nullptr, // workspace
                            ws_sizeInBytes, // workspace size in bytes
-                           &alpha,
+                           const_cast<const void*>(alphaPara),
                            inputTensor,
                            in_dev->GetMem(),
-                           &beta,
+                           const_cast<const void*>(betaPara),
                            outputTensor,
                            out_dev->GetMem());
     }
@@ -455,10 +478,8 @@ int ReduceDriver<Tgpu, Tref>::VerifyForward()
                                                   this->dimsInvariant,
                                                   this->dimsToReduce);
 
-    auto alpha =
-        reduce::convert_type<Tgpu>(static_cast<float>(this->inflags.GetValueDouble("alpha")));
-    auto beta =
-        reduce::convert_type<Tgpu>(static_cast<float>(this->inflags.GetValueDouble("beta")));
+    auto alpha = static_cast<float>(this->inflags.GetValueDouble("alpha"));
+    auto beta  = static_cast<float>(this->inflags.GetValueDouble("beta"));
 
     auto reduceOp = static_cast<miopenReduceTensorOp_t>(inflags.GetValueInt("ReduceOp"));
 
@@ -470,14 +491,19 @@ int ReduceDriver<Tgpu, Tref>::VerifyForward()
 
     hostReduction.Run(alpha, in.data(), beta, outhost.data(), outhost_indices.data());
 
-    auto error = miopen::rms_range(outhost, out);
-    const double tolerance =
-        std::is_same<Tgpu, float16>::value || reduceOp == MIOPEN_REDUCE_TENSOR_NORM2 ? 2e-3
-                                                                                     : 1.5e-4;
+    auto error       = miopen::rms_range(outhost, out);
+    double tolerance = 1.5e-4;
+
+    if(std::is_same<Tgpu, half_float::half>::value)
+        tolerance *= 4.0;
+
+    if(std::is_same<Tgpu, float>::value && reduceOp == MIOPEN_REDUCE_TENSOR_NORM2)
+        tolerance *= 12.0;
 
     if(error > tolerance)
     {
-        std::cout << "ReduceTensor() Failed: " << error << "\n";
+        std::cout << "ReduceTensor() Failed with error = " << error
+                  << " , tolerance = " << tolerance << "\n";
     }
     else
     {

From 5bd4bc4a7e081dc358050e5a069038eb486676ca Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 17 May 2021 07:40:00 +0000
Subject: [PATCH 05/20] Update in reduce_test to support Reduction on Double

---
 test/driver.hpp        |  17 ++++--
 test/reduce_test.cpp   | 114 +++++++++++++++++++++++++++++++++--------
 test/tensor_holder.hpp |   5 ++
 3 files changed, 112 insertions(+), 24 deletions(-)

diff --git a/test/driver.hpp b/test/driver.hpp
index ce99ef2417..0730f22c2e 100644
--- a/test/driver.hpp
+++ b/test/driver.hpp
@@ -282,6 +282,7 @@ struct test_driver
         case miopenInt8: ss << "--int8 "; break;
         case miopenInt32: ss << "--int32 "; break;
         case miopenFloat: ss << "--float "; break;
+        case miopenDouble: ss << "--double "; break;
         }
         for(auto&& arg : this->arguments)
         {
@@ -308,6 +309,7 @@ struct test_driver
         case miopenInt8: ret.emplace_back("--int8"); break;
         case miopenInt32: ret.emplace_back("--int32"); break;
         case miopenFloat: ret.emplace_back("--float"); break;
+        case miopenDouble: ret.emplace_back("--double"); break;
         }
 
         for(auto&& arg : this->arguments)
@@ -1008,7 +1010,7 @@ void set_driver_datatype(Driver& d,
     }
     else if(arg_map.count("--double") > 0)
     {
-        throw std::runtime_error("Double is not supported");
+        d.type = miopenDouble;
     }
     else
     {
@@ -1113,7 +1115,11 @@ void test_drive_impl_2(std::string program_name, std::vector<std::string> as)
         return;
     }
 
-    set_driver_datatype<Driver>(d, arg_map);
+    if(program_name.find("reduce") != std::string::npos)
+        set_driver_datatype<Driver>(d, arg_map);
+    else
+        throw std::runtime_error("Double is not supported");
+
     std::vector<std::vector<std::string>> configs = build_configs<Driver>(d, arg_map, keywords);
     size_t config_count                           = configs.size();
     double running_average                        = 0;
@@ -1169,7 +1175,10 @@ void test_drive_impl_1(std::string program_name, std::vector<std::string> as)
     }
     else if(arg_map.count("--double") > 0)
     {
-        throw std::runtime_error("Double is not supported");
+        if(program_name.find("reduce") == std::string::npos)
+            throw std::runtime_error("Double is not supported");
+        else
+            d.type = miopenDouble;
     }
     else
     {
@@ -1310,7 +1319,7 @@ void test_drive(int argc, const char* argv[])
         }
         if(arg == "--double")
         {
-            // test_drive_impl<Driver<double>>(argv[0], std::move(as));
+            test_drive_impl<Driver<double>>(argv[0], std::move(as));
             break;
         }
     }
diff --git a/test/reduce_test.cpp b/test/reduce_test.cpp
index fdab312652..4e5085959c 100644
--- a/test/reduce_test.cpp
+++ b/test/reduce_test.cpp
@@ -38,6 +38,7 @@
 #include <iterator>
 #include <limits>
 #include <iostream>
+#include <type_traits>
 
 #include "cpu_reduce_util.hpp"
 
@@ -107,7 +108,9 @@ struct verify_reduce_with_indices
                 results = cpuImpl<float>();
             else
                 results = cpuImpl<half_float::half>();
-        };
+        }
+        else if(compTypeVal == miopenDouble)
+            results = cpuImpl<double>();
 
         if(toVerifyData)
         {
@@ -344,6 +347,22 @@ struct verify_reduce_with_indices
         std::size_t ws_sizeInBytes      = workspace.desc.GetElementSize() * sizeof(T);
         std::size_t indices_sizeInBytes = indices.desc.GetElementSize() * sizeof(int);
 
+        double alphaData, betaData;
+
+        void* alphaPara = reinterpret_cast<void*>(&alphaData);
+        void* betaPara  = reinterpret_cast<void*>(&betaData);
+
+        if(std::is_same<T, double>::value)
+        {
+            *reinterpret_cast<double*>(alphaPara) = static_cast<double>(alpha);
+            *reinterpret_cast<double*>(betaPara)  = static_cast<double>(beta);
+        }
+        else
+        {
+            *reinterpret_cast<float*>(alphaPara) = alpha;
+            *reinterpret_cast<float*>(betaPara)  = beta;
+        };
+
         if(ws_sizeInBytes > 0)
         {
             auto workspace_dev = handle.Write(workspace.data);
@@ -353,10 +372,10 @@ struct verify_reduce_with_indices
                                 indices_sizeInBytes,
                                 workspace_dev.get(),
                                 ws_sizeInBytes,
-                                static_cast<const void*>(&alpha),
+                                const_cast<const void*>(alphaPara),
                                 input.desc,
                                 input_dev.get(),
-                                static_cast<const void*>(&beta),
+                                const_cast<const void*>(betaPara),
                                 output.desc,
                                 output_dev.get());
         }
@@ -367,10 +386,10 @@ struct verify_reduce_with_indices
                                 indices_sizeInBytes,
                                 nullptr,
                                 0,
-                                static_cast<const void*>(&alpha),
+                                const_cast<const void*>(alphaPara),
                                 input.desc,
                                 input_dev.get(),
-                                static_cast<const void*>(&beta),
+                                const_cast<const void*>(betaPara),
                                 output.desc,
                                 output_dev.get());
         };
@@ -423,26 +442,38 @@ struct verify_reduce_no_indices
         nanOpt      = reduce.reduceTensorNanOpt_;
     }
 
-    tensor<T> cpu()
+    tensor<float> cpu()
     {
+        using reduce::convert_type;
+
+        tensor<T> result;
+
         if(compTypeVal == miopenFloat)
         {
             if(std::is_same<T, double>::value)
-                return (cpuImpl<double>());
+                result = cpuImpl<double>();
             else
-                return (cpuImpl<float>());
+                result = cpuImpl<float>();
         }
         else if(compTypeVal == miopenHalf)
         {
             if(std::is_same<T, double>::value)
-                return (cpuImpl<double>());
+                result = cpuImpl<double>();
             else if(std::is_same<T, float>::value)
-                return (cpuImpl<float>());
+                result = cpuImpl<float>();
             else
-                return (cpuImpl<half_float::half>());
-        };
+                result = cpuImpl<half_float::half>();
+        }
+        else if(compTypeVal == miopenDouble)
+            result = cpuImpl<double>();
 
-        return (tensor<T>{});
+        const auto dimLengths = output.desc.GetLengths();
+        auto result_dataFloat = make_tensor<float>(dimLengths);
+
+        for(size_t i                 = 0; i < result.data.size(); i++)
+            result_dataFloat.data[i] = convert_type<float>(result.data[i]);
+
+        return (result_dataFloat);
     };
 
     template <typename compType>
@@ -593,7 +624,22 @@ struct verify_reduce_no_indices
         return (res);
     }
 
-    tensor<T> gpu() const
+    tensor<float> gpu() const
+    {
+        using reduce::convert_type;
+
+        auto result = gpuImpl();
+
+        const auto dimLengths = output.desc.GetLengths();
+        auto result_dataFloat = make_tensor<float>(dimLengths);
+
+        for(size_t i                 = 0; i < result.data.size(); i++)
+            result_dataFloat.data[i] = convert_type<float>(result.data[i]);
+
+        return (result_dataFloat);
+    };
+
+    tensor<T> gpuImpl() const
     {
         auto&& handle   = get_handle();
         auto input_dev  = handle.Write(input.data);
@@ -604,6 +650,22 @@ struct verify_reduce_no_indices
 
         std::size_t ws_sizeInBytes = workspace.desc.GetElementSize() * sizeof(T);
 
+        double alphaData, betaData;
+
+        void* alphaPara = reinterpret_cast<void*>(&alphaData);
+        void* betaPara  = reinterpret_cast<void*>(&betaData);
+
+        if(std::is_same<T, double>::value)
+        {
+            *reinterpret_cast<double*>(alphaPara) = static_cast<double>(alpha);
+            *reinterpret_cast<double*>(betaPara)  = static_cast<double>(beta);
+        }
+        else
+        {
+            *reinterpret_cast<float*>(alphaPara) = alpha;
+            *reinterpret_cast<float*>(betaPara)  = beta;
+        };
+
         if(ws_sizeInBytes > 0)
         {
             auto workspace_dev = handle.Write(workspace.data);
@@ -613,10 +675,10 @@ struct verify_reduce_no_indices
                                 0,
                                 workspace_dev.get(),
                                 ws_sizeInBytes,
-                                static_cast<const void*>(&alpha),
+                                static_cast<const void*>(alphaPara),
                                 input.desc,
                                 input_dev.get(),
-                                static_cast<const void*>(&beta),
+                                static_cast<const void*>(betaPara),
                                 output.desc,
                                 output_dev.get());
         }
@@ -627,10 +689,10 @@ struct verify_reduce_no_indices
                                 0,
                                 nullptr,
                                 0,
-                                static_cast<const void*>(&alpha),
+                                static_cast<const void*>(alphaPara),
                                 input.desc,
                                 input_dev.get(),
-                                static_cast<const void*>(&beta),
+                                static_cast<const void*>(betaPara),
                                 output.desc,
                                 output_dev.get());
         };
@@ -704,6 +766,9 @@ struct reduce_driver : test_driver
     {
         using reduce::convert_type;
 
+        if(std::is_same<T, double>::value)
+            compTypeVal = static_cast<int>(miopenDouble);
+
         if(std::is_same<T, half_float::half>::value)
         {
             if(reduceOp == MIOPEN_REDUCE_TENSOR_MIN || reduceOp == MIOPEN_REDUCE_TENSOR_MAX ||
@@ -801,7 +866,7 @@ struct reduce_driver : test_driver
         };
 
         if(reduceOp == MIOPEN_REDUCE_TENSOR_MUL)
-            this->tolerance = 80 * 500;
+            this->tolerance = 80 * 300;
         else if(reduceOp == MIOPEN_REDUCE_TENSOR_NORM1 || reduceOp == MIOPEN_REDUCE_TENSOR_NORM2)
         {
             if(toReduceDims.size() == 4)
@@ -810,6 +875,9 @@ struct reduce_driver : test_driver
                 this->tolerance = 80 * 10;
         };
 
+        if(std::is_same<T, half_float::half>::value)
+            this->tolerance *= this->tolerance * 10.0;
+
         auto inputTensor = (reduceOp == MIOPEN_REDUCE_TENSOR_MUL)
                                ? tensor<T>{this->inLengths}.generate(gen_value_2)
                                : (need_indices || reduceOp == MIOPEN_REDUCE_TENSOR_NORM1 ||
@@ -858,13 +926,19 @@ int main(int argc, const char* argv[])
 {
     std::vector<std::string> as(argv + 1, argv + argc);
 
-    bool test_half = false;
+    bool test_half   = false;
+    bool test_double = false;
 
     test_half = std::any_of(
         as.begin(), as.end(), [](const std::string& elem) { return (elem == "--half"); });
 
+    test_double = std::any_of(
+        as.begin(), as.end(), [](const std::string& elem) { return (elem == "--double"); });
+
     if(test_half)
         test_drive<reduce_driver<half_float::half>>(argc, argv);
+    else if(test_double)
+        test_drive<reduce_driver<double>>(argc, argv);
     else
         test_drive<reduce_driver<float>>(argc, argv);
 };
diff --git a/test/tensor_holder.hpp b/test/tensor_holder.hpp
index 1b3ccc728c..12158ec7d7 100644
--- a/test/tensor_holder.hpp
+++ b/test/tensor_holder.hpp
@@ -85,6 +85,11 @@ struct miopen_type<float> : std::integral_constant<miopenDataType_t, miopenFloat
 {
 };
 
+template <>
+struct miopen_type<double> : std::integral_constant<miopenDataType_t, miopenDouble>
+{
+};
+
 template <>
 struct miopen_type<half_float::half> : std::integral_constant<miopenDataType_t, miopenHalf>
 {

From e18aff85cdd793b887064f48d2ce8cf96a4788fa Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Mon, 17 May 2021 12:55:06 +0000
Subject: [PATCH 06/20] Update to remove compiler warnings caused by the adding
 of miopenDouble

---
 src/gemm_v2.cpp                               | 23 +++++++++++++++++++
 .../miopen/conv/problem_description.hpp       |  1 +
 src/include/miopen/datatype.hpp               |  5 ++++
 src/include/miopen/tensor.hpp                 |  1 +
 src/include/miopen/visit_float.hpp            |  5 ++++
 src/ocl/tensorocl.cpp                         |  3 +++
 src/tensor.cpp                                |  1 +
 7 files changed, 39 insertions(+)

diff --git a/src/gemm_v2.cpp b/src/gemm_v2.cpp
index 31f69e10e2..9dd547f44e 100644
--- a/src/gemm_v2.cpp
+++ b/src/gemm_v2.cpp
@@ -297,6 +297,8 @@ miopenStatus_t CallGemmMIOpenTensile(const Handle& handle,
         ptrA            = Data_t(reinterpret_cast<const int8_t*>(A) + a_offset);
         ptrB            = Data_t(reinterpret_cast<const int8_t*>(B) + b_offset);
         ptrC            = Data_t(reinterpret_cast<int32_t*>(C) + c_offset);
+    case miopenDouble:
+        MIOPEN_THROW(miopenStatusBadParm, "miopenDouble data type not supported by MIOpenGEMM.");
     }
     if(gemm_desc.dataType == miopenInt8 || gemm_desc.dataType == miopenInt8x4)
     {
@@ -570,6 +572,13 @@ miopenStatus_t CallGemm(const Handle& handle,
                 0);
         }
         break;
+
+        case miopenDouble:
+        {
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "miopenDouble data type not supported by MIOpenGEMM.");
+        };
+        break;
         }
 
         if(handle.IsProfilingEnabled())
@@ -887,6 +896,13 @@ miopenStatus_t CallGemmStridedBatched(const Handle& handle,
                 0);
         }
         break;
+
+        case miopenDouble:
+        {
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "miopenDouble data type not supported by MIOpenGEMM.");
+        }
+        break;
         }
 
         if(handle.IsProfilingEnabled())
@@ -1123,6 +1139,13 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle,
             }
         }
         break;
+
+        case miopenDouble:
+        {
+            MIOPEN_THROW(miopenStatusBadParm,
+                         "miopenDouble data type not supported by MIOpenGEMM.");
+        }
+        break;
         }
 
         if(handle.IsProfilingEnabled())
diff --git a/src/include/miopen/conv/problem_description.hpp b/src/include/miopen/conv/problem_description.hpp
index 2a7120a5e5..169de1197b 100644
--- a/src/include/miopen/conv/problem_description.hpp
+++ b/src/include/miopen/conv/problem_description.hpp
@@ -49,6 +49,7 @@ inline std::string GetDataTypeName(miopenDataType_t data_type)
     case miopenInt8x4: return "INT8x4";
     case miopenInt32: return "INT32";
     case miopenBFloat16: return "BF16";
+    case miopenDouble: return "FP64";
     }
 
     return "Unknown(" + std::to_string(data_type) + ")";
diff --git a/src/include/miopen/datatype.hpp b/src/include/miopen/datatype.hpp
index 9d17116cf7..9c33fb5a6e 100644
--- a/src/include/miopen/datatype.hpp
+++ b/src/include/miopen/datatype.hpp
@@ -53,6 +53,9 @@ inline std::string GetDataType(miopenDataType_t type)
     case miopenInt32: { type_str = "int";
     }
     break;
+    case miopenDouble: { type_str = "double";
+    }
+    break;
     }
     return type_str;
 }
@@ -104,6 +107,7 @@ inline std::string GetDataTypeKernelParams(miopenDataType_t type)
     int use_int8x4             = 0;
     int use_int32              = 0;
     int use_bfp16              = 0;
+    int use_fp64               = 0;
     const int use_rne_bfloat16 = MIOPEN_USE_RNE_BFLOAT16;
 
     switch(type)
@@ -114,6 +118,7 @@ inline std::string GetDataTypeKernelParams(miopenDataType_t type)
     case miopenInt8x4: use_int8x4  = 1; break;
     case miopenBFloat16: use_bfp16 = 1; break;
     case miopenInt32: use_int32    = 1; break;
+    case miopenDouble: use_fp64    = 1; break;
     default:
         MIOPEN_THROW("Only float, half, bfloat16, int8, int8x4 data type is supported.");
         break;
diff --git a/src/include/miopen/tensor.hpp b/src/include/miopen/tensor.hpp
index 57a5a53502..db06490d77 100644
--- a/src/include/miopen/tensor.hpp
+++ b/src/include/miopen/tensor.hpp
@@ -99,6 +99,7 @@ inline std::size_t GetTypeSize(miopenDataType_t d)
     case miopenBFloat16: return 2;
     case miopenInt8x4:
     case miopenInt8: return 1;
+    case miopenDouble: return 8;
     }
     MIOPEN_THROW("Unknown data type");
 }
diff --git a/src/include/miopen/visit_float.hpp b/src/include/miopen/visit_float.hpp
index 96b398eb01..ec1e394cc2 100644
--- a/src/include/miopen/visit_float.hpp
+++ b/src/include/miopen/visit_float.hpp
@@ -87,6 +87,11 @@ void visit_float(miopenDataType_t t, F f)
         f(as_float<int>{});
         break;
     }
+    case miopenDouble:
+    {
+        f(as_float<double>{});
+        break;
+    }
     }
 }
 
diff --git a/src/ocl/tensorocl.cpp b/src/ocl/tensorocl.cpp
index 997d64c896..b9727d3adc 100644
--- a/src/ocl/tensorocl.cpp
+++ b/src/ocl/tensorocl.cpp
@@ -1924,6 +1924,9 @@ std::string GetCastTensorBuildOptionFromType(const std::string& buildOption, mio
     case miopenHalf: return option += "2";
     case miopenFloat: return option += "3";
     case miopenBFloat16: return option += "4";
+    case miopenDouble:
+        // TODO
+        MIOPEN_THROW(miopenStatusBadParm, "miopenDouble data type not supported in cast tensor.");
     case miopenInt8x4:
         MIOPEN_THROW(miopenStatusBadParm, "miopenInt8x4 data type not supported in cast tensor.");
     default: MIOPEN_THROW(miopenStatusBadParm, "Invalid data type in cast tensor desc.");
diff --git a/src/tensor.cpp b/src/tensor.cpp
index c8446d551e..50118dfc16 100644
--- a/src/tensor.cpp
+++ b/src/tensor.cpp
@@ -141,6 +141,7 @@ std::size_t TensorDescriptor::GetNumBytes() const
     case miopenHalf: typesize = 2; break;
     case miopenInt32:
     case miopenFloat: typesize = 4; break;
+    case miopenDouble: typesize = 8; break;
     }
     return typesize * this->GetElementSpace();
 }

From 6f849d9ae8aad2715f7bcf54a358c81eb92f7c75 Mon Sep 17 00:00:00 2001
From: Artem Tamazov <artem.tamazov@gmail.com>
Date: Tue, 18 May 2021 02:25:32 +0300
Subject: [PATCH 07/20] Formatting

---
 src/tensor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tensor.cpp b/src/tensor.cpp
index 50118dfc16..b3f745233e 100644
--- a/src/tensor.cpp
+++ b/src/tensor.cpp
@@ -140,7 +140,7 @@ std::size_t TensorDescriptor::GetNumBytes() const
     case miopenBFloat16:
     case miopenHalf: typesize = 2; break;
     case miopenInt32:
-    case miopenFloat: typesize = 4; break;
+    case miopenFloat: typesize  = 4; break;
     case miopenDouble: typesize = 8; break;
     }
     return typesize * this->GetElementSpace();

From 30d5faa4f7f41e5d6187be9d430b03ddcba2b5d2 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 19 May 2021 06:36:15 +0000
Subject: [PATCH 08/20] Update to eliminate tidy-checking warnings

---
 driver/reduce_driver.hpp        | 31 +++++++----------
 src/include/miopen/datatype.hpp |  1 +
 src/reducetensor.cpp            | 10 ++++--
 test/reduce_test.cpp            | 60 +++++++++++++--------------------
 4 files changed, 44 insertions(+), 58 deletions(-)

diff --git a/driver/reduce_driver.hpp b/driver/reduce_driver.hpp
index d28ae8fec7..180c7a3f95 100644
--- a/driver/reduce_driver.hpp
+++ b/driver/reduce_driver.hpp
@@ -380,21 +380,14 @@ int ReduceDriver<Tgpu, Tref>::RunForwardGPU()
 
     bool output_accumulate = !(reduce::float_equal_one(alpha) && reduce::float_equal_zero(beta));
 
-    double alphaData, betaData;
-
-    void* alphaPara = reinterpret_cast<void*>(&alphaData);
-    void* betaPara  = reinterpret_cast<void*>(&betaData);
-
-    if(std::is_same<Tgpu, double>::value)
-    {
-        *reinterpret_cast<double*>(alphaPara) = static_cast<double>(alpha);
-        *reinterpret_cast<double*>(betaPara)  = static_cast<double>(beta);
-    }
-    else
-    {
-        *reinterpret_cast<float*>(alphaPara) = alpha;
-        *reinterpret_cast<float*>(betaPara)  = beta;
-    };
+    const double alpha64       = alpha;
+    const double beta64        = beta;
+    const void* const alphaPtr = std::is_same<Tgpu, double>::value
+                                     ? reinterpret_cast<const void*>(&alpha64)
+                                     : reinterpret_cast<const void*>(&alpha);
+    const void* const betaPtr = std::is_same<Tgpu, double>::value
+                                    ? reinterpret_cast<const void*>(&beta64)
+                                    : reinterpret_cast<const void*>(&beta);
 
     miopenReduceTensor(GetHandle(),
                        reduceDesc,
@@ -402,10 +395,10 @@ int ReduceDriver<Tgpu, Tref>::RunForwardGPU()
                        this->need_indices ? indices_sizeInBytes : 0,    // indices size in bytes
                        ws_sizeInBytes > 0 ? ws_dev->GetMem() : nullptr, // workspace
                        ws_sizeInBytes,                                  // workspace size in bytes
-                       const_cast<const void*>(alphaPara),
+                       alphaPtr,
                        inputTensor,
                        in_dev->GetMem(),
-                       const_cast<const void*>(betaPara),
+                       betaPtr,
                        outputTensor,
                        out_dev->GetMem());
 
@@ -427,10 +420,10 @@ int ReduceDriver<Tgpu, Tref>::RunForwardGPU()
                            this->need_indices ? indices_sizeInBytes : 0,    // indices size in bytes
                            ws_sizeInBytes > 0 ? ws_dev->GetMem() : nullptr, // workspace
                            ws_sizeInBytes, // workspace size in bytes
-                           const_cast<const void*>(alphaPara),
+                           alphaPtr,
                            inputTensor,
                            in_dev->GetMem(),
-                           const_cast<const void*>(betaPara),
+                           betaPtr,
                            outputTensor,
                            out_dev->GetMem());
     }
diff --git a/src/include/miopen/datatype.hpp b/src/include/miopen/datatype.hpp
index 9c33fb5a6e..7ff4babb26 100644
--- a/src/include/miopen/datatype.hpp
+++ b/src/include/miopen/datatype.hpp
@@ -132,6 +132,7 @@ inline std::string GetDataTypeKernelParams(miopenDataType_t type)
     ss << " -DMIOPEN_USE_BFP16=" << use_bfp16;
     ss << " -DMIOPEN_USE_INT32=" << use_int32;
     ss << " -DMIOPEN_USE_RNE_BFLOAT16=" << use_rne_bfloat16;
+    ss << " -DMIOPEN_USE_FP64=" << use_fp64;
     return ss.str();
 }
 
diff --git a/src/reducetensor.cpp b/src/reducetensor.cpp
index 6721a493aa..54394bca75 100644
--- a/src/reducetensor.cpp
+++ b/src/reducetensor.cpp
@@ -570,9 +570,13 @@ void ReduceTensorDescriptor::ReduceTensor(const Handle& handle,
 #if WORKAROUND_MIOPEN_ISSUE_557
     if(StartsWith(handle.GetDeviceName(), "gfx10"))
         param += " -DCK_USE_AMD_BUFFER_ADDRESSING=0 ";
-    else if(srcDataType == miopenDouble)
-        // TODO: support from composable kernel utility for using AMD Buffer Addressing for double
-        param += " -DCK_USE_AMD_BUFFER_ADDRESSING=0 ";
+    else
+    {
+        if(srcDataType == miopenDouble)
+            // TODO: support from composable kernel utility for using AMD Buffer Addressing for
+            // double
+            param += " -DCK_USE_AMD_BUFFER_ADDRESSING=0 ";
+    };
 #else
     if(srcDataType == miopenDouble)
         // TODO: support from composable kernel utility for using AMD Buffer Addressing for double
diff --git a/test/reduce_test.cpp b/test/reduce_test.cpp
index 4e5085959c..6caf855675 100644
--- a/test/reduce_test.cpp
+++ b/test/reduce_test.cpp
@@ -347,21 +347,15 @@ struct verify_reduce_with_indices
         std::size_t ws_sizeInBytes      = workspace.desc.GetElementSize() * sizeof(T);
         std::size_t indices_sizeInBytes = indices.desc.GetElementSize() * sizeof(int);
 
-        double alphaData, betaData;
+        const double alpha64 = alpha;
+        const double beta64  = beta;
 
-        void* alphaPara = reinterpret_cast<void*>(&alphaData);
-        void* betaPara  = reinterpret_cast<void*>(&betaData);
-
-        if(std::is_same<T, double>::value)
-        {
-            *reinterpret_cast<double*>(alphaPara) = static_cast<double>(alpha);
-            *reinterpret_cast<double*>(betaPara)  = static_cast<double>(beta);
-        }
-        else
-        {
-            *reinterpret_cast<float*>(alphaPara) = alpha;
-            *reinterpret_cast<float*>(betaPara)  = beta;
-        };
+        const void* const alphaPtr = (std::is_same<T, double>::value)
+                                         ? reinterpret_cast<const void*>(&alpha64)
+                                         : reinterpret_cast<const void*>(&alpha);
+        const void* const betaPtr = (std::is_same<T, double>::value)
+                                        ? reinterpret_cast<const void*>(&beta64)
+                                        : reinterpret_cast<const void*>(&beta);
 
         if(ws_sizeInBytes > 0)
         {
@@ -372,10 +366,10 @@ struct verify_reduce_with_indices
                                 indices_sizeInBytes,
                                 workspace_dev.get(),
                                 ws_sizeInBytes,
-                                const_cast<const void*>(alphaPara),
+                                alphaPtr,
                                 input.desc,
                                 input_dev.get(),
-                                const_cast<const void*>(betaPara),
+                                betaPtr,
                                 output.desc,
                                 output_dev.get());
         }
@@ -386,10 +380,10 @@ struct verify_reduce_with_indices
                                 indices_sizeInBytes,
                                 nullptr,
                                 0,
-                                const_cast<const void*>(alphaPara),
+                                alphaPtr,
                                 input.desc,
                                 input_dev.get(),
-                                const_cast<const void*>(betaPara),
+                                betaPtr,
                                 output.desc,
                                 output_dev.get());
         };
@@ -650,21 +644,15 @@ struct verify_reduce_no_indices
 
         std::size_t ws_sizeInBytes = workspace.desc.GetElementSize() * sizeof(T);
 
-        double alphaData, betaData;
+        const double alpha64 = alpha;
+        const double beta64  = beta;
 
-        void* alphaPara = reinterpret_cast<void*>(&alphaData);
-        void* betaPara  = reinterpret_cast<void*>(&betaData);
-
-        if(std::is_same<T, double>::value)
-        {
-            *reinterpret_cast<double*>(alphaPara) = static_cast<double>(alpha);
-            *reinterpret_cast<double*>(betaPara)  = static_cast<double>(beta);
-        }
-        else
-        {
-            *reinterpret_cast<float*>(alphaPara) = alpha;
-            *reinterpret_cast<float*>(betaPara)  = beta;
-        };
+        const void* const alphaPtr = (std::is_same<T, double>::value)
+                                         ? reinterpret_cast<const void*>(&alpha64)
+                                         : reinterpret_cast<const void*>(&alpha);
+        const void* const betaPtr = (std::is_same<T, double>::value)
+                                        ? reinterpret_cast<const void*>(&beta64)
+                                        : reinterpret_cast<const void*>(&beta);
 
         if(ws_sizeInBytes > 0)
         {
@@ -675,10 +663,10 @@ struct verify_reduce_no_indices
                                 0,
                                 workspace_dev.get(),
                                 ws_sizeInBytes,
-                                static_cast<const void*>(alphaPara),
+                                alphaPtr,
                                 input.desc,
                                 input_dev.get(),
-                                static_cast<const void*>(betaPara),
+                                betaPtr,
                                 output.desc,
                                 output_dev.get());
         }
@@ -689,10 +677,10 @@ struct verify_reduce_no_indices
                                 0,
                                 nullptr,
                                 0,
-                                static_cast<const void*>(alphaPara),
+                                alphaPtr,
                                 input.desc,
                                 input_dev.get(),
-                                static_cast<const void*>(betaPara),
+                                betaPtr,
                                 output.desc,
                                 output_dev.get());
         };

From ecad57f3b82347b26eecc4ef51021df90507c0ca Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 19 May 2021 10:05:44 +0000
Subject: [PATCH 09/20] Update to elminate tidy-checking warnings ...

---
 driver/reduce_driver.hpp     |  8 ++++----
 src/solver/conv_asm_1x1u.cpp |  3 +++
 test/reduce_test.cpp         | 16 ++++++++--------
 test/tensor_vec.cpp          |  6 ++++++
 4 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/driver/reduce_driver.hpp b/driver/reduce_driver.hpp
index 180c7a3f95..9409392111 100644
--- a/driver/reduce_driver.hpp
+++ b/driver/reduce_driver.hpp
@@ -383,11 +383,11 @@ int ReduceDriver<Tgpu, Tref>::RunForwardGPU()
     const double alpha64       = alpha;
     const double beta64        = beta;
     const void* const alphaPtr = std::is_same<Tgpu, double>::value
-                                     ? reinterpret_cast<const void*>(&alpha64)
-                                     : reinterpret_cast<const void*>(&alpha);
+                                     ? static_cast<const void*>(&alpha64)
+                                     : static_cast<const void*>(&alpha);
     const void* const betaPtr = std::is_same<Tgpu, double>::value
-                                    ? reinterpret_cast<const void*>(&beta64)
-                                    : reinterpret_cast<const void*>(&beta);
+                                    ? static_cast<const void*>(&beta64)
+                                    : static_cast<const void*>(&beta);
 
     miopenReduceTensor(GetHandle(),
                        reduceDesc,
diff --git a/src/solver/conv_asm_1x1u.cpp b/src/solver/conv_asm_1x1u.cpp
index cf20612181..33f053f350 100644
--- a/src/solver/conv_asm_1x1u.cpp
+++ b/src/solver/conv_asm_1x1u.cpp
@@ -308,6 +308,9 @@ bool PerformanceConfigConvAsm1x1U::IsValid(const ConvolutionContext& config) con
 
 void PerformanceConfigConvAsm1x1U::HeuristicInit(const ConvolutionContext& config)
 {
+    if(config.in_data_type == miopenDouble)
+        MIOPEN_THROW("Double data type is not supported by ConvAsm1x1U");
+
     const auto elements_in_dword = 4 / GetTypeSize(config.in_data_type);
     read_size                    = 4;
     k_mult                       = 16;
diff --git a/test/reduce_test.cpp b/test/reduce_test.cpp
index 6caf855675..7d8ddc680c 100644
--- a/test/reduce_test.cpp
+++ b/test/reduce_test.cpp
@@ -351,11 +351,11 @@ struct verify_reduce_with_indices
         const double beta64  = beta;
 
         const void* const alphaPtr = (std::is_same<T, double>::value)
-                                         ? reinterpret_cast<const void*>(&alpha64)
-                                         : reinterpret_cast<const void*>(&alpha);
+                                         ? static_cast<const void*>(&alpha64)
+                                         : static_cast<const void*>(&alpha);
         const void* const betaPtr = (std::is_same<T, double>::value)
-                                        ? reinterpret_cast<const void*>(&beta64)
-                                        : reinterpret_cast<const void*>(&beta);
+                                        ? static_cast<const void*>(&beta64)
+                                        : static_cast<const void*>(&beta);
 
         if(ws_sizeInBytes > 0)
         {
@@ -648,11 +648,11 @@ struct verify_reduce_no_indices
         const double beta64  = beta;
 
         const void* const alphaPtr = (std::is_same<T, double>::value)
-                                         ? reinterpret_cast<const void*>(&alpha64)
-                                         : reinterpret_cast<const void*>(&alpha);
+                                         ? static_cast<const void*>(&alpha64)
+                                         : static_cast<const void*>(&alpha);
         const void* const betaPtr = (std::is_same<T, double>::value)
-                                        ? reinterpret_cast<const void*>(&beta64)
-                                        : reinterpret_cast<const void*>(&beta);
+                                        ? static_cast<const void*>(&beta64)
+                                        : static_cast<const void*>(&beta);
 
         if(ws_sizeInBytes > 0)
         {
diff --git a/test/tensor_vec.cpp b/test/tensor_vec.cpp
index 39bd37a87c..e9ae254b08 100644
--- a/test/tensor_vec.cpp
+++ b/test/tensor_vec.cpp
@@ -319,6 +319,12 @@ struct tensor_vec_driver : test_driver
             return;
         }
 
+        if(std::is_same<T, double>::value)
+        {
+            std::cout << "VEC2 transpose does not support double type" << std::endl;
+            return;
+        }
+
         if(!(miopen::float_equal(static_cast<const float>(alpha), 1.0) &&
              miopen::float_equal(static_cast<const float>(beta), 0.0)))
             return;

From ac3de38dac8289f6be806b83e99977139ecc5bfc Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 19 May 2021 10:10:12 +0000
Subject: [PATCH 10/20] Tiny Fix in reduce_driver.cpp

---
 driver/reduce_driver.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/driver/reduce_driver.hpp b/driver/reduce_driver.hpp
index 9409392111..47e9c12e61 100644
--- a/driver/reduce_driver.hpp
+++ b/driver/reduce_driver.hpp
@@ -374,8 +374,8 @@ int ReduceDriver<Tgpu, Tref>::RunForwardGPU()
 
     if(this->need_indices)
     {
-        alpha = reduce::convert_type<Tgpu>(1.0f);
-        beta  = reduce::convert_type<Tgpu>(0.0f);
+        alpha = 1.0f;
+        beta  = 0.0f;
     };
 
     bool output_accumulate = !(reduce::float_equal_one(alpha) && reduce::float_equal_zero(beta));

From 4721ccdbd5240ec3ae0ebed95c0ba8f85bf475b8 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 20 May 2021 12:15:56 +0000
Subject: [PATCH 11/20] Tiny fix in gemm_v2.cpp

---
 src/gemm_v2.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gemm_v2.cpp b/src/gemm_v2.cpp
index 9dd547f44e..cfcbd0680c 100644
--- a/src/gemm_v2.cpp
+++ b/src/gemm_v2.cpp
@@ -297,6 +297,7 @@ miopenStatus_t CallGemmMIOpenTensile(const Handle& handle,
         ptrA            = Data_t(reinterpret_cast<const int8_t*>(A) + a_offset);
         ptrB            = Data_t(reinterpret_cast<const int8_t*>(B) + b_offset);
         ptrC            = Data_t(reinterpret_cast<int32_t*>(C) + c_offset);
+        break;
     case miopenDouble:
         MIOPEN_THROW(miopenStatusBadParm, "miopenDouble data type not supported by MIOpenGEMM.");
     }

From 7e3f738e2d1eb10bd16437090c2b794dd5024601 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 21 May 2021 06:56:43 +0000
Subject: [PATCH 12/20] Add workaround for rocm 3.7 in reduce_test.cpp

---
 test/reduce_test.cpp | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/test/reduce_test.cpp b/test/reduce_test.cpp
index 7d8ddc680c..768fb7aa7d 100644
--- a/test/reduce_test.cpp
+++ b/test/reduce_test.cpp
@@ -46,6 +46,10 @@
 #define WORKAROUND_GPU_MEM_ACCESS_FAULT \
     (HIP_PACKAGE_VERSION_MAJOR == 3 && HIP_PACKAGE_VERSION_MINOR == 7)
 
+/// Not reproducible with ROCm 4.1 and 4.2.
+#define WORKAROUND_GPU_NUMERIC_ERROR \
+    (HIP_PACKAGE_VERSION_MAJOR == 3 && HIP_PACKAGE_VERSION_MINOR == 7)
+
 template <class T, bool toVerifyData>
 struct verify_reduce_with_indices
 {
@@ -782,6 +786,20 @@ struct reduce_driver : test_driver
             }
         }
 #endif
+
+#if WORKAROUND_GPU_NUMERIC_ERROR
+        if(std::is_same<T, double>::value)
+        {
+            if(inLengths == std::vector<std::size_t>{64, 3, 280, 81} &&
+               toReduceDims == std::vector<int>{0, 1, 2, 3} && (reduceOp == 3 || reduceOp == 4) &&
+               indicesOpt == 1)
+            {
+                std::cout << "Workaround: Skipping the test." << std::endl;
+                return;
+            };
+        }
+#endif
+
         miopen::ReduceTensorDescriptor reduceDesc(
             static_cast<miopenReduceTensorOp_t>(reduceOp),
             static_cast<miopenDataType_t>(compTypeVal),

From 83c097faca88e0d63a472aff5e1a0b797f82cdef Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Thu, 27 May 2021 15:44:04 +0000
Subject: [PATCH 13/20] Fix repeated scaling tolerance for half tests in
 reduce_test.cpp

---
 test/reduce_test.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/reduce_test.cpp b/test/reduce_test.cpp
index 768fb7aa7d..df9016022a 100644
--- a/test/reduce_test.cpp
+++ b/test/reduce_test.cpp
@@ -871,6 +871,9 @@ struct reduce_driver : test_driver
             return rand_upper * sign_value * rand_ratio;
         };
 
+        // default tolerance (refer to driver.hpp)
+        this->tolerance = 80;
+
         if(reduceOp == MIOPEN_REDUCE_TENSOR_MUL)
             this->tolerance = 80 * 300;
         else if(reduceOp == MIOPEN_REDUCE_TENSOR_NORM1 || reduceOp == MIOPEN_REDUCE_TENSOR_NORM2)

From e0e2cd7a15e99671205b08871a7b5bf3113ca956 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 28 May 2021 14:31:10 +0000
Subject: [PATCH 14/20] Fix to use unique network-config for two calls

---
 src/reducetensor.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/reducetensor.cpp b/src/reducetensor.cpp
index 47f737534e..b3aba09049 100644
--- a/src/reducetensor.cpp
+++ b/src/reducetensor.cpp
@@ -622,6 +622,8 @@ void ReduceTensorDescriptor::ReduceTensor(const Handle& handle,
 
         std::string param2 = param + " -DCK_PARAM_GRIDSIZE=" + std::to_string(gridSize_2) + " ";
 
+        std::string network_config2 = network_config + "_C2";
+
         // compile option and network config for the second-time call
         const std::vector<size_t> vld_2 = {static_cast<size_t>(blockSize), size_t{1}, size_t{1}};
         const std::vector<size_t> vgd_2 = {
@@ -631,7 +633,7 @@ void ReduceTensorDescriptor::ReduceTensor(const Handle& handle,
         std::string kernel_name2 = "gridwise_generic_reduce_2";
 
         handle.AddKernel(
-            algo_name, network_config, program_name, kernel_name2, vld_2, vgd_2, param2)(
+            algo_name, network_config2, program_name, kernel_name2, vld_2, vgd_2, param2)(
             alphaVal, A, betaVal, C, ws_buf1_global, ws_buf2_bytes_offset, indices);
     };
 };

From 4da22f43f5dee0c68ee2663f2bfa3bb139f28190 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 2 Jun 2021 04:05:55 +0000
Subject: [PATCH 15/20] Avoid the using of -DMIOPEN_USE_FP64=<xxx> when fp64 is
 not used

---
 src/include/miopen/datatype.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/include/miopen/datatype.hpp b/src/include/miopen/datatype.hpp
index 7ff4babb26..8c05ecd325 100644
--- a/src/include/miopen/datatype.hpp
+++ b/src/include/miopen/datatype.hpp
@@ -132,7 +132,8 @@ inline std::string GetDataTypeKernelParams(miopenDataType_t type)
     ss << " -DMIOPEN_USE_BFP16=" << use_bfp16;
     ss << " -DMIOPEN_USE_INT32=" << use_int32;
     ss << " -DMIOPEN_USE_RNE_BFLOAT16=" << use_rne_bfloat16;
-    ss << " -DMIOPEN_USE_FP64=" << use_fp64;
+    if(use_fp64)
+        ss << " -DMIOPEN_USE_FP64=" << use_fp64;
     return ss.str();
 }
 

From bd1610bc043cf1482880be18798de7f29faca973 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 2 Jun 2021 04:07:36 +0000
Subject: [PATCH 16/20] Avoid unnecessary runtime warning for using double in
 test/driver.hpp

---
 test/driver.hpp | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/test/driver.hpp b/test/driver.hpp
index 0730f22c2e..11e7a020f1 100644
--- a/test/driver.hpp
+++ b/test/driver.hpp
@@ -1115,10 +1115,7 @@ void test_drive_impl_2(std::string program_name, std::vector<std::string> as)
         return;
     }
 
-    if(program_name.find("reduce") != std::string::npos)
-        set_driver_datatype<Driver>(d, arg_map);
-    else
-        throw std::runtime_error("Double is not supported");
+    set_driver_datatype<Driver>(d, arg_map);
 
     std::vector<std::vector<std::string>> configs = build_configs<Driver>(d, arg_map, keywords);
     size_t config_count                           = configs.size();
@@ -1175,10 +1172,7 @@ void test_drive_impl_1(std::string program_name, std::vector<std::string> as)
     }
     else if(arg_map.count("--double") > 0)
     {
-        if(program_name.find("reduce") == std::string::npos)
-            throw std::runtime_error("Double is not supported");
-        else
-            d.type = miopenDouble;
+        d.type = miopenDouble;
     }
     else
     {

From 7d61d2a6257d53bce832fe9d913cf230bfde1900 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Wed, 2 Jun 2021 06:44:22 +0000
Subject: [PATCH 17/20] Fix for tidy

---
 src/include/miopen/datatype.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/include/miopen/datatype.hpp b/src/include/miopen/datatype.hpp
index 8c05ecd325..05e4042f58 100644
--- a/src/include/miopen/datatype.hpp
+++ b/src/include/miopen/datatype.hpp
@@ -132,7 +132,7 @@ inline std::string GetDataTypeKernelParams(miopenDataType_t type)
     ss << " -DMIOPEN_USE_BFP16=" << use_bfp16;
     ss << " -DMIOPEN_USE_INT32=" << use_int32;
     ss << " -DMIOPEN_USE_RNE_BFLOAT16=" << use_rne_bfloat16;
-    if(use_fp64)
+    if(use_fp64 != 0)
         ss << " -DMIOPEN_USE_FP64=" << use_fp64;
     return ss.str();
 }

From eae5f7f1e5241fe4f01e84b65a2201479fee0bb2 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 4 Jun 2021 17:11:51 +0800
Subject: [PATCH 18/20] Remove un-needed setting in test/CMakeLists.txt

---
 test/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 8cd36dd470..396f1d214d 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -217,7 +217,6 @@ set( LONG_TESTS
     pooling3d.cpp
     soft_max.cpp
     lrn_test.cpp
-    reduce_test.cpp
     )
 
 foreach(TEST ${TESTS})

From 40f3ee74db56920e944270c645e39bf5b0052353 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 4 Jun 2021 17:32:04 +0800
Subject: [PATCH 19/20] Add custom_test for testing reduce double

---
 test/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 396f1d214d..17906baf34 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1052,3 +1052,7 @@ if(MIOPEN_TEST_CONV)
     COMMAND	$<TARGET_FILE:test_conv2d>	--verbose	--input	1	48	7	7	--weights	1	48	5	5	--pads_strides_dilations	0	0	4	4	1	1
 )
 endif()
+
+if(MIOPEN_TEST_FLOAT)
+    add_custom_test(test_reduce_double SKIP_UNLESS_ALL FLOAT_ENABLED COMMAND  $<TARGET_FILE:test_reduce_test> --double --all)
+endif()

From b3626a599f2731cb55aa3e0e434006b40b1424f6 Mon Sep 17 00:00:00 2001
From: Qianfeng Zhang <Qianfeng.Zhang@amd.com>
Date: Fri, 4 Jun 2021 22:59:23 +0800
Subject: [PATCH 20/20] Update to the test_reduce_double custom_test

---
 test/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 17906baf34..3d25c23954 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1054,5 +1054,5 @@ if(MIOPEN_TEST_CONV)
 endif()
 
 if(MIOPEN_TEST_FLOAT)
-    add_custom_test(test_reduce_double SKIP_UNLESS_ALL FLOAT_ENABLED COMMAND  $<TARGET_FILE:test_reduce_test> --double --all)
+    add_custom_test(test_reduce_double SKIP_UNLESS_ALL GFX908_ENABLED COMMAND  $<TARGET_FILE:test_reduce_test> --double --all --verbose)
 endif()