Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generic Tensor Reduction for Double (SWDEV-284915) #934

Merged
merged 23 commits into from
Jun 6, 2021
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
d666d72
Add miopenDouble data type ID
qianfengz May 17, 2021
031d47c
Update the miopenReduceTensor() host C++ interface implementation to …
qianfengz May 17, 2021
0a1fbff
Tiny update in reduction kernel layer to use built-in shuffle for Double
qianfengz May 17, 2021
fbc28c4
Update in reduce_driver to support Reduction on Double
qianfengz May 17, 2021
5bd4bc4
Update in reduce_test to support Reduction on Double
qianfengz May 17, 2021
e18aff8
Update to remove compiler warnings caused by the adding of miopenDouble
qianfengz May 17, 2021
6f849d9
Formatting
atamazov May 17, 2021
30d5faa
Update to eliminate tidy-checking warnings
qianfengz May 19, 2021
7571a74
Merge branch 'develop' into reduction-fp64
May 19, 2021
ecad57f
Update to eliminate tidy-checking warnings ...
qianfengz May 19, 2021
ac3de38
Tiny Fix in reduce_driver.cpp
qianfengz May 19, 2021
4721ccd
Tiny fix in gemm_v2.cpp
qianfengz May 20, 2021
7e3f738
Add workaround for rocm 3.7 in reduce_test.cpp
qianfengz May 21, 2021
83c097f
Fix repeated scaling tolerance for half tests in reduce_test.cpp
qianfengz May 27, 2021
bed2203
Merge branch 'develop' into reduction-fp64
qianfengz May 27, 2021
e0e2cd7
Fix to use unique network-config for two calls
qianfengz May 28, 2021
4da22f4
Avoid using -DMIOPEN_USE_FP64=&lt;xxx&gt; when fp64 is not used
qianfengz Jun 2, 2021
bd1610b
Avoid unnecessary runtime warning for using double in test/driver.hpp
qianfengz Jun 2, 2021
7d61d2a
Fix for tidy
qianfengz Jun 2, 2021
ecef4f2
Merge branch 'develop' into reduction-fp64
qianfengz Jun 4, 2021
eae5f7f
Remove unneeded setting in test/CMakeLists.txt
qianfengz Jun 4, 2021
40f3ee7
Add custom_test for testing reduce double
qianfengz Jun 4, 2021
b3626a5
Update to the test_reduce_double custom_test
qianfengz Jun 4, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions driver/driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ void PadBufferSize(size_t& sz, int datatype_sz)
printf(
"Supported Base Arguments: conv[fp16|int8|bfp16], CBAInfer[fp16], pool[fp16], lrn[fp16], "
"activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm, ctc, dropout[fp16], "
"tensorop[fp16], reduce[fp16]\n");
"tensorop[fp16], reduce[fp16,fp64]\n");
exit(0);
}

Expand All @@ -150,7 +150,7 @@ std::string ParseBaseArg(int argc, char* argv[])
arg != "softmax" && arg != "softmaxfp16" && arg != "bnorm" && arg != "bnormfp16" &&
arg != "rnn" && arg != "rnnfp16" && arg != "gemm" /*&& arg != "gemmfp16"*/ && arg != "ctc" &&
arg != "dropout" && arg != "dropoutfp16" && arg != "tensorop" && arg != "tensoropfp16" &&
arg != "reduce" && arg != "reducefp16" && arg != "--version")
arg != "reduce" && arg != "reducefp16" && arg != "reducefp64" && arg != "--version")
{
printf("Invalid Base Input Argument\n");
Usage();
Expand Down
4 changes: 4 additions & 0 deletions driver/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,10 @@ int main(int argc, char* argv[])
{
drv = new ReduceDriver<float16, float>();
}
else if(base_arg == "reducefp64")
{
drv = new ReduceDriver<double, double>();
}
else
{
printf("Incorrect BaseArg\n");
Expand Down
5 changes: 3 additions & 2 deletions driver/miopen_Reduction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,9 @@ class miopenReductionHost
RunImpl<Tref>(alpha, in_data, beta, out_data, indices);
else
RunImpl<float16>(alpha, in_data, beta, out_data, indices);
};

}
else if(compTypeVal == miopenDouble)
RunImpl<double>(alpha, in_data, beta, out_data, indices);
return;
};

Expand Down
53 changes: 36 additions & 17 deletions driver/reduce_driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,12 @@
#include <miopen/miopen.h>
#include <miopen/reduce_common.hpp>
#include <miopen/tensor.hpp>
#include <miopen/bfloat16.hpp>
#include <numeric>
#include <vector>
#include <string>
#include <cassert>
#include <type_traits>
#include <half.hpp>
#include "random.hpp"

#include "miopen_Reduction.hpp"
Expand All @@ -58,7 +59,10 @@ class ReduceDriver : public Driver

miopenCreateReduceTensorDescriptor(&reduceDesc);

data_type = (sizeof(Tgpu) == 4) ? miopenFloat : miopenHalf;
if(std::is_same<Tgpu, double>::value)
data_type = miopenDouble;
else
data_type = (sizeof(Tgpu) == 4) ? miopenFloat : miopenHalf;
}

int AddCmdLineArgs() override;
Expand Down Expand Up @@ -289,6 +293,9 @@ int ReduceDriver<Tgpu, Tref>::SetReduceTensorDescriptorFromCmdLineArgs()
(reduceOp == MIOPEN_REDUCE_TENSOR_MIN || reduceOp == MIOPEN_REDUCE_TENSOR_MAX ||
reduceOp == MIOPEN_REDUCE_TENSOR_AMAX);

if(std::is_same<Tgpu, double>::value)
compType = miopenDouble;

return (miopenSetReduceTensorDescriptor(
reduceDesc, reduceOp, compType, nanOpt, indicesOpt, indicesType));
}
Expand Down Expand Up @@ -367,22 +374,31 @@ int ReduceDriver<Tgpu, Tref>::RunForwardGPU()

if(this->need_indices)
{
alpha = reduce::convert_type<Tgpu>(1.0f);
beta = reduce::convert_type<Tgpu>(0.0f);
alpha = 1.0f;
beta = 0.0f;
};

bool output_accumulate = !(reduce::float_equal_one(alpha) && reduce::float_equal_zero(beta));

const double alpha64 = alpha;
const double beta64 = beta;
const void* const alphaPtr = std::is_same<Tgpu, double>::value
? static_cast<const void*>(&alpha64)
: static_cast<const void*>(&alpha);
const void* const betaPtr = std::is_same<Tgpu, double>::value
? static_cast<const void*>(&beta64)
: static_cast<const void*>(&beta);

miopenReduceTensor(GetHandle(),
reduceDesc,
this->need_indices ? indices_dev->GetMem() : nullptr, // indices
this->need_indices ? indices_sizeInBytes : 0, // indices size in bytes
ws_sizeInBytes > 0 ? ws_dev->GetMem() : nullptr, // workspace
ws_sizeInBytes, // workspace size in bytes
&alpha,
alphaPtr,
inputTensor,
in_dev->GetMem(),
&beta,
betaPtr,
outputTensor,
out_dev->GetMem());

Expand All @@ -404,10 +420,10 @@ int ReduceDriver<Tgpu, Tref>::RunForwardGPU()
this->need_indices ? indices_sizeInBytes : 0, // indices size in bytes
ws_sizeInBytes > 0 ? ws_dev->GetMem() : nullptr, // workspace
ws_sizeInBytes, // workspace size in bytes
&alpha,
alphaPtr,
inputTensor,
in_dev->GetMem(),
&beta,
betaPtr,
outputTensor,
out_dev->GetMem());
}
Expand Down Expand Up @@ -455,10 +471,8 @@ int ReduceDriver<Tgpu, Tref>::VerifyForward()
this->dimsInvariant,
this->dimsToReduce);

auto alpha =
reduce::convert_type<Tgpu>(static_cast<float>(this->inflags.GetValueDouble("alpha")));
auto beta =
reduce::convert_type<Tgpu>(static_cast<float>(this->inflags.GetValueDouble("beta")));
auto alpha = static_cast<float>(this->inflags.GetValueDouble("alpha"));
auto beta = static_cast<float>(this->inflags.GetValueDouble("beta"));

auto reduceOp = static_cast<miopenReduceTensorOp_t>(inflags.GetValueInt("ReduceOp"));

Expand All @@ -470,14 +484,19 @@ int ReduceDriver<Tgpu, Tref>::VerifyForward()

hostReduction.Run(alpha, in.data(), beta, outhost.data(), outhost_indices.data());

auto error = miopen::rms_range(outhost, out);
const double tolerance =
std::is_same<Tgpu, float16>::value || reduceOp == MIOPEN_REDUCE_TENSOR_NORM2 ? 2e-3
: 1.5e-4;
auto error = miopen::rms_range(outhost, out);
double tolerance = 1.5e-4;

if(std::is_same<Tgpu, half_float::half>::value)
tolerance *= 4.0;

if(std::is_same<Tgpu, float>::value && reduceOp == MIOPEN_REDUCE_TENSOR_NORM2)
tolerance *= 12.0;

if(error > tolerance)
{
std::cout << "ReduceTensor() Failed: " << error << "\n";
std::cout << "ReduceTensor() Failed with error = " << error
<< " , tolerance = " << tolerance << "\n";
}
else
{
Expand Down
1 change: 1 addition & 0 deletions include/miopen/miopen.h
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,7 @@ typedef enum {
4, /*!< Pack of four 8-bit int points in NCHW_VECT_C format (Partially supported) */
miopenBFloat16 = 5, /*!< 16-bit binary floating point (8-bit exponent, 7-bit fraction)
(Partially supported) */
miopenDouble = 6, /*!< 64-bit floating point (Partially supported) */
} miopenDataType_t;

/*! @ingroup pooling
Expand Down
24 changes: 24 additions & 0 deletions src/gemm_v2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,9 @@ miopenStatus_t CallGemmMIOpenTensile(const Handle& handle,
ptrA = Data_t(reinterpret_cast<const int8_t*>(A) + a_offset);
ptrB = Data_t(reinterpret_cast<const int8_t*>(B) + b_offset);
ptrC = Data_t(reinterpret_cast<int32_t*>(C) + c_offset);
break;
atamazov marked this conversation as resolved.
Show resolved Hide resolved
case miopenDouble:
MIOPEN_THROW(miopenStatusBadParm, "miopenDouble data type not supported by MIOpenGEMM.");
}
if(gemm_desc.dataType == miopenInt8 || gemm_desc.dataType == miopenInt8x4)
{
Expand Down Expand Up @@ -570,6 +573,13 @@ miopenStatus_t CallGemm(const Handle& handle,
0);
}
break;

case miopenDouble:
{
MIOPEN_THROW(miopenStatusBadParm,
"miopenDouble data type not supported by MIOpenGEMM.");
};
break;
}

if(handle.IsProfilingEnabled())
Expand Down Expand Up @@ -887,6 +897,13 @@ miopenStatus_t CallGemmStridedBatched(const Handle& handle,
0);
}
break;

case miopenDouble:
{
MIOPEN_THROW(miopenStatusBadParm,
"miopenDouble data type not supported by MIOpenGEMM.");
}
break;
}

if(handle.IsProfilingEnabled())
Expand Down Expand Up @@ -1123,6 +1140,13 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle,
}
}
break;

case miopenDouble:
{
MIOPEN_THROW(miopenStatusBadParm,
"miopenDouble data type not supported by MIOpenGEMM.");
}
break;
}

if(handle.IsProfilingEnabled())
Expand Down
1 change: 1 addition & 0 deletions src/include/miopen/conv/problem_description.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ inline std::string GetDataTypeName(miopenDataType_t data_type)
case miopenInt8x4: return "INT8x4";
case miopenInt32: return "INT32";
case miopenBFloat16: return "BF16";
case miopenDouble: return "FP64";
}

return "Unknown(" + std::to_string(data_type) + ")";
Expand Down
6 changes: 6 additions & 0 deletions src/include/miopen/datatype.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ inline std::string GetDataType(miopenDataType_t type)
case miopenInt32: { type_str = "int";
}
break;
case miopenDouble: { type_str = "double";
}
break;
}
return type_str;
}
Expand Down Expand Up @@ -104,6 +107,7 @@ inline std::string GetDataTypeKernelParams(miopenDataType_t type)
int use_int8x4 = 0;
int use_int32 = 0;
int use_bfp16 = 0;
int use_fp64 = 0;
const int use_rne_bfloat16 = MIOPEN_USE_RNE_BFLOAT16;

switch(type)
Expand All @@ -114,6 +118,7 @@ inline std::string GetDataTypeKernelParams(miopenDataType_t type)
case miopenInt8x4: use_int8x4 = 1; break;
case miopenBFloat16: use_bfp16 = 1; break;
case miopenInt32: use_int32 = 1; break;
case miopenDouble: use_fp64 = 1; break;
default:
MIOPEN_THROW("Only float, half, bfloat16, int8, int8x4 data type is supported.");
break;
Expand All @@ -127,6 +132,7 @@ inline std::string GetDataTypeKernelParams(miopenDataType_t type)
ss << " -DMIOPEN_USE_BFP16=" << use_bfp16;
ss << " -DMIOPEN_USE_INT32=" << use_int32;
ss << " -DMIOPEN_USE_RNE_BFLOAT16=" << use_rne_bfloat16;
ss << " -DMIOPEN_USE_FP64=" << use_fp64;
atamazov marked this conversation as resolved.
Show resolved Hide resolved
return ss.str();
}

Expand Down
1 change: 1 addition & 0 deletions src/include/miopen/tensor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ inline std::size_t GetTypeSize(miopenDataType_t d)
case miopenBFloat16: return 2;
case miopenInt8x4:
case miopenInt8: return 1;
case miopenDouble: return 8;
}
MIOPEN_THROW("Unknown data type");
}
Expand Down
5 changes: 5 additions & 0 deletions src/include/miopen/visit_float.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,11 @@ void visit_float(miopenDataType_t t, F f)
f(as_float<int>{});
break;
}
case miopenDouble:
{
f(as_float<double>{});
break;
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,8 @@ struct WarpReduce
{
using compType = typename opReduce::dataType;
using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;
constexpr static bool have_builtin_shuffle = std::is_same<compType, float>::value;
constexpr static bool have_builtin_shuffle =
std::is_same<compType, float>::value || std::is_same<compType, double>::value;

// This interface does not accumulate on indices
__device__ static void Reduce(const DataType* p_thread_buffer, compType& accuData)
Expand Down
3 changes: 3 additions & 0 deletions src/ocl/tensorocl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1924,6 +1924,9 @@ std::string GetCastTensorBuildOptionFromType(const std::string& buildOption, mio
case miopenHalf: return option += "2";
case miopenFloat: return option += "3";
case miopenBFloat16: return option += "4";
case miopenDouble:
// TODO
MIOPEN_THROW(miopenStatusBadParm, "miopenDouble data type not supported in cast tensor.");
case miopenInt8x4:
MIOPEN_THROW(miopenStatusBadParm, "miopenInt8x4 data type not supported in cast tensor.");
default: MIOPEN_THROW(miopenStatusBadParm, "Invalid data type in cast tensor desc.");
Expand Down
23 changes: 20 additions & 3 deletions src/reducetensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,12 +224,13 @@ inline int GetDataTypeSize(miopenDataType_t t)
{
case miopenHalf: return (2);
case miopenFloat: return (4);
case miopenDouble: return (8);
case miopenInt8: return (1);
case miopenInt8x4: return (4);
case miopenBFloat16: return (2);
case miopenInt32: return (4);
default:
MIOPEN_THROW("Only float, half, bfloat16, int8, int8x4 data type is supported.");
MIOPEN_THROW("Only float, half, double, bfloat16, int8, int8x4 data type is supported.");
break;
};
};
Expand All @@ -241,6 +242,7 @@ inline int GetDataTypeId(miopenDataType_t t)
case miopenHalf: return (static_cast<int>('H'));
case miopenFloat: return (static_cast<int>('F'));
case miopenBFloat16: return (static_cast<int>('B'));
case miopenDouble: return (static_cast<int>('D'));
case miopenInt8:
case miopenInt8x4:
case miopenInt32: return (static_cast<int>('O'));
Expand Down Expand Up @@ -568,6 +570,17 @@ void ReduceTensorDescriptor::ReduceTensor(const Handle& handle,
#if WORKAROUND_MIOPEN_ISSUE_557
if(StartsWith(handle.GetDeviceName(), "gfx10"))
param += " -DCK_USE_AMD_BUFFER_ADDRESSING=0 ";
else
{
if(srcDataType == miopenDouble)
// TODO: support from composable kernel utility for using AMD Buffer Addressing for
// double
param += " -DCK_USE_AMD_BUFFER_ADDRESSING=0 ";
};
#else
if(srcDataType == miopenDouble)
// TODO: support from composable kernel utility for using AMD Buffer Addressing for double
param += " -DCK_USE_AMD_BUFFER_ADDRESSING=0 ";
#endif

std::string program_name = "gridwise_generic_reduction.cpp";
Expand All @@ -590,8 +603,12 @@ void ReduceTensorDescriptor::ReduceTensor(const Handle& handle,
const std::vector<size_t> vgd_1 = {
static_cast<size_t>(gridSize * blockSize), size_t{1}, size_t{1}};

float alphaVal = *reinterpret_cast<const float*>(alpha);
float betaVal = *reinterpret_cast<const float*>(beta);
float alphaVal = (srcDataType == miopenDouble)
? static_cast<float>(*reinterpret_cast<const double*>(alpha))
: *reinterpret_cast<const float*>(alpha);
float betaVal = (srcDataType == miopenDouble)
? static_cast<float>(*reinterpret_cast<const double*>(beta))
: *reinterpret_cast<const float*>(beta);

handle.AddKernel(algo_name, network_config, program_name, kernel_name1, vld_1, vgd_1, param)(
alphaVal, A, betaVal, C, ws_buf1_global, ws_buf2_bytes_offset, indices);
Expand Down
3 changes: 3 additions & 0 deletions src/solver/conv_asm_1x1u.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,9 @@ bool PerformanceConfigConvAsm1x1U::IsValid(const ConvolutionContext& config) con

void PerformanceConfigConvAsm1x1U::HeuristicInit(const ConvolutionContext& config)
{
if(config.in_data_type == miopenDouble)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems like this change is not needed. IsApplicable() of this Solver should refuse configs with double.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

?

MIOPEN_THROW("Double data type is not supported by ConvAsm1x1U");

const auto elements_in_dword = 4 / GetTypeSize(config.in_data_type);
read_size = 4;
k_mult = 16;
Expand Down
3 changes: 2 additions & 1 deletion src/tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,8 @@ std::size_t TensorDescriptor::GetNumBytes() const
case miopenBFloat16:
case miopenHalf: typesize = 2; break;
case miopenInt32:
case miopenFloat: typesize = 4; break;
case miopenFloat: typesize = 4; break;
case miopenDouble: typesize = 8; break;
}
return typesize * this->GetElementSpace();
}
Expand Down
Loading