Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generic Tensor Reduction for Double (SWDEV-284915) #934

Merged
merged 23 commits into from
Jun 6, 2021
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
d666d72
Add miopenDouble data type ID
qianfengz May 17, 2021
031d47c
Update the miopenReduceTensor() host C++ interface implementation to …
qianfengz May 17, 2021
0a1fbff
Tiny update in reduction kernel layer to use built-in shuffle for Double
qianfengz May 17, 2021
fbc28c4
Update in reduce_driver to support Reduction on Double
qianfengz May 17, 2021
5bd4bc4
Update in reduce_test to support Reduction on Double
qianfengz May 17, 2021
e18aff8
Update to remove compiler warnings caused by the adding of miopenDouble
qianfengz May 17, 2021
6f849d9
Formatting
atamazov May 17, 2021
30d5faa
Update to eliminate tidy-checking warnings
qianfengz May 19, 2021
7571a74
Merge branch 'develop' into reduction-fp64
May 19, 2021
ecad57f
Update to eliminate tidy-checking warnings ...
qianfengz May 19, 2021
ac3de38
Tiny Fix in reduce_driver.cpp
qianfengz May 19, 2021
4721ccd
Tiny fix in gemm_v2.cpp
qianfengz May 20, 2021
7e3f738
Add workaround for rocm 3.7 in reduce_test.cpp
qianfengz May 21, 2021
83c097f
Fix repeated scaling tolerance for half tests in reduce_test.cpp
qianfengz May 27, 2021
bed2203
Merge branch 'develop' into reduction-fp64
qianfengz May 27, 2021
e0e2cd7
Fix to use unique network-config for two calls
qianfengz May 28, 2021
4da22f4
Avoid using -DMIOPEN_USE_FP64=&lt;xxx&gt; when fp64 is not used
qianfengz Jun 2, 2021
bd1610b
Avoid unnecessary runtime warning for using double in test/driver.hpp
qianfengz Jun 2, 2021
7d61d2a
Fix for tidy
qianfengz Jun 2, 2021
ecef4f2
Merge branch 'develop' into reduction-fp64
qianfengz Jun 4, 2021
eae5f7f
Remove unneeded setting in test/CMakeLists.txt
qianfengz Jun 4, 2021
40f3ee7
Add custom_test for testing reduce double
qianfengz Jun 4, 2021
b3626a5
Update to the test_reduce_double custom_test
qianfengz Jun 4, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions driver/driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ void PadBufferSize(size_t& sz, int datatype_sz)
printf(
"Supported Base Arguments: conv[fp16|int8|bfp16], CBAInfer[fp16], pool[fp16], lrn[fp16], "
"activ[fp16], softmax[fp16], bnorm[fp16], rnn[fp16], gemm, ctc, dropout[fp16], "
"tensorop[fp16], reduce[fp16]\n");
"tensorop[fp16], reduce[fp16,fp64]\n");
exit(0);
}

Expand All @@ -150,7 +150,7 @@ std::string ParseBaseArg(int argc, char* argv[])
arg != "softmax" && arg != "softmaxfp16" && arg != "bnorm" && arg != "bnormfp16" &&
arg != "rnn" && arg != "rnnfp16" && arg != "gemm" /*&& arg != "gemmfp16"*/ && arg != "ctc" &&
arg != "dropout" && arg != "dropoutfp16" && arg != "tensorop" && arg != "tensoropfp16" &&
arg != "reduce" && arg != "reducefp16" && arg != "--version")
arg != "reduce" && arg != "reducefp16" && arg != "reducefp64" && arg != "--version")
{
printf("Invalid Base Input Argument\n");
Usage();
Expand Down
4 changes: 4 additions & 0 deletions driver/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,10 @@ int main(int argc, char* argv[])
{
drv = new ReduceDriver<float16, float>();
}
else if(base_arg == "reducefp64")
{
drv = new ReduceDriver<double, double>();
}
else
{
printf("Incorrect BaseArg\n");
Expand Down
5 changes: 3 additions & 2 deletions driver/miopen_Reduction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,9 @@ class miopenReductionHost
RunImpl<Tref>(alpha, in_data, beta, out_data, indices);
else
RunImpl<float16>(alpha, in_data, beta, out_data, indices);
};

}
else if(compTypeVal == miopenDouble)
RunImpl<double>(alpha, in_data, beta, out_data, indices);
return;
};

Expand Down
53 changes: 36 additions & 17 deletions driver/reduce_driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,12 @@
#include <miopen/miopen.h>
#include <miopen/reduce_common.hpp>
#include <miopen/tensor.hpp>
#include <miopen/bfloat16.hpp>
#include <numeric>
#include <vector>
#include <string>
#include <cassert>
#include <type_traits>
#include <half.hpp>
#include "random.hpp"

#include "miopen_Reduction.hpp"
Expand All @@ -58,7 +59,10 @@ class ReduceDriver : public Driver

miopenCreateReduceTensorDescriptor(&reduceDesc);

data_type = (sizeof(Tgpu) == 4) ? miopenFloat : miopenHalf;
if(std::is_same<Tgpu, double>::value)
data_type = miopenDouble;
else
data_type = (sizeof(Tgpu) == 4) ? miopenFloat : miopenHalf;
}

int AddCmdLineArgs() override;
Expand Down Expand Up @@ -289,6 +293,9 @@ int ReduceDriver<Tgpu, Tref>::SetReduceTensorDescriptorFromCmdLineArgs()
(reduceOp == MIOPEN_REDUCE_TENSOR_MIN || reduceOp == MIOPEN_REDUCE_TENSOR_MAX ||
reduceOp == MIOPEN_REDUCE_TENSOR_AMAX);

if(std::is_same<Tgpu, double>::value)
compType = miopenDouble;

return (miopenSetReduceTensorDescriptor(
reduceDesc, reduceOp, compType, nanOpt, indicesOpt, indicesType));
}
Expand Down Expand Up @@ -367,22 +374,31 @@ int ReduceDriver<Tgpu, Tref>::RunForwardGPU()

if(this->need_indices)
{
alpha = reduce::convert_type<Tgpu>(1.0f);
beta = reduce::convert_type<Tgpu>(0.0f);
alpha = 1.0f;
beta = 0.0f;
};

bool output_accumulate = !(reduce::float_equal_one(alpha) && reduce::float_equal_zero(beta));

const double alpha64 = alpha;
const double beta64 = beta;
const void* const alphaPtr = std::is_same<Tgpu, double>::value
? static_cast<const void*>(&alpha64)
: static_cast<const void*>(&alpha);
const void* const betaPtr = std::is_same<Tgpu, double>::value
? static_cast<const void*>(&beta64)
: static_cast<const void*>(&beta);

miopenReduceTensor(GetHandle(),
reduceDesc,
this->need_indices ? indices_dev->GetMem() : nullptr, // indices
this->need_indices ? indices_sizeInBytes : 0, // indices size in bytes
ws_sizeInBytes > 0 ? ws_dev->GetMem() : nullptr, // workspace
ws_sizeInBytes, // workspace size in bytes
&alpha,
alphaPtr,
inputTensor,
in_dev->GetMem(),
&beta,
betaPtr,
outputTensor,
out_dev->GetMem());

Expand All @@ -404,10 +420,10 @@ int ReduceDriver<Tgpu, Tref>::RunForwardGPU()
this->need_indices ? indices_sizeInBytes : 0, // indices size in bytes
ws_sizeInBytes > 0 ? ws_dev->GetMem() : nullptr, // workspace
ws_sizeInBytes, // workspace size in bytes
&alpha,
alphaPtr,
inputTensor,
in_dev->GetMem(),
&beta,
betaPtr,
outputTensor,
out_dev->GetMem());
}
Expand Down Expand Up @@ -455,10 +471,8 @@ int ReduceDriver<Tgpu, Tref>::VerifyForward()
this->dimsInvariant,
this->dimsToReduce);

auto alpha =
reduce::convert_type<Tgpu>(static_cast<float>(this->inflags.GetValueDouble("alpha")));
auto beta =
reduce::convert_type<Tgpu>(static_cast<float>(this->inflags.GetValueDouble("beta")));
auto alpha = static_cast<float>(this->inflags.GetValueDouble("alpha"));
auto beta = static_cast<float>(this->inflags.GetValueDouble("beta"));

auto reduceOp = static_cast<miopenReduceTensorOp_t>(inflags.GetValueInt("ReduceOp"));

Expand All @@ -470,14 +484,19 @@ int ReduceDriver<Tgpu, Tref>::VerifyForward()

hostReduction.Run(alpha, in.data(), beta, outhost.data(), outhost_indices.data());

auto error = miopen::rms_range(outhost, out);
const double tolerance =
std::is_same<Tgpu, float16>::value || reduceOp == MIOPEN_REDUCE_TENSOR_NORM2 ? 2e-3
: 1.5e-4;
auto error = miopen::rms_range(outhost, out);
double tolerance = 1.5e-4;

if(std::is_same<Tgpu, half_float::half>::value)
tolerance *= 4.0;

if(std::is_same<Tgpu, float>::value && reduceOp == MIOPEN_REDUCE_TENSOR_NORM2)
tolerance *= 12.0;

if(error > tolerance)
{
std::cout << "ReduceTensor() Failed: " << error << "\n";
std::cout << "ReduceTensor() Failed with error = " << error
<< " , tolerance = " << tolerance << "\n";
}
else
{
Expand Down
1 change: 1 addition & 0 deletions include/miopen/miopen.h
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,7 @@ typedef enum {
4, /*!< Pack of four 8-bit int points in NCHW_VECT_C format (Partially supported) */
miopenBFloat16 = 5, /*!< 16-bit binary floating point (8-bit exponent, 7-bit fraction)
(Partially supported) */
miopenDouble = 6, /*!< 64-bit floating point (Partially supported) */
} miopenDataType_t;

/*! @ingroup pooling
Expand Down
24 changes: 24 additions & 0 deletions src/gemm_v2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,9 @@ miopenStatus_t CallGemmMIOpenTensile(const Handle& handle,
ptrA = Data_t(reinterpret_cast<const int8_t*>(A) + a_offset);
ptrB = Data_t(reinterpret_cast<const int8_t*>(B) + b_offset);
ptrC = Data_t(reinterpret_cast<int32_t*>(C) + c_offset);
break;
atamazov marked this conversation as resolved.
Show resolved Hide resolved
case miopenDouble:
MIOPEN_THROW(miopenStatusBadParm, "miopenDouble data type not supported by MIOpenGEMM.");
}
if(gemm_desc.dataType == miopenInt8 || gemm_desc.dataType == miopenInt8x4)
{
Expand Down Expand Up @@ -570,6 +573,13 @@ miopenStatus_t CallGemm(const Handle& handle,
0);
}
break;

case miopenDouble:
{
MIOPEN_THROW(miopenStatusBadParm,
"miopenDouble data type not supported by MIOpenGEMM.");
};
break;
}

if(handle.IsProfilingEnabled())
Expand Down Expand Up @@ -887,6 +897,13 @@ miopenStatus_t CallGemmStridedBatched(const Handle& handle,
0);
}
break;

case miopenDouble:
{
MIOPEN_THROW(miopenStatusBadParm,
"miopenDouble data type not supported by MIOpenGEMM.");
}
break;
}

if(handle.IsProfilingEnabled())
Expand Down Expand Up @@ -1123,6 +1140,13 @@ miopenStatus_t CallGemmStridedBatchedSequential(const Handle& handle,
}
}
break;

case miopenDouble:
{
MIOPEN_THROW(miopenStatusBadParm,
"miopenDouble data type not supported by MIOpenGEMM.");
}
break;
}

if(handle.IsProfilingEnabled())
Expand Down
1 change: 1 addition & 0 deletions src/include/miopen/conv/problem_description.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ inline std::string GetDataTypeName(miopenDataType_t data_type)
case miopenInt8x4: return "INT8x4";
case miopenInt32: return "INT32";
case miopenBFloat16: return "BF16";
case miopenDouble: return "FP64";
}

return "Unknown(" + std::to_string(data_type) + ")";
Expand Down
6 changes: 6 additions & 0 deletions src/include/miopen/datatype.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ inline std::string GetDataType(miopenDataType_t type)
case miopenInt32: { type_str = "int";
}
break;
case miopenDouble: { type_str = "double";
}
break;
}
return type_str;
}
Expand Down Expand Up @@ -104,6 +107,7 @@ inline std::string GetDataTypeKernelParams(miopenDataType_t type)
int use_int8x4 = 0;
int use_int32 = 0;
int use_bfp16 = 0;
int use_fp64 = 0;
const int use_rne_bfloat16 = MIOPEN_USE_RNE_BFLOAT16;

switch(type)
Expand All @@ -114,6 +118,7 @@ inline std::string GetDataTypeKernelParams(miopenDataType_t type)
case miopenInt8x4: use_int8x4 = 1; break;
case miopenBFloat16: use_bfp16 = 1; break;
case miopenInt32: use_int32 = 1; break;
case miopenDouble: use_fp64 = 1; break;
default:
MIOPEN_THROW("Only float, half, bfloat16, int8, int8x4 data type is supported.");
break;
Expand All @@ -127,6 +132,7 @@ inline std::string GetDataTypeKernelParams(miopenDataType_t type)
ss << " -DMIOPEN_USE_BFP16=" << use_bfp16;
ss << " -DMIOPEN_USE_INT32=" << use_int32;
ss << " -DMIOPEN_USE_RNE_BFLOAT16=" << use_rne_bfloat16;
ss << " -DMIOPEN_USE_FP64=" << use_fp64;
atamazov marked this conversation as resolved.
Show resolved Hide resolved
return ss.str();
}

Expand Down
1 change: 1 addition & 0 deletions src/include/miopen/tensor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ inline std::size_t GetTypeSize(miopenDataType_t d)
case miopenBFloat16: return 2;
case miopenInt8x4:
case miopenInt8: return 1;
case miopenDouble: return 8;
}
MIOPEN_THROW("Unknown data type");
}
Expand Down
5 changes: 5 additions & 0 deletions src/include/miopen/visit_float.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,11 @@ void visit_float(miopenDataType_t t, F f)
f(as_float<int>{});
break;
}
case miopenDouble:
{
f(as_float<double>{});
break;
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,8 @@ struct WarpReduce
{
using compType = typename opReduce::dataType;
using binop = detail::binop_with_nan_check<nanPropaOpt, opReduce, compType>;
constexpr static bool have_builtin_shuffle = std::is_same<compType, float>::value;
constexpr static bool have_builtin_shuffle =
std::is_same<compType, float>::value || std::is_same<compType, double>::value;

// This interface does not accumulate on indices
__device__ static void Reduce(const DataType* p_thread_buffer, compType& accuData)
Expand Down
3 changes: 3 additions & 0 deletions src/ocl/tensorocl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1924,6 +1924,9 @@ std::string GetCastTensorBuildOptionFromType(const std::string& buildOption, mio
case miopenHalf: return option += "2";
case miopenFloat: return option += "3";
case miopenBFloat16: return option += "4";
case miopenDouble:
// TODO
MIOPEN_THROW(miopenStatusBadParm, "miopenDouble data type not supported in cast tensor.");
case miopenInt8x4:
MIOPEN_THROW(miopenStatusBadParm, "miopenInt8x4 data type not supported in cast tensor.");
default: MIOPEN_THROW(miopenStatusBadParm, "Invalid data type in cast tensor desc.");
Expand Down
23 changes: 20 additions & 3 deletions src/reducetensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,12 +224,13 @@ inline int GetDataTypeSize(miopenDataType_t t)
{
case miopenHalf: return (2);
case miopenFloat: return (4);
case miopenDouble: return (8);
case miopenInt8: return (1);
case miopenInt8x4: return (4);
case miopenBFloat16: return (2);
case miopenInt32: return (4);
default:
MIOPEN_THROW("Only float, half, bfloat16, int8, int8x4 data type is supported.");
MIOPEN_THROW("Only float, half, double, bfloat16, int8, int8x4 data type is supported.");
break;
};
};
Expand All @@ -241,6 +242,7 @@ inline int GetDataTypeId(miopenDataType_t t)
case miopenHalf: return (static_cast<int>('H'));
case miopenFloat: return (static_cast<int>('F'));
case miopenBFloat16: return (static_cast<int>('B'));
case miopenDouble: return (static_cast<int>('D'));
case miopenInt8:
case miopenInt8x4:
case miopenInt32: return (static_cast<int>('O'));
Expand Down Expand Up @@ -568,6 +570,17 @@ void ReduceTensorDescriptor::ReduceTensor(const Handle& handle,
#if WORKAROUND_MIOPEN_ISSUE_557
if(StartsWith(handle.GetDeviceName(), "gfx10"))
param += " -DCK_USE_AMD_BUFFER_ADDRESSING=0 ";
else
{
if(srcDataType == miopenDouble)
// TODO: support from composable kernel utility for using AMD Buffer Addressing for
// double
param += " -DCK_USE_AMD_BUFFER_ADDRESSING=0 ";
};
#else
if(srcDataType == miopenDouble)
// TODO: support from composable kernel utility for using AMD Buffer Addressing for double
param += " -DCK_USE_AMD_BUFFER_ADDRESSING=0 ";
#endif

std::string program_name = "gridwise_generic_reduction.cpp";
Expand All @@ -590,8 +603,12 @@ void ReduceTensorDescriptor::ReduceTensor(const Handle& handle,
const std::vector<size_t> vgd_1 = {
static_cast<size_t>(gridSize * blockSize), size_t{1}, size_t{1}};

float alphaVal = *reinterpret_cast<const float*>(alpha);
float betaVal = *reinterpret_cast<const float*>(beta);
float alphaVal = (srcDataType == miopenDouble)
? static_cast<float>(*reinterpret_cast<const double*>(alpha))
: *reinterpret_cast<const float*>(alpha);
float betaVal = (srcDataType == miopenDouble)
? static_cast<float>(*reinterpret_cast<const double*>(beta))
: *reinterpret_cast<const float*>(beta);

handle.AddKernel(algo_name, network_config, program_name, kernel_name1, vld_1, vgd_1, param)(
alphaVal, A, betaVal, C, ws_buf1_global, ws_buf2_bytes_offset, indices);
Expand Down
3 changes: 3 additions & 0 deletions src/solver/conv_asm_1x1u.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,9 @@ bool PerformanceConfigConvAsm1x1U::IsValid(const ConvolutionContext& config) con

void PerformanceConfigConvAsm1x1U::HeuristicInit(const ConvolutionContext& config)
{
if(config.in_data_type == miopenDouble)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems like this change is not needed. IsApplicable() of this Solver should refuse configs with double.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

?

MIOPEN_THROW("Double data type is not supported by ConvAsm1x1U");

const auto elements_in_dword = 4 / GetTypeSize(config.in_data_type);
read_size = 4;
k_mult = 16;
Expand Down
3 changes: 2 additions & 1 deletion src/tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,8 @@ std::size_t TensorDescriptor::GetNumBytes() const
case miopenBFloat16:
case miopenHalf: typesize = 2; break;
case miopenInt32:
case miopenFloat: typesize = 4; break;
case miopenFloat: typesize = 4; break;
case miopenDouble: typesize = 8; break;
}
return typesize * this->GetElementSpace();
}
Expand Down
Loading