Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

[MXNET-11241] Avoid use of troublesome cudnnFind() results when grad_req='add' #11338

Merged
merged 4 commits into from
Jul 30, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 14 additions & 6 deletions src/operator/nn/convolution.cu
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ static CuDNNConvolutionOp<DType>& GetCuDNNConvOp(const ConvolutionParam& param,
int backward_compute_type,
const std::vector<TShape>& in_shape,
const std::vector<TShape>& out_shape,
const RunContext& rctx) {
const RunContext& rctx,
bool add_to_weight) {
#if DMLC_CXX11_THREAD_LOCAL
static thread_local std::unordered_map<ConvSignature,
std::shared_ptr<CuDNNConvolutionOp<DType> >,
Expand All @@ -57,14 +58,18 @@ static CuDNNConvolutionOp<DType>& GetCuDNNConvOp(const ConvolutionParam& param,
ndim += s.ndim();
for (auto &s : out_shape)
ndim += s.ndim();
key.Reserve(1 /* for forward_compute_type */ + 1 /* for backward_compute_type */
+ ndim + 1 /* for dev_id */);
key.Reserve(1 /* for forward_compute_type */ +
1 /* for backward_compute_type */ +
ndim /* for in and out shapes */ +
1 /* for dev_id */ +
1 /* for add_to_weight */);

key.AddSign(forward_compute_type);
key.AddSign(backward_compute_type);
key.AddSign(in_shape);
key.AddSign(out_shape);
key.AddSign(rctx.ctx.dev_id);
key.AddSign(add_to_weight ? 1 : 0);

auto it = ops.find(key);
if (it == ops.end()) {
Expand All @@ -74,7 +79,7 @@ static CuDNNConvolutionOp<DType>& GetCuDNNConvOp(const ConvolutionParam& param,
CHECK(ins_ret.second);
it = ins_ret.first;
it->second->Init(param, forward_compute_type, backward_compute_type, in_shape,
out_shape, rctx);
out_shape, rctx, add_to_weight);
}
return *it->second;
}
Expand Down Expand Up @@ -141,8 +146,10 @@ void ConvolutionCompute<gpu>(const nnvm::NodeAttrs& attrs,
std::vector<TShape> out_shape(1, outputs[0].shape_);
for (size_t i = 0; i < in_shape.size(); i++)
in_shape[i] = inputs[i].shape_;
// req[conv::kWeight] is only set for backward, so assume the typical 'write' for now.
auto add_to_weight = false;
CuDNNConvolutionOp<DType> &op = GetCuDNNConvOp<DType>(param,
compute_type, compute_type, in_shape, out_shape, ctx.run_ctx);
compute_type, compute_type, in_shape, out_shape, ctx.run_ctx, add_to_weight);
op.Forward(ctx, inputs, req, outputs);
}
})
Expand Down Expand Up @@ -220,8 +227,9 @@ void ConvolutionGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
std::vector<TShape> out_shape(1, out_grad.shape_);
for (size_t i = 0; i < in_shape.size(); i++)
in_shape[i] = in_data[i].shape_;
auto add_to_weight = req[conv::kWeight] == kAddTo;
CuDNNConvolutionOp<DType> &op = GetCuDNNConvOp<DType>(param,
compute_type, compute_type, in_shape, out_shape, ctx.run_ctx);
compute_type, compute_type, in_shape, out_shape, ctx.run_ctx, add_to_weight);
op.Backward(ctx, std::vector<TBlob>{out_grad}, in_data, req, in_grad);
}
})
Expand Down
11 changes: 8 additions & 3 deletions src/operator/nn/cudnn/cudnn_algoreg-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,13 @@ class CuDNNAlgoReg {
cudnnDataType_t cudnn_forward_compute_type,
cudnnDataType_t cudnn_backward_compute_type,
int sm_arch,
bool add_to_weight,
CuDNNAlgo<cudnnConvolutionFwdAlgo_t> *fwd,
CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> *bwd,
CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> *flt) {
CHECK(in_shape.size() == 2 || in_shape.size() == 3);
ParamKey key{param, in_shape[0], in_shape[1], out_shape[0], cudnn_data_type,
cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch};
cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch, add_to_weight};
std::lock_guard<std::mutex> guard(lock_);
auto i = reg_.find(key);
if (i != reg_.end()) {
Expand All @@ -96,12 +97,13 @@ class CuDNNAlgoReg {
cudnnDataType_t cudnn_forward_compute_type,
cudnnDataType_t cudnn_backward_compute_type,
int sm_arch,
bool add_to_weight,
const CuDNNAlgo<cudnnConvolutionFwdAlgo_t> &fwd,
const CuDNNAlgo<cudnnConvolutionBwdDataAlgo_t> &bwd,
const CuDNNAlgo<cudnnConvolutionBwdFilterAlgo_t> &flt) {
CHECK(in_shape.size() == 2 || in_shape.size() == 3);
ParamKey key{param, in_shape[0], in_shape[1], out_shape[0], cudnn_data_type,
cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch};
cudnn_forward_compute_type, cudnn_backward_compute_type, sm_arch, add_to_weight};
std::lock_guard<std::mutex> guard(lock_);
if (param.cudnn_tune.value() && reg_.size() % 50 == 0) {
LOG(INFO) << "Running performance tests to find the best convolution "
Expand Down Expand Up @@ -140,6 +142,7 @@ class CuDNNAlgoReg {
cudnnDataType_t cudnn_forward_compute_type;
cudnnDataType_t cudnn_backward_compute_type;
int sm_arch;
bool add_to_weight;

bool operator==(const ParamKey& other) const {
return this->param == other.param &&
Expand All @@ -149,7 +152,8 @@ class CuDNNAlgoReg {
this->cudnn_data_type == other.cudnn_data_type &&
this->cudnn_forward_compute_type == other.cudnn_forward_compute_type &&
this->cudnn_backward_compute_type == other.cudnn_backward_compute_type &&
this->sm_arch == other.sm_arch;
this->sm_arch == other.sm_arch &&
this->add_to_weight == other.add_to_weight;
}
};

Expand All @@ -164,6 +168,7 @@ class CuDNNAlgoReg {
ret = dmlc::HashCombine(ret, static_cast<int>(key.cudnn_forward_compute_type));
ret = dmlc::HashCombine(ret, static_cast<int>(key.cudnn_backward_compute_type));
ret = dmlc::HashCombine(ret, key.sm_arch);
ret = dmlc::HashCombine(ret, key.add_to_weight);
return ret;
}
};
Expand Down
36 changes: 32 additions & 4 deletions src/operator/nn/cudnn/cudnn_convolution-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,11 @@ class CuDNNConvolutionOp {
int backward_compute_type,
const std::vector<TShape>& in_shape,
const std::vector<TShape>& out_shape,
const RunContext& rctx) {
const RunContext& rctx,
bool add_to_weight) {
using namespace mshadow;
this->param_ = param;
this->add_to_weight_ = add_to_weight;
InitBufferForParam();
auto cudnn_forward_compute_type = convertToCuDNNDataType(forward_compute_type);
auto cudnn_backward_compute_type = convertToCuDNNDataType(backward_compute_type);
Expand Down Expand Up @@ -247,6 +249,7 @@ class CuDNNConvolutionOp {
gbias.dptr_));
}
if (req[conv::kWeight] != kNullOp) {
CHECK_EQ(add_to_weight_, req[conv::kWeight] == kAddTo);
CUDNN_CALL(cudnnConvolutionBackwardFilter(s->dnn_handle_,
&alpha,
in_desc_,
Expand Down Expand Up @@ -610,8 +613,8 @@ class CuDNNConvolutionOp {
cudnnDataType_t cudnn_backward_compute_type) {
if (!CuDNNConvAlgoReg::Get()->Find(param_, in_shape, out_shape, dtype_,
cudnn_forward_compute_type, cudnn_backward_compute_type,
SMArch(rctx.ctx.dev_id), &forward_algo_, &back_algo_,
&back_algo_w_)) {
SMArch(rctx.ctx.dev_id), add_to_weight_,
&forward_algo_, &back_algo_, &back_algo_w_)) {
mshadow::Stream<gpu> *s = rctx.get_stream<gpu>();
CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
size_t workspace_byte = static_cast<size_t>(param_.workspace * sizeof(DType));
Expand Down Expand Up @@ -645,6 +648,8 @@ class CuDNNConvolutionOp {
auto max_bwd_filt_algos = MaxBackwardFilterAlgos(s->dnn_handle_);
std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> bwd_filt_results(max_bwd_filt_algos);
int actual_bwd_filter_algos = 0;
// In cudnn v7.1.4, find() returned wgrad algos that could fail for large c if we
// were summing into the output (i.e. beta != 0). Get() returned OK algos though.
auto bwd_filter_algo_discoverer =
param_.cudnn_tune.value() == conv::kOff ? cudnnGetConvolutionBackwardFilterAlgorithm_v7
: cudnnFindConvolutionBackwardFilterAlgorithm;
Expand Down Expand Up @@ -792,14 +797,22 @@ class CuDNNConvolutionOp {
}
}
#endif // CUDNN_MAJOR < 7

// Fix for issue #11241
int cudnn_find_issue_max_features = 64 * 1024;
if (add_to_weight_ && Features(in_shape[conv::kData]) >= cudnn_find_issue_max_features) {
this->back_algo_w_.Set(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, true);
}

// An algo specification by the user may be cached here, but another
// convolution will match only if identically specified.
// We're caching results of *Get* as well as *Find*, but these records
// will be held distinctly because param_.cudnn_tune is part of the key.
CuDNNConvAlgoReg::Get()->Register(param_, in_shape, out_shape, dtype_,
cudnn_forward_compute_type,
cudnn_backward_compute_type,
SMArch(rctx.ctx.dev_id), this->forward_algo_,
SMArch(rctx.ctx.dev_id), this->add_to_weight_,
this->forward_algo_,
this->back_algo_, this->back_algo_w_);
}
// If we're allowing Tensor Core variants of the algos to be considered in
Expand Down Expand Up @@ -921,6 +934,19 @@ class CuDNNConvolutionOp {
return tensor.MSize() * sizeof(DType);
}

// Given a tensor shape of this operation, return the number of features 'c'
// (the channel count). ConvertLayout() maps the stored layout to the canonical
// channel-second form (NCW/NCHW/NCDHW), so index [1] is always the channel dim.
// Used to detect large-c cases where cudnnFind() may return a faulty wgrad
// algo when accumulating into the gradient (issue #11241).
int64_t Features(const TShape &dshape) {
int c = 0;
switch (dshape.ndim()) {
// 1-D convolution data: N x C x W
case 3: c = ConvertLayout(dshape.get<3>(), param_.layout.value(), kNCW)[1]; break;
// 2-D convolution data: N x C x H x W
case 4: c = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW)[1]; break;
// 3-D convolution data: N x C x D x H x W
case 5: c = ConvertLayout(dshape.get<5>(), param_.layout.value(), kNCDHW)[1]; break;
default:
LOG(FATAL) << "Unexpected convolution data dimension " << dshape.ndim();
}
return c;
}

std::vector<int> param_stride_;
std::vector<int> param_dilate_;
std::vector<int> param_pad_;
Expand Down Expand Up @@ -953,6 +979,8 @@ class CuDNNConvolutionOp {
cudnnTensorFormat_t format_;
// Allow TensorCore algo policy
bool cudnn_tensor_core_;
// Is req[kWeight] == conv::kAddTo ?
bool add_to_weight_;
ConvolutionParam param_;
};
#endif // __CUDACC__ && CUDNN
Expand Down
38 changes: 34 additions & 4 deletions src/operator/nn/cudnn/cudnn_deconvolution-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,11 @@ class CuDNNDeconvolutionOp {
int backward_compute_type,
const std::vector<TShape>& in_shape,
const std::vector<TShape>& out_shape,
const RunContext& rctx) {
const RunContext& rctx,
bool add_to_weight) {
using namespace mshadow;
this->param_ = param;
this->add_to_weight_ = add_to_weight;
InitBufferForParam();
auto cudnn_forward_compute_type = convertToCuDNNDataType(forward_compute_type);
auto cudnn_backward_compute_type = convertToCuDNNDataType(backward_compute_type);
Expand Down Expand Up @@ -257,6 +259,7 @@ class CuDNNDeconvolutionOp {
filter_desc_,
gwmat.dptr_ + weight_offset_ * g));
#elif CUDNN_MAJOR >= 5
CHECK_EQ(add_to_weight_, req[deconv::kWeight] == kAddTo);
CUDNN_CALL(cudnnConvolutionBackwardFilter(
s->dnn_handle_,
&alpha,
Expand Down Expand Up @@ -543,8 +546,8 @@ class CuDNNDeconvolutionOp {
if (!CuDNNDeconvAlgoReg::Get()->Find(param_, in_shape, out_shape, dtype_,
cudnn_forward_compute_type,
cudnn_backward_compute_type,
SMArch(rctx.ctx.dev_id), &forward_algo_,
&back_algo_, &back_algo_w_)) {
SMArch(rctx.ctx.dev_id), add_to_weight_,
&forward_algo_, &back_algo_, &back_algo_w_)) {
mshadow::Stream <gpu> *s = rctx.get_stream<gpu>();
CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream<gpu>::OwnHandle);
size_t workspace_byte = static_cast<size_t>(param_.workspace * sizeof(DType));
Expand Down Expand Up @@ -578,6 +581,8 @@ class CuDNNDeconvolutionOp {
auto max_bwd_filt_algos = MaxBackwardFilterAlgos(s->dnn_handle_);
std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> bwd_filt_results(max_bwd_filt_algos);
int actual_bwd_filter_algos = 0;
// In cudnn v7.1.4, find() returned wgrad algos that could fail for large c if we
// were summing into the output (i.e. beta != 0). Get() returned OK algos though.
auto bwd_filter_algo_discoverer =
param_.cudnn_tune.value() == conv::kOff ? cudnnGetConvolutionBackwardFilterAlgorithm_v7
: cudnnFindConvolutionBackwardFilterAlgorithm;
Expand Down Expand Up @@ -728,14 +733,23 @@ class CuDNNDeconvolutionOp {
}
}
#endif // CUDNN_MAJOR < 7

// Fix for issue #11241
int cudnn_find_issue_max_features = 64 * 1024;
// With deconvolution, the algo sensitivity is to a large number of output features
if (add_to_weight_ && Features(out_shape[deconv::kOut]) >= cudnn_find_issue_max_features) {
this->back_algo_w_.Set(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, true);
}

// An algo specification by the user may be cached here, but another
// convolution will match only if identically specified.
// We're caching results of *Get* as well as *Find*, but these records
// will be held distinctly because param_.cudnn_tune is part of the key.
CuDNNDeconvAlgoReg::Get()->Register(param_, in_shape, out_shape, dtype_,
cudnn_forward_compute_type,
cudnn_backward_compute_type,
SMArch(rctx.ctx.dev_id), this->forward_algo_,
SMArch(rctx.ctx.dev_id), this->add_to_weight_,
this->forward_algo_,
this->back_algo_, this->back_algo_w_);
}
// If we're allowing Tensor Core variants of the algos to be considered in
Expand Down Expand Up @@ -866,6 +880,20 @@ class CuDNNDeconvolutionOp {
return tensor.MSize() * sizeof(DType);
}


// Given a tensor shape of this operation, return the number of features 'c'
// (the channel count). ConvertLayout() maps the stored layout to the canonical
// channel-second form (NCW/NCHW/NCDHW), so index [1] is always the channel dim.
// For deconvolution the large-feature check is applied to the *output* shape;
// used to work around faulty cudnnFind() wgrad algos (issue #11241).
int64_t Features(const TShape &dshape) {
int c = 0;
switch (dshape.ndim()) {
// 1-D deconvolution data: N x C x W
case 3: c = ConvertLayout(dshape.get<3>(), param_.layout.value(), kNCW)[1]; break;
// 2-D deconvolution data: N x C x H x W
case 4: c = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW)[1]; break;
// 3-D deconvolution data: N x C x D x H x W
case 5: c = ConvertLayout(dshape.get<5>(), param_.layout.value(), kNCDHW)[1]; break;
default:
LOG(FATAL) << "Unexpected deconvolution data dimension " << dshape.ndim();
}
return c;
}

std::vector<int> param_stride_;
std::vector<int> param_dilate_;

Expand Down Expand Up @@ -912,6 +940,8 @@ class CuDNNDeconvolutionOp {
cudnnTensorFormat_t format_;
// Allow TensorCore algo policy
bool cudnn_tensor_core_;
// Is req[kWeight] == deconv::kAddTo ?
bool add_to_weight_;
DeconvolutionParam param_;
};
#endif // CUDNN
Expand Down
20 changes: 14 additions & 6 deletions src/operator/nn/deconvolution.cu
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ static CuDNNDeconvolutionOp<DType> &GetCuDNNDeconvOp(const DeconvolutionParam& p
int backward_compute_type,
const std::vector<TShape>& in_shape,
const std::vector<TShape>& out_shape,
const RunContext& rctx) {
const RunContext& rctx,
bool add_to_weight) {
#if DMLC_CXX11_THREAD_LOCAL
static thread_local std::unordered_map<DeconvSignature,
std::shared_ptr<CuDNNDeconvolutionOp<DType> >,
Expand All @@ -55,14 +56,18 @@ static CuDNNDeconvolutionOp<DType> &GetCuDNNDeconvOp(const DeconvolutionParam& p
ndim += s.ndim();
for (auto &s : out_shape)
ndim += s.ndim();
key.Reserve(1 /* for forward_compute_type */ + 1 /* for backward_compute_type */
+ ndim + 1 /* for dev_id */);
key.Reserve(1 /* for forward_compute_type */ +
1 /* for backward_compute_type */ +
ndim /* for in and out shapes */ +
1 /* for dev_id */ +
1 /* for add_to_weight */);

key.AddSign(forward_compute_type);
key.AddSign(backward_compute_type);
key.AddSign(in_shape);
key.AddSign(out_shape);
key.AddSign(rctx.ctx.dev_id);
key.AddSign(add_to_weight ? 1 : 0);

auto it = ops.find(key);
if (it == ops.end()) {
Expand All @@ -72,7 +77,7 @@ static CuDNNDeconvolutionOp<DType> &GetCuDNNDeconvOp(const DeconvolutionParam& p
CHECK(ins_ret.second);
it = ins_ret.first;
it->second->Init(param, forward_compute_type, backward_compute_type, in_shape,
out_shape, rctx);
out_shape, rctx, add_to_weight);
}
return *it->second;
}
Expand Down Expand Up @@ -109,8 +114,10 @@ void DeconvolutionCompute<gpu>(const nnvm::NodeAttrs& attrs,
for (size_t i = 0; i < in_shape.size(); i++) {
in_shape[i] = inputs[i].shape_;
}
// req[deconv::kWeight] is only set for backward, so assume the typical 'write' for now.
auto add_to_weight = false;
GetCuDNNDeconvOp<DType>(param, compute_type, compute_type,
in_shape, out_shape, ctx.run_ctx).Forward(ctx, inputs, req, outputs);
in_shape, out_shape, ctx.run_ctx, add_to_weight).Forward(ctx, inputs, req, outputs);
}
})
#else
Expand Down Expand Up @@ -156,8 +163,9 @@ void DeconvolutionGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
for (size_t i = 0; i < in_shape.size(); i++) {
in_shape[i] = in_data[i].shape_;
}
auto add_to_weight = req[deconv::kWeight] == kAddTo;
GetCuDNNDeconvOp<DType>(param, compute_type, compute_type,
in_shape, out_shape, ctx.run_ctx).Backward(ctx,
in_shape, out_shape, ctx.run_ctx, add_to_weight).Backward(ctx,
std::vector<TBlob>{out_grad}, in_data, req, in_grad);
}
})
Expand Down
2 changes: 1 addition & 1 deletion src/operator/operator_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -494,7 +494,7 @@ inline void LogUnimplementedOp(const nnvm::NodeAttrs& attrs,
}

class OpSignature {
std::vector<int> eles;
std::vector<int64_t> eles;
uint64_t hash;

public:
Expand Down
Loading