cuDNN non-persistant bidirectional RNN dgrad sync fix #16391

Merged: 6 commits, Oct 10, 2019
Changes from all commits
83 changes: 54 additions & 29 deletions src/operator/rnn-inl.h
@@ -493,6 +493,7 @@ class RNNOp {

CUDNN_CALL(cudnnCreateRNNDescriptor(&rnn_desc_));
CUDNN_CALL(cudnnCreateDropoutDescriptor(&dropout_desc_));
CUDA_CALL(cudaEventCreateWithFlags(&dgrad_sync_event_, cudaEventDisableTiming));

#if MXNET_USE_CUDNN_GE_7200
CUDNN_CALL(cudnnCreateRNNDataDescriptor(&x_data_desc_));
@@ -537,6 +538,7 @@ class RNNOp {
CUDNN_CALL(cudnnDestroyFilterDescriptor(dw_desc_));
CUDNN_CALL(cudnnDestroyRNNDescriptor(rnn_desc_));
CUDNN_CALL(cudnnDestroyDropoutDescriptor(dropout_desc_));
CUDA_CALL(cudaEventDestroy(dgrad_sync_event_));

if (init_cudnn_) {
for (size_t i = 0; i < x_desc_vec_.size(); ++i) {
@@ -1066,20 +1068,23 @@ class RNNOp {
workspace_byte_,
reserve_space_.dptr,
reserve_space_byte_));
CUDNN_CALL(cudnnRNNBackwardWeightsEx(s->dnn_handle_,
rnn_desc_,
x_data_desc_,
x.dptr_,
hx_desc_,
hx.dptr_,
y_data_desc_,
y.dptr_,
temp_space.dptr_,
workspace_byte_,
dw_desc_,
dw.dptr_,
reserve_space_.dptr,
reserve_space_byte_));
SyncDgrad();
if (req[rnn_enum::kParams] != kNullOp) {
CUDNN_CALL(cudnnRNNBackwardWeightsEx(s->dnn_handle_,
rnn_desc_,
x_data_desc_,
x.dptr_,
hx_desc_,
hx.dptr_,
y_data_desc_,
y.dptr_,
temp_space.dptr_,
workspace_byte_,
dw_desc_,
dw.dptr_,
reserve_space_.dptr,
reserve_space_byte_));
}
#else
CUDNN_CALL(cudnnRNNBackwardData(s->dnn_handle_,
rnn_desc_,
@@ -1108,21 +1113,24 @@ class RNNOp {
workspace_byte_,
reserve_space_.dptr,
reserve_space_byte_));
CUDNN_CALL(cudnnRNNBackwardWeights(s->dnn_handle_,
rnn_desc_,
param_.seq_length_,
x_desc_vec_.data(),
x.dptr_,
hx_desc_,
hx.dptr_,
y_desc_vec_.data(),
y.dptr_,
temp_space.dptr_,
workspace_byte_,
dw_desc_,
dw.dptr_,
reserve_space_.dptr,
reserve_space_byte_));
SyncDgrad();
if (req[rnn_enum::kParams] != kNullOp) {
CUDNN_CALL(cudnnRNNBackwardWeights(s->dnn_handle_,
rnn_desc_,
param_.seq_length_,
x_desc_vec_.data(),
x.dptr_,
hx_desc_,
hx.dptr_,
y_desc_vec_.data(),
y.dptr_,
temp_space.dptr_,
workspace_byte_,
dw_desc_,
dw.dptr_,
reserve_space_.dptr,
reserve_space_byte_));
}
#endif // MXNET_USE_CUDNN_GE_7200
#endif // MXNET_USE_CUDNN == 1 && defined(__CUDACC__)

@@ -1365,6 +1373,7 @@ class RNNOp {
// RNN descriptors
cudnnDataType_t dtype_with_fallback_;
cudnnRNNAlgo_t rnn_algo = CUDNN_RNN_ALGO_STANDARD;
dgrad_sync_needed_ = (rnn_algo == CUDNN_RNN_ALGO_STANDARD) && param_.bidirectional;
// On archs 50 and 52 (Maxwell), the GPU doesn't support native fp16 compute.
// Before cuDNN 7.5.0, when running fp16, cuDNN fell back to fp32 under the hood on Maxwell.
// That's no longer the case beginning with 7.5.0, hence the explicit fallback added here.
@@ -1484,6 +1493,20 @@ class RNNOp {
}
#endif // MXNET_USE_CUDNN == 1 && defined(__CUDACC__)
}

#if MXNET_USE_CUDNN == 1 && defined(__CUDACC__)
// cuDNN versions up to and including v7.6.4 did not sync the last dgrad kernel back to the main
// cuDNN handle's stream (non-persistent algo, bidirectional only). This could result in silent
// non-deterministic failures with very low probability, seen more often when wgrad is bypassed.
inline void SyncDgrad() {
if (CUDNN_VERSION <= 7604 && dgrad_sync_needed_) {
// Without blocking the CPU, create a synchronization point across all current GPU activity. No
// need to call cudaStreamWaitEvent; a cudaEventRecord on the legacy default stream suffices.
CUDA_CALL(cudaEventRecord(dgrad_sync_event_, cudaStreamLegacy));
Inline review comment from @haojin2 (Contributor), Oct 20, 2019:
Hi @DickJC123, I'm encountering a cudaErrorInvalidResourceHandle error here when I try to run this notebook and this notebook in the Dive into Deep Learning textbook. Could you help with a fix for that?

}
}
#endif // MXNET_USE_CUDNN == 1 && defined(__CUDACC__)

#if MXNET_USE_CUDNN == 1
cudnnDataType_t dtype_;
bool init_cudnn_;
@@ -1511,6 +1534,8 @@ class RNNOp {
bool cudnn_tensor_core_;

cudnnTensorFormat_t format_;
cudaEvent_t dgrad_sync_event_;
bool dgrad_sync_needed_ = false;
#endif // MXNET_USE_CUDNN
bool init_space_, temp_init_space_;
size_t reserve_cpu_space_size_, temp_cpu_space_size_;
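The event-record trick used by SyncDgrad() above relies on legacy-default-stream semantics: an operation submitted to cudaStreamLegacy first waits for all prior work on ordinary (blocking) streams, and all subsequent work on those streams waits for it, so a single cudaEventRecord acts as a device-wide ordering point without any cudaStreamWaitEvent. Below is a minimal standalone CUDA sketch of that pattern; it is not part of the PR, and the producer/consumer kernels, stream names, and CHECK_CUDA macro are invented for illustration only.

// Sketch: ordering work across two blocking streams via an event record on the
// legacy default stream, the same mechanism SyncDgrad() relies on.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CHECK_CUDA(call)                                           \
  do {                                                             \
    cudaError_t err_ = (call);                                     \
    if (err_ != cudaSuccess) {                                     \
      std::printf("CUDA error %s at %s:%d\n",                      \
                  cudaGetErrorString(err_), __FILE__, __LINE__);   \
      std::exit(1);                                                \
    }                                                              \
  } while (0)

// Stands in for the stray dgrad kernel that cuDNN leaves unsynchronized.
__global__ void producer(float* buf, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) buf[i] = 1.0f;
}

// Stands in for the next operation issued on the cuDNN handle's stream.
__global__ void consumer(float* buf, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) buf[i] += 1.0f;
}

int main() {
  const int n = 1 << 20;
  float* buf = nullptr;
  CHECK_CUDA(cudaMalloc(&buf, n * sizeof(float)));

  // Two ordinary (blocking) streams, standing in for cuDNN's internal stream
  // and the handle's stream.
  cudaStream_t internal_stream, handle_stream;
  CHECK_CUDA(cudaStreamCreate(&internal_stream));
  CHECK_CUDA(cudaStreamCreate(&handle_stream));

  cudaEvent_t sync_event;
  CHECK_CUDA(cudaEventCreateWithFlags(&sync_event, cudaEventDisableTiming));

  producer<<<(n + 255) / 256, 256, 0, internal_stream>>>(buf, n);

  // Recording on cudaStreamLegacy waits for prior work on all blocking streams
  // and makes their subsequent work wait for the record, so the consumer below
  // is ordered after the producer with no explicit cudaStreamWaitEvent.
  CHECK_CUDA(cudaEventRecord(sync_event, cudaStreamLegacy));

  consumer<<<(n + 255) / 256, 256, 0, handle_stream>>>(buf, n);

  CHECK_CUDA(cudaStreamSynchronize(handle_stream));
  CHECK_CUDA(cudaEventDestroy(sync_event));
  CHECK_CUDA(cudaStreamDestroy(internal_stream));
  CHECK_CUDA(cudaStreamDestroy(handle_stream));
  CHECK_CUDA(cudaFree(buf));
  return 0;
}

Note that streams created with cudaStreamNonBlocking opt out of this implicit synchronization, so the trick only orders work on ordinary blocking streams.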
7 changes: 4 additions & 3 deletions tests/python/unittest/test_operator.py
@@ -68,11 +68,11 @@ def check_rnn_consistency(cell1, cell2, T, N, I, H, grad_req, rtol=1e-2, atol=1e
dy = mx.random.uniform(shape=mod1.get_outputs()[0].shape)
mod1.backward(out_grads=[dy])
mod2.backward(out_grads=[dy])
if grad_req != 'null':
assert_allclose(mod1.get_input_grads()[0].asnumpy(), mod2.get_input_grads()[0].asnumpy(), rtol=rtol, atol=atol)
else:
if type(grad_req) is dict and grad_req['data'] == 'null' or grad_req == 'null':
assert(mod1.get_input_grads()[0] == None)
assert(mod2.get_input_grads()[0] == None)
else:
assert_allclose(mod1.get_input_grads()[0].asnumpy(), mod2.get_input_grads()[0].asnumpy(), rtol=rtol, atol=atol)


@with_seed()
@@ -149,6 +149,7 @@ def test_lstm_bidirectional():
check_rnn_consistency(fused, stack, T, N, I, H, 'write')
check_rnn_consistency(fused, stack, T, N, I, H, 'add')
check_rnn_consistency(fused, stack, T, N, I, H, 'null')
check_rnn_consistency(fused, stack, T, N, I, H, {'data': 'add', 'parameters': 'null'})

@with_seed()
@assert_raises_cudnn_not_satisfied(min_version='5.1.10')