diff --git a/HugeCTR/src/hashtable/nv_hashtable.cu b/HugeCTR/src/hashtable/nv_hashtable.cu index d576297749..6dde4d7d38 100644 --- a/HugeCTR/src/hashtable/nv_hashtable.cu +++ b/HugeCTR/src/hashtable/nv_hashtable.cu @@ -271,7 +271,7 @@ void HashTable::dump(KeyType* d_key, ValType* d_val, size_t* d cudaStream_t stream) const { size_t search_length = static_cast(capacity_ / LOAD_FACTOR); // Before we call the kernel, set the global counter to 0 - HCTR_LIB_THROW(cudaMemset(d_dump_counter, 0, sizeof(size_t))); + HCTR_LIB_THROW(cudaMemsetAsync(d_dump_counter, 0, sizeof(size_t), stream)); // grid size according to the searching length. const int grid_size = (search_length - 1) / BLOCK_SIZE_ + 1; // dump_kernel: dump bucket container_[0, search_length) to d_key and d_val, and report