diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh index 39a6441a0c..37cd74d84c 100644 --- a/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/device/dispatch/dispatch_radix_sort.cuh @@ -1333,6 +1333,17 @@ struct DispatchRadixSort : if (CubDebug(error = cudaOccupancyMaxActiveBlocksPerMultiprocessor( &histo_blocks_per_sm, histogram_kernel, HISTO_BLOCK_THREADS, 0))) break; + // log histogram_kernel configuration + if (debug_synchronous) + { + _CubLog("Invoking histogram_kernel<<<%d, %d, 0, %lld>>>(), %d items per iteration, " + "%d SM occupancy, bit_grain %d\n", + histo_blocks_per_sm * num_sms, HISTO_BLOCK_THREADS, + reinterpret_cast(stream), + ActivePolicyT::HistogramPolicy::ITEMS_PER_THREAD, histo_blocks_per_sm, + ActivePolicyT::HistogramPolicy::RADIX_BITS); + } + error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( histo_blocks_per_sm * num_sms, HISTO_BLOCK_THREADS, 0, stream ).doit(histogram_kernel, @@ -1341,9 +1352,25 @@ struct DispatchRadixSort : { break; } - + if (debug_synchronous) + { + if (CubDebug(error = SyncStream(stream))) + { + break; + } + } + // exclusive sums to determine starts const int SCAN_BLOCK_THREADS = ActivePolicyT::ExclusiveSumPolicy::BLOCK_THREADS; + + // log exclusive_sum_kernel configuration + if (debug_synchronous) + { + _CubLog("Invoking exclusive_sum_kernel<<<%d, %d, 0, %lld>>>(), bit_grain %d\n", + num_passes, SCAN_BLOCK_THREADS, reinterpret_cast(stream), + ActivePolicyT::ExclusiveSumPolicy::RADIX_BITS); + } + error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( num_passes, SCAN_BLOCK_THREADS, 0, stream ).doit(DeviceRadixSortExclusiveSumKernel, @@ -1352,6 +1379,13 @@ struct DispatchRadixSort : { break; } + if (debug_synchronous) + { + if (CubDebug(error = SyncStream(stream))) + { + break; + } + } // use the other buffer if no overwrite is allowed KeyT* d_keys_tmp = d_keys.Alternate(); @@ -1377,6 +1411,17 @@ struct DispatchRadixSort : if (CubDebug(error = cudaMemsetAsync( d_lookback, 0, num_blocks * RADIX_DIGITS * sizeof(AtomicOffsetT), stream))) break; + + // log onesweep_kernel configuration + if (debug_synchronous) + { + _CubLog("Invoking onesweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, " + "current bit %d, bit_grain %d, portion %d/%d\n", + num_blocks, ONESWEEP_BLOCK_THREADS, reinterpret_cast(stream), + ActivePolicyT::OnesweepPolicy::ITEMS_PER_THREAD, current_bit, + num_bits, static_cast(portion), static_cast(num_portions)); + } + auto onesweep_kernel = DeviceRadixSortOnesweepKernel< MaxPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT, PortionOffsetT>; @@ -1396,8 +1441,16 @@ struct DispatchRadixSort : { break; } + + if (debug_synchronous) + { + if (CubDebug(error = SyncStream(stream))) + { + break; + } + } } - + // use the temporary buffers if no overwrite is allowed if (!is_overwrite_okay && pass == 0) {