Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
Merge pull request #476 from canonizer/onesweep-debug-sync
Browse files Browse the repository at this point in the history
debug_synchronous support in Onesweep sort
  • Loading branch information
alliepiper authored May 18, 2022
2 parents 5571258 + a46df7e commit f80aa78
Showing 1 changed file with 55 additions and 2 deletions.
57 changes: 55 additions & 2 deletions cub/device/dispatch/dispatch_radix_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1333,6 +1333,17 @@ struct DispatchRadixSort :
if (CubDebug(error = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&histo_blocks_per_sm, histogram_kernel, HISTO_BLOCK_THREADS, 0))) break;

// log histogram_kernel configuration
if (debug_synchronous)
{
_CubLog("Invoking histogram_kernel<<<%d, %d, 0, %lld>>>(), %d items per iteration, "
"%d SM occupancy, bit_grain %d\n",
histo_blocks_per_sm * num_sms, HISTO_BLOCK_THREADS,
reinterpret_cast<long long>(stream),
ActivePolicyT::HistogramPolicy::ITEMS_PER_THREAD, histo_blocks_per_sm,
ActivePolicyT::HistogramPolicy::RADIX_BITS);
}

error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
histo_blocks_per_sm * num_sms, HISTO_BLOCK_THREADS, 0, stream
).doit(histogram_kernel,
Expand All @@ -1341,9 +1352,25 @@ struct DispatchRadixSort :
{
break;
}

if (debug_synchronous)
{
if (CubDebug(error = SyncStream(stream)))
{
break;
}
}

// exclusive sums to determine starts
const int SCAN_BLOCK_THREADS = ActivePolicyT::ExclusiveSumPolicy::BLOCK_THREADS;

// log exclusive_sum_kernel configuration
if (debug_synchronous)
{
_CubLog("Invoking exclusive_sum_kernel<<<%d, %d, 0, %lld>>>(), bit_grain %d\n",
num_passes, SCAN_BLOCK_THREADS, reinterpret_cast<long long>(stream),
ActivePolicyT::ExclusiveSumPolicy::RADIX_BITS);
}

error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
num_passes, SCAN_BLOCK_THREADS, 0, stream
).doit(DeviceRadixSortExclusiveSumKernel<MaxPolicyT, OffsetT>,
Expand All @@ -1352,6 +1379,13 @@ struct DispatchRadixSort :
{
break;
}
if (debug_synchronous)
{
if (CubDebug(error = SyncStream(stream)))
{
break;
}
}

// use the other buffer if no overwrite is allowed
KeyT* d_keys_tmp = d_keys.Alternate();
Expand All @@ -1377,6 +1411,17 @@ struct DispatchRadixSort :
if (CubDebug(error = cudaMemsetAsync(
d_lookback, 0, num_blocks * RADIX_DIGITS * sizeof(AtomicOffsetT),
stream))) break;

// log onesweep_kernel configuration
if (debug_synchronous)
{
_CubLog("Invoking onesweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, "
"current bit %d, bit_grain %d, portion %d/%d\n",
num_blocks, ONESWEEP_BLOCK_THREADS, reinterpret_cast<long long>(stream),
ActivePolicyT::OnesweepPolicy::ITEMS_PER_THREAD, current_bit,
num_bits, static_cast<int>(portion), static_cast<int>(num_portions));
}

auto onesweep_kernel = DeviceRadixSortOnesweepKernel<
MaxPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT, PortionOffsetT>;

Expand All @@ -1396,8 +1441,16 @@ struct DispatchRadixSort :
{
break;
}

if (debug_synchronous)
{
if (CubDebug(error = SyncStream(stream)))
{
break;
}
}
}

// use the temporary buffers if no overwrite is allowed
if (!is_overwrite_okay && pass == 0)
{
Expand Down

0 comments on commit f80aa78

Please sign in to comment.