[2.3.x] Backport benchmarking PRs #1168

Merged: 17 commits, Dec 1, 2023. Changes shown from 15 commits.
8 changes: 8 additions & 0 deletions benchmarks/scripts/cccl/bench/bench.py
@@ -635,6 +635,14 @@ def do_run(self, ct_point, rt_values, timeout, is_search=True):
cmd.append("--min-samples")
cmd.append("70")

# Unlike noise, the minimal benchmarking time is not directly related to variance.
# The default minimal time is 0.5 seconds. For CI we want to reduce it to 0.1 seconds
# because we have a limited time budget. A smaller minimal time does not affect the
# stability of the sample distribution median in a deterministic way: for small problem
# sizes, 0.1s leads to smaller variation than 0.5s, while for other workloads 0.5s leads to smaller variance.
cmd.append("--min-time")
cmd.append("0.1")

# NVBench is currently broken for multiple GPUs, use `CUDA_VISIBLE_DEVICES`
cmd.append("-d")
cmd.append("0")
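For orientation, the sketch below shows how these CI settings end up on an nvbench command line. It is a minimal, self-contained illustration, not the real `do_run`: the helper name `build_ci_bench_cmd` and the binary path are assumptions, while the `--min-samples`, `--min-time`, and `-d` flags are exactly the ones appended in the hunk above.

```python
def build_ci_bench_cmd(bench_path):
    """Assemble an nvbench invocation with the CI-oriented settings above (sketch)."""
    cmd = [bench_path]

    # Keep at least 70 samples so the median estimate stays meaningful.
    cmd.append("--min-samples")
    cmd.append("70")

    # Reduced minimal benchmarking time to fit the CI time budget (see the comment above).
    cmd.append("--min-time")
    cmd.append("0.1")

    # NVBench is currently broken for multiple GPUs; pin device 0 explicitly.
    cmd.append("-d")
    cmd.append("0")
    return cmd


# Hypothetical benchmark binary path, for illustration only:
print(" ".join(build_ci_bench_cmd("./bin/cub.bench.scan.exclusive.base")))
```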
36 changes: 29 additions & 7 deletions benchmarks/scripts/cccl/bench/search.py
@@ -47,19 +47,41 @@ def parse_arguments():
type=str, help="Parameter in the format `Param=Value`.")
parser.add_argument(
'--list-benches', action=argparse.BooleanOptionalAction, help="Show available benchmarks.")
parser.add_argument('--num-shards', type=int, default=1, help='Split benchmarks into M pieces and only run one')
parser.add_argument('--run-shard', type=int, default=0, help='Run shard N / M of benchmarks')
parser.add_argument('-P0', action=argparse.BooleanOptionalAction, help="Run P0 benchmarks (overrides -R)")
return parser.parse_args()


def run_benches(benchmarks, sub_space, regex, seeker):
pattern = re.compile(regex)

for algname in benchmarks:
if pattern.match(algname):
def run_benches(algnames, sub_space, seeker):
for algname in algnames:
try:
bench = BaseBench(algname)
ct_space = bench.ct_workload_space(sub_space)
rt_values = bench.rt_axes_values(sub_space)
seeker(algname, ct_space, rt_values)
except Exception as e:
print("#### ERROR exception occurred while running {}: '{}'".format(algname, e))


def filter_benchmarks(benchmarks, args):
if args.run_shard >= args.num_shards:
raise ValueError('run-shard must be less than num-shards')

R = args.R
if args.P0:
R = '^(?!.*segmented).*(scan|reduce|select|sort).*'

pattern = re.compile(R)
algnames = list(filter(lambda x: pattern.match(x), benchmarks.keys()))
algnames.sort()

if args.num_shards > 1:
algnames = np.array_split(algnames, args.num_shards)[args.run_shard].tolist()
return algnames

return algnames


def search(seeker):
args = parse_arguments()
@@ -79,8 +101,8 @@ def search(seeker):
if args.list_benches:
list_benches()
return

run_benches(config.benchmarks, workload_sub_space, args.R, seeker)
run_benches(filter_benchmarks(config.benchmarks, args), workload_sub_space, seeker)


class MedianCenterEstimator:
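The new `--num-shards`, `--run-shard`, and `-P0` options above select and split the benchmark set. The following standalone sketch reproduces that logic outside the harness; the helper name `filter_and_shard` and the sample benchmark names are made up, while the P0 regex and the `np.array_split` sharding come from the diff above.

```python
import re

import numpy as np


def filter_and_shard(benchmark_names, regex, num_shards=1, run_shard=0):
    """Filter benchmark names by regex, then keep only the requested shard (sketch)."""
    if run_shard >= num_shards:
        raise ValueError('run-shard must be less than num-shards')

    pattern = re.compile(regex)
    names = sorted(name for name in benchmark_names if pattern.match(name))

    if num_shards > 1:
        # np.array_split tolerates uneven splits, e.g. 10 names into 3 shards -> 4/3/3.
        names = np.array_split(names, num_shards)[run_shard].tolist()
    return names


# Hypothetical benchmark names; the P0 regex skips segmented algorithms.
benches = ['cub.bench.scan.exclusive.base', 'cub.bench.select.if', 'cub.bench.segmented_sort.keys']
p0 = '^(?!.*segmented).*(scan|reduce|select|sort).*'
print(filter_and_shard(benches, p0, num_shards=2, run_shard=0))
# ['cub.bench.scan.exclusive.base']
```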
32 changes: 27 additions & 5 deletions benchmarks/scripts/run.py
@@ -6,33 +6,55 @@
import cccl.bench


def elapsed_time_look_good(x):
def elapsed_time_looks_good(x):
if isinstance(x, float):
if math.isfinite(x):
return True
return False


def problem_size_looks_large_enough(elements):
# Small problem sizes do not utilize the entire GPU.
# Benchmarking small problem sizes in environments where we do not control
# the distribution comparison (e.g. CI) is not useful because of stability issues.
return elements.isdigit() and int(elements) > 20


def filter_runtime_workloads_for_ci(rt_values):
for subbench in rt_values:
for axis in rt_values[subbench]:
if axis.startswith('Elements') and axis.endswith('[pow2]'):
rt_values[subbench][axis] = list(filter(problem_size_looks_large_enough, rt_values[subbench][axis]))

return rt_values


class BaseRunner:
def __init__(self):
self.estimator = cccl.bench.MedianCenterEstimator()

def __call__(self, algname, ct_workload_space, rt_values):
failure_occured = False
rt_values = filter_runtime_workloads_for_ci(rt_values)

for ct_workload in ct_workload_space:
bench = cccl.bench.BaseBench(algname)
if bench.build():
if bench.build(): # might throw
results = bench.run(ct_workload, rt_values, self.estimator, False)
for subbench in results:
for point in results[subbench]:
bench_name = "{}.{}-{}".format(bench.algorithm_name(), subbench, point)
bench_name = bench_name.replace(' ', '___')
bench_name = "".join(c if c.isalnum() else "_" for c in bench_name)
elapsed_time = results[subbench][point]
if elapsed_time_look_good(elapsed_time):
if elapsed_time_looks_good(elapsed_time):
print("&&&& PERF {} {} -sec".format(bench_name, elapsed_time))
else:
print("&&&& FAILED bench")
sys.exit(-1)
failure_occured = True
print("&&&& FAILED {}".format(algname))

if failure_occured:
sys.exit(1)


def main():
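To see what `filter_runtime_workloads_for_ci` actually does to the runtime axes, here is a self-contained sketch; the nested subbench/axis layout and the `> 20` power-of-two cutoff mirror the code above, but the dictionary contents and the helper names are illustrative assumptions.

```python
def looks_large_enough(exponent):
    # Axis values arrive as strings of power-of-two exponents; keep sizes above 2^20 elements.
    return exponent.isdigit() and int(exponent) > 20


def filter_for_ci(rt_values):
    for subbench, axes in rt_values.items():
        for axis, values in axes.items():
            if axis.startswith('Elements') and axis.endswith('[pow2]'):
                axes[axis] = [v for v in values if looks_large_enough(v)]
    return rt_values


# Hypothetical runtime axis values for one sub-benchmark:
rt_values = {'base': {'Elements{io}[pow2]': ['16', '20', '24', '28'],
                      'Entropy': ['1.000', '0.201']}}
print(filter_for_ci(rt_values))
# {'base': {'Elements{io}[pow2]': ['24', '28'], 'Entropy': ['1.000', '0.201']}}
```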
@@ -100,7 +100,7 @@ void left(nvbench::state& state, nvbench::type_list<T, OffsetT>)
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
std::uint8_t* d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(d_temp_storage,
temp_storage_bytes,
d_in,
9 changes: 4 additions & 5 deletions cub/benchmarks/bench/copy/memcpy.cu
@@ -249,7 +249,7 @@ void copy(nvbench::state &state,
thrust::device_vector<nvbench::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(d_temp_storage,
temp_storage_bytes,
d_input_buffers,
@@ -275,8 +275,8 @@ void uniform(nvbench::state &state, nvbench::type_list<T, OffsetT> tl)
elements,
min_buffer_size,
max_buffer_size,
state.get_int64("RandomizeInput"),
state.get_int64("RandomizeOutput"));
state.get_int64("Randomize"),
state.get_int64("Randomize"));
}

template <class T, class OffsetT>
@@ -309,8 +309,7 @@ NVBENCH_BENCH_TYPES(uniform, NVBENCH_TYPE_AXES(types, u_offset_types))
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(25, 29, 2))
.add_int64_axis("MinBufferSizeRatio", {1, 99})
.add_int64_axis("MaxBufferSize", {8, 64, 256, 1024, 64 * 1024})
.add_int64_axis("RandomizeInput", {0, 1})
.add_int64_axis("RandomizeOutput", {0, 1});
.add_int64_axis("Randomize", {0, 1});

NVBENCH_BENCH_TYPES(large, NVBENCH_TYPE_AXES(types, u_offset_types))
.set_name("large")
6 changes: 3 additions & 3 deletions cub/benchmarks/bench/histogram/even.cu
@@ -110,7 +110,7 @@ static void even(nvbench::state &state, nvbench::type_list<SampleT, CounterT, Of
thrust::device_vector<nvbench::uint8_t> tmp(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(tmp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::DispatchEven(d_temp_storage,
temp_storage_bytes,
d_input,
@@ -139,5 +139,5 @@ NVBENCH_BENCH_TYPES(even, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offset
.set_name("base")
.set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_int64_axis("Bins", {32, 64, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "0.544", "1.000"});
.add_int64_axis("Bins", {32, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "1.000"});
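The histogram benchmarks in this and the following files drop one Bins value and one Entropy level, presumably to shrink the CI sweep. A quick count over the axis values shown above (nothing else about the benchmark is assumed) illustrates the effect:

```python
from itertools import product

elements = [2**e for e in range(16, 29, 4)]  # Elements{io}[pow2]: 2^16, 2^20, 2^24, 2^28

bins_before, bins_after = [32, 64, 128, 2048, 2097152], [32, 128, 2048, 2097152]
entropy_before, entropy_after = ["0.201", "0.544", "1.000"], ["0.201", "1.000"]

before = len(list(product(elements, bins_before, entropy_before)))
after = len(list(product(elements, bins_after, entropy_after)))
print(before, after)  # 60 -> 32 configurations per SampleT/BinT/OffsetT combination
```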
6 changes: 3 additions & 3 deletions cub/benchmarks/bench/histogram/multi/even.cu
@@ -121,7 +121,7 @@ static void even(nvbench::state &state, nvbench::type_list<SampleT, CounterT, Of
thrust::device_vector<nvbench::uint8_t> tmp(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(tmp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::DispatchEven(d_temp_storage,
temp_storage_bytes,
d_input,
@@ -150,5 +150,5 @@ NVBENCH_BENCH_TYPES(even, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offset
.set_name("base")
.set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_int64_axis("Bins", {32, 64, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "0.544", "1.000"});
.add_int64_axis("Bins", {32, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "1.000"});
6 changes: 3 additions & 3 deletions cub/benchmarks/bench/histogram/multi/range.cu
@@ -129,7 +129,7 @@ static void range(nvbench::state &state, nvbench::type_list<SampleT, CounterT, O
thrust::device_vector<nvbench::uint8_t> tmp(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(tmp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::DispatchRange(d_temp_storage,
temp_storage_bytes,
d_input,
@@ -157,5 +157,5 @@ NVBENCH_BENCH_TYPES(range, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offse
.set_name("base")
.set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_int64_axis("Bins", {32, 64, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "0.544", "1.000"});
.add_int64_axis("Bins", {32, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "1.000"});
6 changes: 3 additions & 3 deletions cub/benchmarks/bench/histogram/range.cu
@@ -116,7 +116,7 @@ static void range(nvbench::state &state, nvbench::type_list<SampleT, CounterT, O
thrust::device_vector<nvbench::uint8_t> tmp(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(tmp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::DispatchRange(d_temp_storage,
temp_storage_bytes,
d_input,
@@ -144,5 +144,5 @@ NVBENCH_BENCH_TYPES(range, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offse
.set_name("base")
.set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_int64_axis("Bins", {32, 64, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "0.544", "1.000"});
.add_int64_axis("Bins", {32, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "1.000"});
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/merge_sort/keys.cu
@@ -131,7 +131,7 @@ void keys(nvbench::state &state, nvbench::type_list<T, OffsetT>)
thrust::device_vector<nvbench::uint8_t> temp(temp_size);
auto *temp_storage = thrust::raw_pointer_cast(temp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(temp_storage,
temp_size,
d_buffer_1,
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/merge_sort/pairs.cu
@@ -134,7 +134,7 @@ void pairs(nvbench::state &state, nvbench::type_list<KeyT, ValueT, OffsetT>)
thrust::device_vector<nvbench::uint8_t> temp(temp_size);
auto *temp_storage = thrust::raw_pointer_cast(temp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(temp_storage,
temp_size,
d_keys_buffer_1,
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/partition/flagged.cu
@@ -148,7 +148,7 @@ void flagged(nvbench::state &state, nvbench::type_list<T, OffsetT>)
thrust::device_vector<nvbench::uint8_t> temp(temp_size);
auto *temp_storage = thrust::raw_pointer_cast(temp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(temp_storage,
temp_size,
d_in,
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/partition/if.cu
@@ -170,7 +170,7 @@ void partition(nvbench::state &state, nvbench::type_list<T, OffsetT>)
thrust::device_vector<nvbench::uint8_t> temp(temp_size);
auto *temp_storage = thrust::raw_pointer_cast(temp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(temp_storage,
temp_size,
d_in,
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/partition/three_way.cu
@@ -146,7 +146,7 @@ void partition(nvbench::state &state, nvbench::type_list<T, OffsetT>)
thrust::device_vector<nvbench::uint8_t> temp(temp_size);
auto *temp_storage = thrust::raw_pointer_cast(temp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(temp_storage,
temp_size,
d_in,
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/radix_sort/keys.cu
@@ -182,7 +182,7 @@ void radix_sort_keys(std::integral_constant<bool, true>,
thrust::device_vector<nvbench::uint8_t> temp(temp_size);
auto *temp_storage = thrust::raw_pointer_cast(temp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
cub::DoubleBuffer<key_t> keys = d_keys;
cub::DoubleBuffer<value_t> values = d_values;

@@ -222,4 +222,4 @@ NVBENCH_BENCH_TYPES(radix_sort_keys, NVBENCH_TYPE_AXES(fundamental_types, offset
.set_name("base")
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_string_axis("Entropy", {"1.000", "0.811", "0.544", "0.337", "0.201"});
.add_string_axis("Entropy", {"1.000", "0.544", "0.201"});
6 changes: 3 additions & 3 deletions cub/benchmarks/bench/radix_sort/pairs.cu
@@ -186,7 +186,7 @@ void radix_sort_values(std::integral_constant<bool, true>,
thrust::device_vector<nvbench::uint8_t> temp(temp_size);
auto *temp_storage = thrust::raw_pointer_cast(temp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
cub::DoubleBuffer<key_t> keys = d_keys;
cub::DoubleBuffer<value_t> values = d_values;

@@ -224,7 +224,7 @@ void radix_sort_values(nvbench::state &state, nvbench::type_list<KeyT, ValueT, O
#ifdef TUNE_KeyT
using key_types = nvbench::type_list<TUNE_KeyT>;
#else // !defined(TUNE_KeyT)
using key_types = fundamental_types;
using key_types = integral_types;
#endif // TUNE_KeyT

#ifdef TUNE_ValueT
@@ -245,4 +245,4 @@ NVBENCH_BENCH_TYPES(radix_sort_values, NVBENCH_TYPE_AXES(key_types, value_types,
.set_name("base")
.set_type_axes_names({"KeyT{ct}", "ValueT{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_string_axis("Entropy", {"1.000", "0.811", "0.544", "0.337", "0.201"});
.add_string_axis("Entropy", {"1.000", "0.201"});
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/reduce/base.cuh
@@ -103,7 +103,7 @@ void reduce(nvbench::state &state, nvbench::type_list<T, OffsetT>)
thrust::device_vector<nvbench::uint8_t> temp(temp_size);
auto *temp_storage = thrust::raw_pointer_cast(temp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(temp_storage,
temp_size,
d_in,
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/reduce/by_key.cu
@@ -158,7 +158,7 @@ static void reduce(nvbench::state &state, nvbench::type_list<KeyT, ValueT, Offse
state.add_global_memory_writes<KeyT>(num_runs);
state.add_global_memory_writes<OffsetT>(1);

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(d_temp_storage,
temp_storage_bytes,
d_in_keys,
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/run_length_encode/encode.cu
@@ -158,7 +158,7 @@ static void rle(nvbench::state &state, nvbench::type_list<T, OffsetT>)
state.add_global_memory_writes<OffsetT>(num_runs);
state.add_global_memory_writes<OffsetT>(1);

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(d_temp_storage,
temp_storage_bytes,
d_in_keys,
@@ -145,7 +145,7 @@ static void rle(nvbench::state &state, nvbench::type_list<T, OffsetT>)
state.add_global_memory_writes<OffsetT>(num_runs);
state.add_global_memory_writes<OffsetT>(1);

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(d_temp_storage,
temp_storage_bytes,
d_in_keys,
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/scan/exclusive/base.cuh
@@ -121,7 +121,7 @@ static void basic(nvbench::state &state, nvbench::type_list<T, OffsetT>)
thrust::device_vector<nvbench::uint8_t> tmp(tmp_size);
nvbench::uint8_t *d_tmp = thrust::raw_pointer_cast(tmp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(thrust::raw_pointer_cast(tmp.data()),
tmp_size,
d_input,
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/scan/exclusive/by_key.cu
@@ -134,7 +134,7 @@ static void scan(nvbench::state &state, nvbench::type_list<KeyT, ValueT, OffsetT
thrust::device_vector<nvbench::uint8_t> tmp(tmp_size);
nvbench::uint8_t *d_tmp = thrust::raw_pointer_cast(tmp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(d_tmp,
tmp_size,
d_keys,
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/segmented_sort/keys.cu
@@ -220,7 +220,7 @@ void seg_sort(nvbench::state &state,
thrust::device_vector<nvbench::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch &launch) {
cub::DoubleBuffer<key_t> keys = d_keys;
cub::DoubleBuffer<value_t> values = d_values;

@@ -255,7 +255,7 @@ NVBENCH_BENCH_TYPES(power_law, NVBENCH_TYPE_AXES(fundamental_types, some_offset_
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(22, 30, 4))
.add_int64_power_of_two_axis("Segments{io}", nvbench::range(12, 20, 4))
.add_string_axis("Entropy", {"1.000", "0.544", "0.201"});
.add_string_axis("Entropy", {"1.000", "0.201"});


template <class T, typename OffsetT>