[2.3.x] Backport benchmarking PRs #1168

Merged: 17 commits, Dec 1, 2023. Changes shown from 15 commits.
8 changes: 8 additions & 0 deletions benchmarks/scripts/cccl/bench/bench.py
@@ -635,6 +635,14 @@ def do_run(self, ct_point, rt_values, timeout, is_search=True):
cmd.append("--min-samples")
cmd.append("70")

# Unlike noise, the minimal benchmarking time is not directly related to variance.
# The default minimal time is 0.5 seconds. For CI we want to reduce it to 0.1 seconds
# because we have a limited time budget. A smaller minimal time does not affect the
# stability of the sample distribution median in a deterministic way: for small problem
# sizes, 0.1s leads to smaller variation than 0.5s, while for other workloads 0.5s leads to smaller variance.
cmd.append("--min-time")
cmd.append("0.1")

# NVBench is currently broken for multiple GPUs, use `CUDA_VISIBLE_DEVICES`
cmd.append("-d")
cmd.append("0")
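For orientation, the sketch below shows how these CI settings end up on an nvbench command line. It is a minimal, self-contained illustration, not the real `do_run`: the helper name `build_ci_bench_cmd` and the binary path are assumptions, while the `--min-samples`, `--min-time`, and `-d` flags are exactly the ones appended in the hunk above.

```python
def build_ci_bench_cmd(bench_path):
    """Assemble an nvbench invocation with the CI-oriented settings above (sketch)."""
    cmd = [bench_path]

    # Keep at least 70 samples so the median estimate stays meaningful.
    cmd.append("--min-samples")
    cmd.append("70")

    # Reduced minimal benchmarking time to fit the CI time budget (see the comment above).
    cmd.append("--min-time")
    cmd.append("0.1")

    # NVBench is currently broken for multiple GPUs; pin device 0 explicitly.
    cmd.append("-d")
    cmd.append("0")
    return cmd


# Hypothetical benchmark binary path, for illustration only:
print(" ".join(build_ci_bench_cmd("./bin/cub.bench.scan.exclusive.base")))
```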
36 changes: 29 additions & 7 deletions benchmarks/scripts/cccl/bench/search.py
@@ -47,19 +47,41 @@ def parse_arguments():
type=str, help="Parameter in the format `Param=Value`.")
parser.add_argument(
'--list-benches', action=argparse.BooleanOptionalAction, help="Show available benchmarks.")
parser.add_argument('--num-shards', type=int, default=1, help='Split benchmarks into M pieces and only run one')
parser.add_argument('--run-shard', type=int, default=0, help='Run shard N / M of benchmarks')
parser.add_argument('-P0', action=argparse.BooleanOptionalAction, help="Run P0 benchmarks (overrides -R)")
return parser.parse_args()


def run_benches(benchmarks, sub_space, regex, seeker):
pattern = re.compile(regex)

for algname in benchmarks:
if pattern.match(algname):
def run_benches(algnames, sub_space, seeker):
for algname in algnames:
try:
bench = BaseBench(algname)
ct_space = bench.ct_workload_space(sub_space)
rt_values = bench.rt_axes_values(sub_space)
seeker(algname, ct_space, rt_values)
except Exception as e:
print("#### ERROR exception occurred while running {}: '{}'".format(algname, e))


def filter_benchmarks(benchmarks, args):
if args.run_shard >= args.num_shards:
raise ValueError('run-shard must be less than num-shards')

R = args.R
if args.P0:
R = '^(?!.*segmented).*(scan|reduce|select|sort).*'

pattern = re.compile(R)
algnames = list(filter(lambda x: pattern.match(x), benchmarks.keys()))
algnames.sort()

if args.num_shards > 1:
algnames = np.array_split(algnames, args.num_shards)[args.run_shard].tolist()
return algnames

return algnames


def search(seeker):
args = parse_arguments()
@@ -79,8 +101,8 @@ def search(seeker):
if args.list_benches:
list_benches()
return

run_benches(config.benchmarks, workload_sub_space, args.R, seeker)
run_benches(filter_benchmarks(config.benchmarks, args), workload_sub_space, seeker)


class MedianCenterEstimator:
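The new `--num-shards`, `--run-shard`, and `-P0` options above select and split the benchmark set. The following standalone sketch reproduces that logic outside the harness; the helper name `filter_and_shard` and the sample benchmark names are made up, while the P0 regex and the `np.array_split` sharding come from the diff above.

```python
import re

import numpy as np


def filter_and_shard(benchmark_names, regex, num_shards=1, run_shard=0):
    """Filter benchmark names by regex, then keep only the requested shard (sketch)."""
    if run_shard >= num_shards:
        raise ValueError('run-shard must be less than num-shards')

    pattern = re.compile(regex)
    names = sorted(name for name in benchmark_names if pattern.match(name))

    if num_shards > 1:
        # np.array_split tolerates uneven splits, e.g. 10 names into 3 shards -> 4/3/3.
        names = np.array_split(names, num_shards)[run_shard].tolist()
    return names


# Hypothetical benchmark names; the P0 regex skips segmented algorithms.
benches = ['cub.bench.scan.exclusive.base', 'cub.bench.select.if', 'cub.bench.segmented_sort.keys']
p0 = '^(?!.*segmented).*(scan|reduce|select|sort).*'
print(filter_and_shard(benches, p0, num_shards=2, run_shard=0))
# ['cub.bench.scan.exclusive.base']
```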
32 changes: 27 additions & 5 deletions benchmarks/scripts/run.py
@@ -6,33 +6,55 @@
import cccl.bench


def elapsed_time_look_good(x):
def elapsed_time_looks_good(x):
if isinstance(x, float):
if math.isfinite(x):
return True
return False


def problem_size_looks_large_enough(elements):
# Small problem sizes do not utilize the entire GPU.
# Benchmarking small problem sizes in environments where we do not control
# the distribution comparison (e.g. CI) is not useful because of stability issues.
return elements.isdigit() and int(elements) > 20


def filter_runtime_workloads_for_ci(rt_values):
for subbench in rt_values:
for axis in rt_values[subbench]:
if axis.startswith('Elements') and axis.endswith('[pow2]'):
rt_values[subbench][axis] = list(filter(problem_size_looks_large_enough, rt_values[subbench][axis]))

return rt_values


class BaseRunner:
def __init__(self):
self.estimator = cccl.bench.MedianCenterEstimator()

def __call__(self, algname, ct_workload_space, rt_values):
failure_occured = False
rt_values = filter_runtime_workloads_for_ci(rt_values)

for ct_workload in ct_workload_space:
bench = cccl.bench.BaseBench(algname)
if bench.build():
if bench.build(): # might throw
results = bench.run(ct_workload, rt_values, self.estimator, False)
for subbench in results:
for point in results[subbench]:
bench_name = "{}.{}-{}".format(bench.algorithm_name(), subbench, point)
bench_name = bench_name.replace(' ', '___')
bench_name = "".join(c if c.isalnum() else "_" for c in bench_name)
elapsed_time = results[subbench][point]
if elapsed_time_look_good(elapsed_time):
if elapsed_time_looks_good(elapsed_time):
print("&&&& PERF {} {} -sec".format(bench_name, elapsed_time))
else:
print("&&&& FAILED bench")
sys.exit(-1)
failure_occured = True
print("&&&& FAILED {}".format(algname))

if failure_occured:
sys.exit(1)


def main():
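To see what `filter_runtime_workloads_for_ci` actually does to the runtime axes, here is a self-contained sketch; the nested subbench/axis layout and the `> 20` power-of-two cutoff mirror the code above, but the dictionary contents and the helper names are illustrative assumptions.

```python
def looks_large_enough(exponent):
    # Axis values arrive as strings of power-of-two exponents; keep sizes above 2^20 elements.
    return exponent.isdigit() and int(exponent) > 20


def filter_for_ci(rt_values):
    for subbench, axes in rt_values.items():
        for axis, values in axes.items():
            if axis.startswith('Elements') and axis.endswith('[pow2]'):
                axes[axis] = [v for v in values if looks_large_enough(v)]
    return rt_values


# Hypothetical runtime axis values for one sub-benchmark:
rt_values = {'base': {'Elements{io}[pow2]': ['16', '20', '24', '28'],
                      'Entropy': ['1.000', '0.201']}}
print(filter_for_ci(rt_values))
# {'base': {'Elements{io}[pow2]': ['24', '28'], 'Entropy': ['1.000', '0.201']}}
```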
@@ -100,7 +100,7 @@ void left(nvbench::state& state, nvbench::type_list<T, OffsetT>)
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
std::uint8_t* d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(d_temp_storage,
temp_storage_bytes,
d_in,
9 changes: 4 additions & 5 deletions cub/benchmarks/bench/copy/memcpy.cu
@@ -249,7 +249,7 @@ void copy(nvbench::state &state,
thrust::device_vector<nvbench::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(d_temp_storage,
temp_storage_bytes,
d_input_buffers,
@@ -275,8 +275,8 @@ void uniform(nvbench::state &state, nvbench::type_list<T, OffsetT> tl)
elements,
min_buffer_size,
max_buffer_size,
state.get_int64("RandomizeInput"),
state.get_int64("RandomizeOutput"));
state.get_int64("Randomize"),
state.get_int64("Randomize"));
}

template <class T, class OffsetT>
@@ -309,8 +309,7 @@ NVBENCH_BENCH_TYPES(uniform, NVBENCH_TYPE_AXES(types, u_offset_types))
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(25, 29, 2))
.add_int64_axis("MinBufferSizeRatio", {1, 99})
.add_int64_axis("MaxBufferSize", {8, 64, 256, 1024, 64 * 1024})
.add_int64_axis("RandomizeInput", {0, 1})
.add_int64_axis("RandomizeOutput", {0, 1});
.add_int64_axis("Randomize", {0, 1});

NVBENCH_BENCH_TYPES(large, NVBENCH_TYPE_AXES(types, u_offset_types))
.set_name("large")
6 changes: 3 additions & 3 deletions cub/benchmarks/bench/histogram/even.cu
@@ -110,7 +110,7 @@ static void even(nvbench::state &state, nvbench::type_list<SampleT, CounterT, Of
thrust::device_vector<nvbench::uint8_t> tmp(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(tmp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::DispatchEven(d_temp_storage,
temp_storage_bytes,
d_input,
@@ -139,5 +139,5 @@ NVBENCH_BENCH_TYPES(even, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offset
.set_name("base")
.set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_int64_axis("Bins", {32, 64, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "0.544", "1.000"});
.add_int64_axis("Bins", {32, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "1.000"});
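The histogram benchmarks in this and the following files drop one Bins value and one Entropy level, presumably to shrink the CI sweep. A quick count over the axis values shown above (nothing else about the benchmark is assumed) illustrates the effect:

```python
from itertools import product

elements = [2**e for e in range(16, 29, 4)]  # Elements{io}[pow2]: 2^16, 2^20, 2^24, 2^28

bins_before, bins_after = [32, 64, 128, 2048, 2097152], [32, 128, 2048, 2097152]
entropy_before, entropy_after = ["0.201", "0.544", "1.000"], ["0.201", "1.000"]

before = len(list(product(elements, bins_before, entropy_before)))
after = len(list(product(elements, bins_after, entropy_after)))
print(before, after)  # 60 -> 32 configurations per SampleT/BinT/OffsetT combination
```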
6 changes: 3 additions & 3 deletions cub/benchmarks/bench/histogram/multi/even.cu
@@ -121,7 +121,7 @@ static void even(nvbench::state &state, nvbench::type_list<SampleT, CounterT, Of
thrust::device_vector<nvbench::uint8_t> tmp(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(tmp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::DispatchEven(d_temp_storage,
temp_storage_bytes,
d_input,
@@ -150,5 +150,5 @@ NVBENCH_BENCH_TYPES(even, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offset
.set_name("base")
.set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_int64_axis("Bins", {32, 64, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "0.544", "1.000"});
.add_int64_axis("Bins", {32, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "1.000"});
6 changes: 3 additions & 3 deletions cub/benchmarks/bench/histogram/multi/range.cu
@@ -129,7 +129,7 @@ static void range(nvbench::state &state, nvbench::type_list<SampleT, CounterT, O
thrust::device_vector<nvbench::uint8_t> tmp(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(tmp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::DispatchRange(d_temp_storage,
temp_storage_bytes,
d_input,
@@ -157,5 +157,5 @@ NVBENCH_BENCH_TYPES(range, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offse
.set_name("base")
.set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_int64_axis("Bins", {32, 64, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "0.544", "1.000"});
.add_int64_axis("Bins", {32, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "1.000"});
6 changes: 3 additions & 3 deletions cub/benchmarks/bench/histogram/range.cu
@@ -116,7 +116,7 @@ static void range(nvbench::state &state, nvbench::type_list<SampleT, CounterT, O
thrust::device_vector<nvbench::uint8_t> tmp(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(tmp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::DispatchRange(d_temp_storage,
temp_storage_bytes,
d_input,
@@ -144,5 +144,5 @@ NVBENCH_BENCH_TYPES(range, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offse
.set_name("base")
.set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_int64_axis("Bins", {32, 64, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "0.544", "1.000"});
.add_int64_axis("Bins", {32, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "1.000"});
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/merge_sort/keys.cu
@@ -131,7 +131,7 @@ void keys(nvbench::state &state, nvbench::type_list<T, OffsetT>)
thrust::device_vector<nvbench::uint8_t> temp(temp_size);
auto *temp_storage = thrust::raw_pointer_cast(temp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(temp_storage,
temp_size,
d_buffer_1,
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/merge_sort/pairs.cu
@@ -134,7 +134,7 @@ void pairs(nvbench::state &state, nvbench::type_list<KeyT, ValueT, OffsetT>)
thrust::device_vector<nvbench::uint8_t> temp(temp_size);
auto *temp_storage = thrust::raw_pointer_cast(temp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(temp_storage,
temp_size,
d_keys_buffer_1,
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/partition/flagged.cu
@@ -148,7 +148,7 @@ void flagged(nvbench::state &state, nvbench::type_list<T, OffsetT>)
thrust::device_vector<nvbench::uint8_t> temp(temp_size);
auto *temp_storage = thrust::raw_pointer_cast(temp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(temp_storage,
temp_size,
d_in,
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/partition/if.cu
@@ -170,7 +170,7 @@ void partition(nvbench::state &state, nvbench::type_list<T, OffsetT>)
thrust::device_vector<nvbench::uint8_t> temp(temp_size);
auto *temp_storage = thrust::raw_pointer_cast(temp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(temp_storage,
temp_size,
d_in,
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/partition/three_way.cu
@@ -146,7 +146,7 @@ void partition(nvbench::state &state, nvbench::type_list<T, OffsetT>)
thrust::device_vector<nvbench::uint8_t> temp(temp_size);
auto *temp_storage = thrust::raw_pointer_cast(temp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(temp_storage,
temp_size,
d_in,
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/radix_sort/keys.cu
@@ -182,7 +182,7 @@ void radix_sort_keys(std::integral_constant<bool, true>,
thrust::device_vector<nvbench::uint8_t> temp(temp_size);
auto *temp_storage = thrust::raw_pointer_cast(temp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
cub::DoubleBuffer<key_t> keys = d_keys;
cub::DoubleBuffer<value_t> values = d_values;

@@ -222,4 +222,4 @@ NVBENCH_BENCH_TYPES(radix_sort_keys, NVBENCH_TYPE_AXES(fundamental_types, offset
.set_name("base")
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_string_axis("Entropy", {"1.000", "0.811", "0.544", "0.337", "0.201"});
.add_string_axis("Entropy", {"1.000", "0.544", "0.201"});
6 changes: 3 additions & 3 deletions cub/benchmarks/bench/radix_sort/pairs.cu
@@ -186,7 +186,7 @@ void radix_sort_values(std::integral_constant<bool, true>,
thrust::device_vector<nvbench::uint8_t> temp(temp_size);
auto *temp_storage = thrust::raw_pointer_cast(temp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
cub::DoubleBuffer<key_t> keys = d_keys;
cub::DoubleBuffer<value_t> values = d_values;

@@ -224,7 +224,7 @@ void radix_sort_values(nvbench::state &state, nvbench::type_list<KeyT, ValueT, O
#ifdef TUNE_KeyT
using key_types = nvbench::type_list<TUNE_KeyT>;
#else // !defined(TUNE_KeyT)
using key_types = fundamental_types;
using key_types = integral_types;
#endif // TUNE_KeyT

#ifdef TUNE_ValueT
@@ -245,4 +245,4 @@ NVBENCH_BENCH_TYPES(radix_sort_values, NVBENCH_TYPE_AXES(key_types, value_types,
.set_name("base")
.set_type_axes_names({"KeyT{ct}", "ValueT{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_string_axis("Entropy", {"1.000", "0.811", "0.544", "0.337", "0.201"});
.add_string_axis("Entropy", {"1.000", "0.201"});
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/reduce/base.cuh
@@ -103,7 +103,7 @@ void reduce(nvbench::state &state, nvbench::type_list<T, OffsetT>)
thrust::device_vector<nvbench::uint8_t> temp(temp_size);
auto *temp_storage = thrust::raw_pointer_cast(temp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(temp_storage,
temp_size,
d_in,
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/reduce/by_key.cu
@@ -158,7 +158,7 @@ static void reduce(nvbench::state &state, nvbench::type_list<KeyT, ValueT, Offse
state.add_global_memory_writes<KeyT>(num_runs);
state.add_global_memory_writes<OffsetT>(1);

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(d_temp_storage,
temp_storage_bytes,
d_in_keys,
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/run_length_encode/encode.cu
@@ -158,7 +158,7 @@ static void rle(nvbench::state &state, nvbench::type_list<T, OffsetT>)
state.add_global_memory_writes<OffsetT>(num_runs);
state.add_global_memory_writes<OffsetT>(1);

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(d_temp_storage,
temp_storage_bytes,
d_in_keys,
@@ -145,7 +145,7 @@ static void rle(nvbench::state &state, nvbench::type_list<T, OffsetT>)
state.add_global_memory_writes<OffsetT>(num_runs);
state.add_global_memory_writes<OffsetT>(1);

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(d_temp_storage,
temp_storage_bytes,
d_in_keys,
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/scan/exclusive/base.cuh
@@ -121,7 +121,7 @@ static void basic(nvbench::state &state, nvbench::type_list<T, OffsetT>)
thrust::device_vector<nvbench::uint8_t> tmp(tmp_size);
nvbench::uint8_t *d_tmp = thrust::raw_pointer_cast(tmp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(thrust::raw_pointer_cast(tmp.data()),
tmp_size,
d_input,
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/scan/exclusive/by_key.cu
@@ -134,7 +134,7 @@ static void scan(nvbench::state &state, nvbench::type_list<KeyT, ValueT, OffsetT
thrust::device_vector<nvbench::uint8_t> tmp(tmp_size);
nvbench::uint8_t *d_tmp = thrust::raw_pointer_cast(tmp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(d_tmp,
tmp_size,
d_keys,
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/segmented_sort/keys.cu
@@ -220,7 +220,7 @@ void seg_sort(nvbench::state &state,
thrust::device_vector<nvbench::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch &launch) {
cub::DoubleBuffer<key_t> keys = d_keys;
cub::DoubleBuffer<value_t> values = d_values;

@@ -255,7 +255,7 @@ NVBENCH_BENCH_TYPES(power_law, NVBENCH_TYPE_AXES(fundamental_types, some_offset_
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(22, 30, 4))
.add_int64_power_of_two_axis("Segments{io}", nvbench::range(12, 20, 4))
.add_string_axis("Entropy", {"1.000", "0.544", "0.201"});
.add_string_axis("Entropy", {"1.000", "0.201"});


template <class T, typename OffsetT>