diff --git a/.github/workflows/build-and-test-linux.yml b/.github/workflows/build-and-test-linux.yml index b328e97884..32cfc25951 100644 --- a/.github/workflows/build-and-test-linux.yml +++ b/.github/workflows/build-and-test-linux.yml @@ -14,9 +14,15 @@ on: container_image: {type: string, required: false} run_tests: {type: boolean, required: false, default: true} +permissions: + contents: read + jobs: build: name: Build ${{inputs.test_name}} + permissions: + id-token: write + contents: read uses: ./.github/workflows/run-as-coder.yml with: name: Build ${{inputs.test_name}} @@ -27,6 +33,9 @@ jobs: test: needs: build + permissions: + id-token: write + contents: read if: ${{ !cancelled() && ( needs.build.result == 'success' || needs.build.result == 'skipped' ) && inputs.run_tests}} name: Test ${{inputs.test_name}} uses: ./.github/workflows/run-as-coder.yml diff --git a/.github/workflows/dispatch-build-and-test.yml b/.github/workflows/dispatch-build-and-test.yml index f421910f24..4606fa26cc 100644 --- a/.github/workflows/dispatch-build-and-test.yml +++ b/.github/workflows/dispatch-build-and-test.yml @@ -8,12 +8,18 @@ on: devcontainer_version: {type: string, required: true} is_windows: {type: boolean, required: true} +permissions: + contents: read + jobs: # Using a matrix to dispatch to the build-and-test reusable workflow for each build configuration # ensures that the build/test steps can overlap across different configurations. For example, # the build step for CUDA 12.1 + gcc 9.3 can run at the same time as the test step for CUDA 11.0 + clang 11. build_and_test_linux: name: build and test linux + permissions: + id-token: write + contents: read if: ${{ !inputs.is_windows }} uses: ./.github/workflows/build-and-test-linux.yml strategy: @@ -30,6 +36,9 @@ jobs: build_and_test_windows: name: build and test windows + permissions: + id-token: write + contents: read if: ${{ inputs.is_windows }} uses: ./.github/workflows/build-and-test-windows.yml strategy: diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index c56e2e1f6f..28da0d5df4 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -30,6 +30,10 @@ concurrency: group: ${{ github.workflow }}-on-${{ github.event_name }}-from-${{ github.ref_name }} cancel-in-progress: true +permissions: + contents: read + pull-requests: read + jobs: compute-matrix: name: Compute matrix @@ -53,6 +57,9 @@ jobs: nvrtc: name: NVRTC CUDA${{matrix.cuda}} C++${{matrix.std}} + permissions: + id-token: write + contents: read needs: compute-matrix if: ${{ !contains(github.event.head_commit.message, 'skip-tests') }} uses: ./.github/workflows/run-as-coder.yml @@ -69,6 +76,9 @@ jobs: thrust: name: Thrust CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }} + permissions: + id-token: write + contents: read needs: compute-matrix uses: ./.github/workflows/dispatch-build-and-test.yml strategy: @@ -84,6 +94,9 @@ jobs: cub: name: CUB CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }} + permissions: + id-token: write + contents: read needs: compute-matrix uses: ./.github/workflows/dispatch-build-and-test.yml strategy: @@ -99,6 +112,9 @@ jobs: libcudacxx: name: libcudacxx CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }} + permissions: + id-token: write + contents: read needs: compute-matrix uses: ./.github/workflows/dispatch-build-and-test.yml strategy: @@ -114,6 +130,9 @@ jobs: clang-cuda: name: ${{matrix.lib}} ${{matrix.cpu}}/CTK${{matrix.cuda}}/clang-cuda + permissions: + id-token: write + contents: read needs: compute-matrix strategy: 
fail-fast: false @@ -129,6 +148,9 @@ jobs: cccl-infra: name: CCCL Infrastructure + permissions: + id-token: write + contents: read needs: compute-matrix if: ${{ !contains(github.event.head_commit.message, 'skip-tests') }} strategy: @@ -146,6 +168,9 @@ jobs: verify-devcontainers: name: Verify Dev Containers + permissions: + id-token: write + contents: read uses: ./.github/workflows/verify-devcontainers.yml # This job is the final job that runs after all other jobs and is used for branch protection status checks. @@ -154,7 +179,7 @@ jobs: ci: runs-on: ubuntu-latest name: CI - if: ${{ always() }} # need to use always() instead of !cancelled() because skipped jobs count as success + if: ${{ always() }} # need to use always() instead of !cancelled() because skipped jobs count as success needs: - clang-cuda - cub diff --git a/.github/workflows/project_automation_set_in_progress.yml b/.github/workflows/project_automation_set_in_progress.yml index 0e30ab608d..f1f7586220 100644 --- a/.github/workflows/project_automation_set_in_progress.yml +++ b/.github/workflows/project_automation_set_in_progress.yml @@ -45,8 +45,7 @@ jobs: runs-on: ubuntu-latest permissions: - issues: write - pull-requests: write + pull-requests: read steps: - name: Check if changes requested from a reviewer @@ -56,17 +55,6 @@ jobs: if [ ${{ github.event.review.state }} != 'changes_requested' ]; then echo "Changes not requested, exiting" exit 0 - - # If it is requesting changes, set PR to draft - # We use the default token here since we're granting write access to the PR - elif [ ${{ github.event.pull_request.draft }} == false ]; then - gh api graphql -f query=' - mutation { - convertPullRequestToDraft(input: {pullRequestId: "${{ env.PR_GLOBAL_ID }}"}) { - clientMutationId - } - }' - exit 0 fi continue-on-error: true diff --git a/.github/workflows/run-as-coder.yml b/.github/workflows/run-as-coder.yml index 8d0cce9fba..6d09fd220f 100644 --- a/.github/workflows/run-as-coder.yml +++ b/.github/workflows/run-as-coder.yml @@ -14,17 +14,21 @@ on: command: {type: string, required: true} env: { type: string, required: false, default: "" } +permissions: + contents: read + jobs: run-as-coder: name: ${{inputs.name}} + permissions: + id-token: write + contents: read runs-on: ${{inputs.runner}} container: options: -u root image: ${{inputs.image}} env: NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} - permissions: - id-token: write steps: - name: Checkout repo uses: actions/checkout@v3 diff --git a/.github/workflows/verify-devcontainers.yml b/.github/workflows/verify-devcontainers.yml index baa6c2e273..ef9780f820 100644 --- a/.github/workflows/verify-devcontainers.yml +++ b/.github/workflows/verify-devcontainers.yml @@ -7,6 +7,9 @@ defaults: run: shell: bash -euo pipefail {0} +permissions: + contents: read + jobs: verify-make-devcontainers: name: Verify devcontainer files are up-to-date diff --git a/benchmarks/scripts/cccl/bench/bench.py b/benchmarks/scripts/cccl/bench/bench.py index adaa1f9955..eb5b05baa3 100644 --- a/benchmarks/scripts/cccl/bench/bench.py +++ b/benchmarks/scripts/cccl/bench/bench.py @@ -635,6 +635,14 @@ def do_run(self, ct_point, rt_values, timeout, is_search=True): cmd.append("--min-samples") cmd.append("70") + # Unlike noise, minimal benchmarking time is not directly related to variance. + # The default minimal time is 0.5 seconds. For CI we want to reduce it to 0.1 seconds, + # because we have a limited time budget. 
Having a smaller minimal time doesn't affect + # the stability of the sample distribution median in a deterministic way. For small problem sizes, + # 0.1s leads to smaller variance than 0.5s; for other workloads, 0.5s leads to smaller variance. + cmd.append("--min-time") + cmd.append("0.1") + # NVBench is currently broken for multiple GPUs, use `CUDA_VISIBLE_DEVICES` cmd.append("-d") cmd.append("0") diff --git a/benchmarks/scripts/cccl/bench/search.py b/benchmarks/scripts/cccl/bench/search.py index 264861f883..e34113c70e 100644 --- a/benchmarks/scripts/cccl/bench/search.py +++ b/benchmarks/scripts/cccl/bench/search.py @@ -47,19 +47,41 @@ def parse_arguments(): type=str, help="Parameter in the format `Param=Value`.") parser.add_argument( '--list-benches', action=argparse.BooleanOptionalAction, help="Show available benchmarks.") + parser.add_argument('--num-shards', type=int, default=1, help='Split benchmarks into M pieces and only run one') + parser.add_argument('--run-shard', type=int, default=0, help='Run shard N / M of benchmarks') + parser.add_argument('-P0', action=argparse.BooleanOptionalAction, help="Run P0 benchmarks (overrides -R)") return parser.parse_args() -def run_benches(benchmarks, sub_space, regex, seeker): - pattern = re.compile(regex) - - for algname in benchmarks: - if pattern.match(algname): +def run_benches(algnames, sub_space, seeker): + for algname in algnames: + try: bench = BaseBench(algname) ct_space = bench.ct_workload_space(sub_space) rt_values = bench.rt_axes_values(sub_space) seeker(algname, ct_space, rt_values) + except Exception as e: + print("#### ERROR exception occurred while running {}: '{}'".format(algname, e)) + + +def filter_benchmarks(benchmarks, args): + if args.run_shard >= args.num_shards: + raise ValueError('run-shard must be less than num-shards') + + R = args.R + if args.P0: + R = '^(?!.*segmented).*(scan|reduce|select|sort).*' + pattern = re.compile(R) + algnames = list(filter(lambda x: pattern.match(x), benchmarks.keys())) + algnames.sort() + + if args.num_shards > 1: + algnames = np.array_split(algnames, args.num_shards)[args.run_shard].tolist() + return algnames + + return algnames + def search(seeker): args = parse_arguments() @@ -79,8 +101,8 @@ def search(seeker): if args.list_benches: list_benches() return - - run_benches(config.benchmarks, workload_sub_space, args.R, seeker) + + run_benches(filter_benchmarks(config.benchmarks, args), workload_sub_space, seeker) class MedianCenterEstimator: diff --git a/benchmarks/scripts/run.py b/benchmarks/scripts/run.py index 23f0d9e082..2366fd88ba 100755 --- a/benchmarks/scripts/run.py +++ b/benchmarks/scripts/run.py @@ -6,21 +6,40 @@ import cccl.bench -def elapsed_time_look_good(x): +def elapsed_time_looks_good(x): if isinstance(x, float): if math.isfinite(x): return True return False +def problem_size_looks_large_enough(elements): + # Small problem sizes do not utilize the entire GPU. + # Benchmarking small problem sizes in environments where we do not control + # how distributions are compared, e.g. CI, is not useful because of stability issues. 
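+ # Note (clarifying comment): values on `[pow2]` axes are exponents, so the `> 20` check below keeps only problem sizes larger than 2^20 elements.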
+ return elements.isdigit() and int(elements) > 20 + + +def filter_runtime_workloads_for_ci(rt_values): + for subbench in rt_values: + for axis in rt_values[subbench]: + if axis.startswith('Elements') and axis.endswith('[pow2]'): + rt_values[subbench][axis] = list(filter(problem_size_looks_large_enough, rt_values[subbench][axis])) + + return rt_values + + class BaseRunner: def __init__(self): self.estimator = cccl.bench.MedianCenterEstimator() def __call__(self, algname, ct_workload_space, rt_values): + failure_occurred = False + rt_values = filter_runtime_workloads_for_ci(rt_values) + for ct_workload in ct_workload_space: bench = cccl.bench.BaseBench(algname) - if bench.build(): + if bench.build(): # might throw results = bench.run(ct_workload, rt_values, self.estimator, False) for subbench in results: for point in results[subbench]: @@ -28,11 +47,14 @@ def __call__(self, algname, ct_workload_space, rt_values): bench_name = bench_name.replace(' ', '___') bench_name = "".join(c if c.isalnum() else "_" for c in bench_name) elapsed_time = results[subbench][point] - if elapsed_time_look_good(elapsed_time): + if elapsed_time_looks_good(elapsed_time): print("&&&& PERF {} {} -sec".format(bench_name, elapsed_time)) else: - print("&&&& FAILED bench") - sys.exit(-1) + failure_occurred = True + print("&&&& FAILED {}".format(algname)) + + if failure_occurred: + sys.exit(1) def main(): diff --git a/cub/benchmarks/bench/adjacent_difference/subtract_left.cu b/cub/benchmarks/bench/adjacent_difference/subtract_left.cu index d47efd83ab..0539509b09 100644 --- a/cub/benchmarks/bench/adjacent_difference/subtract_left.cu +++ b/cub/benchmarks/bench/adjacent_difference/subtract_left.cu @@ -100,7 +100,7 @@ void left(nvbench::state& state, nvbench::type_list) thrust::device_vector temp_storage(temp_storage_bytes); std::uint8_t* d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(d_temp_storage, temp_storage_bytes, d_in, diff --git a/cub/benchmarks/bench/copy/memcpy.cu b/cub/benchmarks/bench/copy/memcpy.cu index 467e0c2e0e..794619bc83 100644 --- a/cub/benchmarks/bench/copy/memcpy.cu +++ b/cub/benchmarks/bench/copy/memcpy.cu @@ -249,7 +249,7 @@ void copy(nvbench::state &state, thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch &launch) { dispatch_t::Dispatch(d_temp_storage, temp_storage_bytes, d_input_buffers, @@ -275,8 +275,8 @@ void uniform(nvbench::state &state, nvbench::type_list tl) elements, min_buffer_size, max_buffer_size, - state.get_int64("RandomizeInput"), - state.get_int64("RandomizeOutput")); + state.get_int64("Randomize"), + state.get_int64("Randomize")); } template @@ -309,8 +309,7 @@ NVBENCH_BENCH_TYPES(uniform, NVBENCH_TYPE_AXES(types, u_offset_types)) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(25, 29, 2)) .add_int64_axis("MinBufferSizeRatio", {1, 99}) .add_int64_axis("MaxBufferSize", {8, 64, 256, 1024, 64 * 1024}) - .add_int64_axis("RandomizeInput", {0, 1}) - .add_int64_axis("RandomizeOutput", {0, 1}); + .add_int64_axis("Randomize", {0, 1}); NVBENCH_BENCH_TYPES(large, NVBENCH_TYPE_AXES(types, u_offset_types)) .set_name("large") diff --git a/cub/benchmarks/bench/histogram/even.cu 
b/cub/benchmarks/bench/histogram/even.cu index ce34cab5f4..7ef201dbba 100644 --- a/cub/benchmarks/bench/histogram/even.cu +++ b/cub/benchmarks/bench/histogram/even.cu @@ -110,7 +110,7 @@ static void even(nvbench::state &state, nvbench::type_list tmp(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(tmp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::DispatchEven(d_temp_storage, temp_storage_bytes, d_input, @@ -139,5 +139,5 @@ NVBENCH_BENCH_TYPES(even, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offset .set_name("base") .set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) - .add_int64_axis("Bins", {32, 64, 128, 2048, 2097152}) - .add_string_axis("Entropy", {"0.201", "0.544", "1.000"}); + .add_int64_axis("Bins", {32, 128, 2048, 2097152}) + .add_string_axis("Entropy", {"0.201", "1.000"}); diff --git a/cub/benchmarks/bench/histogram/multi/even.cu b/cub/benchmarks/bench/histogram/multi/even.cu index 1a501aeec1..83ebfcea5b 100644 --- a/cub/benchmarks/bench/histogram/multi/even.cu +++ b/cub/benchmarks/bench/histogram/multi/even.cu @@ -121,7 +121,7 @@ static void even(nvbench::state &state, nvbench::type_list tmp(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(tmp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::DispatchEven(d_temp_storage, temp_storage_bytes, d_input, @@ -150,5 +150,5 @@ NVBENCH_BENCH_TYPES(even, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offset .set_name("base") .set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) - .add_int64_axis("Bins", {32, 64, 128, 2048, 2097152}) - .add_string_axis("Entropy", {"0.201", "0.544", "1.000"}); + .add_int64_axis("Bins", {32, 128, 2048, 2097152}) + .add_string_axis("Entropy", {"0.201", "1.000"}); diff --git a/cub/benchmarks/bench/histogram/multi/range.cu b/cub/benchmarks/bench/histogram/multi/range.cu index 9d8431635b..939cb79fe2 100644 --- a/cub/benchmarks/bench/histogram/multi/range.cu +++ b/cub/benchmarks/bench/histogram/multi/range.cu @@ -129,7 +129,7 @@ static void range(nvbench::state &state, nvbench::type_list tmp(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(tmp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::DispatchRange(d_temp_storage, temp_storage_bytes, d_input, @@ -157,5 +157,5 @@ NVBENCH_BENCH_TYPES(range, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offse .set_name("base") .set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) - .add_int64_axis("Bins", {32, 64, 128, 2048, 2097152}) - .add_string_axis("Entropy", {"0.201", "0.544", "1.000"}); + .add_int64_axis("Bins", {32, 128, 2048, 2097152}) + .add_string_axis("Entropy", {"0.201", "1.000"}); diff --git a/cub/benchmarks/bench/histogram/range.cu b/cub/benchmarks/bench/histogram/range.cu index 9e11806490..266c483a19 100644 --- a/cub/benchmarks/bench/histogram/range.cu +++ b/cub/benchmarks/bench/histogram/range.cu @@ -116,7 +116,7 @@ static void range(nvbench::state &state, nvbench::type_list tmp(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(tmp.data()); - state.exec([&](nvbench::launch &launch) { + 
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::DispatchRange(d_temp_storage, temp_storage_bytes, d_input, @@ -144,5 +144,5 @@ NVBENCH_BENCH_TYPES(range, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offse .set_name("base") .set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) - .add_int64_axis("Bins", {32, 64, 128, 2048, 2097152}) - .add_string_axis("Entropy", {"0.201", "0.544", "1.000"}); + .add_int64_axis("Bins", {32, 128, 2048, 2097152}) + .add_string_axis("Entropy", {"0.201", "1.000"}); diff --git a/cub/benchmarks/bench/merge_sort/keys.cu b/cub/benchmarks/bench/merge_sort/keys.cu index ba43e86cec..831c021515 100644 --- a/cub/benchmarks/bench/merge_sort/keys.cu +++ b/cub/benchmarks/bench/merge_sort/keys.cu @@ -131,7 +131,7 @@ void keys(nvbench::state &state, nvbench::type_list) thrust::device_vector temp(temp_size); auto *temp_storage = thrust::raw_pointer_cast(temp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(temp_storage, temp_size, d_buffer_1, diff --git a/cub/benchmarks/bench/merge_sort/pairs.cu b/cub/benchmarks/bench/merge_sort/pairs.cu index cf4b23b69b..6a1f9d8d56 100644 --- a/cub/benchmarks/bench/merge_sort/pairs.cu +++ b/cub/benchmarks/bench/merge_sort/pairs.cu @@ -134,7 +134,7 @@ void pairs(nvbench::state &state, nvbench::type_list) thrust::device_vector temp(temp_size); auto *temp_storage = thrust::raw_pointer_cast(temp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(temp_storage, temp_size, d_keys_buffer_1, diff --git a/cub/benchmarks/bench/partition/flagged.cu b/cub/benchmarks/bench/partition/flagged.cu index f26182e260..0045fc4475 100644 --- a/cub/benchmarks/bench/partition/flagged.cu +++ b/cub/benchmarks/bench/partition/flagged.cu @@ -148,7 +148,7 @@ void flagged(nvbench::state &state, nvbench::type_list) thrust::device_vector temp(temp_size); auto *temp_storage = thrust::raw_pointer_cast(temp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(temp_storage, temp_size, d_in, diff --git a/cub/benchmarks/bench/partition/if.cu b/cub/benchmarks/bench/partition/if.cu index 1ab89a02c5..fcbc97446f 100644 --- a/cub/benchmarks/bench/partition/if.cu +++ b/cub/benchmarks/bench/partition/if.cu @@ -170,7 +170,7 @@ void partition(nvbench::state &state, nvbench::type_list) thrust::device_vector temp(temp_size); auto *temp_storage = thrust::raw_pointer_cast(temp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(temp_storage, temp_size, d_in, diff --git a/cub/benchmarks/bench/partition/three_way.cu b/cub/benchmarks/bench/partition/three_way.cu index 9f2b5eb8b1..577d01797f 100644 --- a/cub/benchmarks/bench/partition/three_way.cu +++ b/cub/benchmarks/bench/partition/three_way.cu @@ -146,7 +146,7 @@ void partition(nvbench::state &state, nvbench::type_list) thrust::device_vector temp(temp_size); auto *temp_storage = thrust::raw_pointer_cast(temp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(temp_storage, temp_size, d_in, diff --git a/cub/benchmarks/bench/radix_sort/keys.cu 
b/cub/benchmarks/bench/radix_sort/keys.cu index 6586e40b91..c2fc6a7d1f 100644 --- a/cub/benchmarks/bench/radix_sort/keys.cu +++ b/cub/benchmarks/bench/radix_sort/keys.cu @@ -182,7 +182,7 @@ void radix_sort_keys(std::integral_constant, thrust::device_vector temp(temp_size); auto *temp_storage = thrust::raw_pointer_cast(temp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { cub::DoubleBuffer keys = d_keys; cub::DoubleBuffer values = d_values; @@ -222,4 +222,4 @@ NVBENCH_BENCH_TYPES(radix_sort_keys, NVBENCH_TYPE_AXES(fundamental_types, offset .set_name("base") .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) - .add_string_axis("Entropy", {"1.000", "0.811", "0.544", "0.337", "0.201"}); + .add_string_axis("Entropy", {"1.000", "0.544", "0.201"}); diff --git a/cub/benchmarks/bench/radix_sort/pairs.cu b/cub/benchmarks/bench/radix_sort/pairs.cu index 006e33b98c..503a25bc4b 100644 --- a/cub/benchmarks/bench/radix_sort/pairs.cu +++ b/cub/benchmarks/bench/radix_sort/pairs.cu @@ -186,7 +186,7 @@ void radix_sort_values(std::integral_constant, thrust::device_vector temp(temp_size); auto *temp_storage = thrust::raw_pointer_cast(temp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { cub::DoubleBuffer keys = d_keys; cub::DoubleBuffer values = d_values; @@ -224,7 +224,7 @@ void radix_sort_values(nvbench::state &state, nvbench::type_list; #else // !defined(TUNE_KeyT) -using key_types = fundamental_types; +using key_types = integral_types; #endif // TUNE_KeyT #ifdef TUNE_ValueT @@ -245,4 +245,4 @@ NVBENCH_BENCH_TYPES(radix_sort_values, NVBENCH_TYPE_AXES(key_types, value_types, .set_name("base") .set_type_axes_names({"KeyT{ct}", "ValueT{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) - .add_string_axis("Entropy", {"1.000", "0.811", "0.544", "0.337", "0.201"}); + .add_string_axis("Entropy", {"1.000", "0.201"}); diff --git a/cub/benchmarks/bench/reduce/base.cuh b/cub/benchmarks/bench/reduce/base.cuh index 253e5533b4..d874d69ae6 100644 --- a/cub/benchmarks/bench/reduce/base.cuh +++ b/cub/benchmarks/bench/reduce/base.cuh @@ -103,7 +103,7 @@ void reduce(nvbench::state &state, nvbench::type_list) thrust::device_vector temp(temp_size); auto *temp_storage = thrust::raw_pointer_cast(temp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(temp_storage, temp_size, d_in, diff --git a/cub/benchmarks/bench/reduce/by_key.cu b/cub/benchmarks/bench/reduce/by_key.cu index d11667b7a5..4ccca91147 100644 --- a/cub/benchmarks/bench/reduce/by_key.cu +++ b/cub/benchmarks/bench/reduce/by_key.cu @@ -158,7 +158,7 @@ static void reduce(nvbench::state &state, nvbench::type_list(num_runs); state.add_global_memory_writes(1); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(d_temp_storage, temp_storage_bytes, d_in_keys, diff --git a/cub/benchmarks/bench/run_length_encode/encode.cu b/cub/benchmarks/bench/run_length_encode/encode.cu index 1e02f01055..16a07206c9 100644 --- a/cub/benchmarks/bench/run_length_encode/encode.cu +++ b/cub/benchmarks/bench/run_length_encode/encode.cu @@ -158,7 +158,7 @@ static void rle(nvbench::state &state, nvbench::type_list) state.add_global_memory_writes(num_runs); 
state.add_global_memory_writes(1); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(d_temp_storage, temp_storage_bytes, d_in_keys, diff --git a/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu b/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu index b679aa4084..83e2a8eff3 100644 --- a/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu +++ b/cub/benchmarks/bench/run_length_encode/non_trivial_runs.cu @@ -145,7 +145,7 @@ static void rle(nvbench::state &state, nvbench::type_list) state.add_global_memory_writes(num_runs); state.add_global_memory_writes(1); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(d_temp_storage, temp_storage_bytes, d_in_keys, diff --git a/cub/benchmarks/bench/scan/exclusive/base.cuh b/cub/benchmarks/bench/scan/exclusive/base.cuh index 3c3ec561c6..1e28450e26 100644 --- a/cub/benchmarks/bench/scan/exclusive/base.cuh +++ b/cub/benchmarks/bench/scan/exclusive/base.cuh @@ -121,7 +121,7 @@ static void basic(nvbench::state &state, nvbench::type_list) thrust::device_vector tmp(tmp_size); nvbench::uint8_t *d_tmp = thrust::raw_pointer_cast(tmp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(thrust::raw_pointer_cast(tmp.data()), tmp_size, d_input, diff --git a/cub/benchmarks/bench/scan/exclusive/by_key.cu b/cub/benchmarks/bench/scan/exclusive/by_key.cu index 93e515e02c..26b36322ee 100644 --- a/cub/benchmarks/bench/scan/exclusive/by_key.cu +++ b/cub/benchmarks/bench/scan/exclusive/by_key.cu @@ -134,7 +134,7 @@ static void scan(nvbench::state &state, nvbench::type_list tmp(tmp_size); nvbench::uint8_t *d_tmp = thrust::raw_pointer_cast(tmp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(d_tmp, tmp_size, d_keys, diff --git a/cub/benchmarks/bench/segmented_sort/keys.cu b/cub/benchmarks/bench/segmented_sort/keys.cu index 6f47c66d66..d0d1578994 100644 --- a/cub/benchmarks/bench/segmented_sort/keys.cu +++ b/cub/benchmarks/bench/segmented_sort/keys.cu @@ -220,7 +220,7 @@ void seg_sort(nvbench::state &state, thrust::device_vector temp_storage(temp_storage_bytes); d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch &launch) { cub::DoubleBuffer keys = d_keys; cub::DoubleBuffer values = d_values; @@ -255,7 +255,7 @@ NVBENCH_BENCH_TYPES(power_law, NVBENCH_TYPE_AXES(fundamental_types, some_offset_ .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) .add_int64_power_of_two_axis("Elements{io}", nvbench::range(22, 30, 4)) .add_int64_power_of_two_axis("Segments{io}", nvbench::range(12, 20, 4)) - .add_string_axis("Entropy", {"1.000", "0.544", "0.201"}); + .add_string_axis("Entropy", {"1.000", "0.201"}); template diff --git a/cub/benchmarks/bench/select/flagged.cu b/cub/benchmarks/bench/select/flagged.cu index fcbf57aba1..dcf0598bd3 100644 --- a/cub/benchmarks/bench/select/flagged.cu +++ b/cub/benchmarks/bench/select/flagged.cu @@ -152,7 +152,7 @@ void select(nvbench::state &state, nvbench::type_list) thrust::device_vector temp(temp_size); auto *temp_storage = thrust::raw_pointer_cast(temp.data()); - state.exec([&](nvbench::launch &launch) { + 
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(temp_storage, temp_size, d_in, diff --git a/cub/benchmarks/bench/select/if.cu b/cub/benchmarks/bench/select/if.cu index 089ffa0f4a..981ed7b701 100644 --- a/cub/benchmarks/bench/select/if.cu +++ b/cub/benchmarks/bench/select/if.cu @@ -174,7 +174,7 @@ void select(nvbench::state &state, nvbench::type_list) thrust::device_vector temp(temp_size); auto *temp_storage = thrust::raw_pointer_cast(temp.data()); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(temp_storage, temp_size, d_in, diff --git a/cub/benchmarks/bench/select/unique_by_key.cu b/cub/benchmarks/bench/select/unique_by_key.cu index 1d610e2b82..e048d81b63 100644 --- a/cub/benchmarks/bench/select/unique_by_key.cu +++ b/cub/benchmarks/bench/select/unique_by_key.cu @@ -150,7 +150,7 @@ static void select(nvbench::state &state, nvbench::type_list(num_runs); state.add_global_memory_writes(1); - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(d_temp_storage, temp_storage_bytes, d_in_keys, diff --git a/cub/benchmarks/nvbench_helper/CMakeLists.txt b/cub/benchmarks/nvbench_helper/CMakeLists.txt index 38112b1779..942585ef16 100644 --- a/cub/benchmarks/nvbench_helper/CMakeLists.txt +++ b/cub/benchmarks/nvbench_helper/CMakeLists.txt @@ -1,5 +1,5 @@ # Fetch nvbench -CPMAddPackage("gh:NVIDIA/nvbench#39b2770b62ce1f4e0ebeb9af60d7c6de624633a5") +CPMAddPackage("gh:NVIDIA/nvbench#main") add_library(nvbench_helper OBJECT nvbench_helper/nvbench_helper.cuh nvbench_helper/nvbench_helper.cu) @@ -44,4 +44,5 @@ if (CUB_ENABLE_NVBENCH_HELPER_TESTS) add_nvbench_helper_test(cpp) add_nvbench_helper_test(cuda) -endif() \ No newline at end of file +endif() + diff --git a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh index c5ae5f6c50..3bedf5841a 100644 --- a/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh +++ b/cub/benchmarks/nvbench_helper/nvbench_helper/nvbench_helper.cuh @@ -53,29 +53,34 @@ using offset_types = nvbench::type_list; #endif #ifdef TUNE_T +using integral_types = nvbench::type_list; using fundamental_types = nvbench::type_list; -using all_types = nvbench::type_list; +using all_types = nvbench::type_list; #else -using fundamental_types = nvbench::type_list; - -using all_types = nvbench::type_list; +using integral_types = nvbench::type_list; + +using fundamental_types = + nvbench::type_list; + +using all_types = + nvbench::type_list; #endif template diff --git a/cub/docs/tuning.rst b/cub/docs/tuning.rst index ae6cdaabe3..9ac2b9752b 100644 --- a/cub/docs/tuning.rst +++ b/cub/docs/tuning.rst @@ -121,7 +121,7 @@ Finally, we can run the algorithm: .. 
code:: c++ - state.exec([&](nvbench::launch &launch) { + state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) { dispatch_t::Dispatch(temp_storage, temp_size, d_in, diff --git a/thrust/benchmarks/bench/adjacent_difference/basic.cu b/thrust/benchmarks/bench/adjacent_difference/basic.cu index d681aaa6aa..47f93f382b 100644 --- a/thrust/benchmarks/bench/adjacent_difference/basic.cu +++ b/thrust/benchmarks/bench/adjacent_difference/basic.cu @@ -43,7 +43,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::adjacent_difference(input.cbegin(), input.cend(), output.begin()); }); } diff --git a/thrust/benchmarks/bench/adjacent_difference/custom.cu b/thrust/benchmarks/bench/adjacent_difference/custom.cu index 63d5d69e12..e8e892bf8d 100644 --- a/thrust/benchmarks/bench/adjacent_difference/custom.cu +++ b/thrust/benchmarks/bench/adjacent_difference/custom.cu @@ -60,7 +60,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::adjacent_difference(input.cbegin(), input.cend(), output.begin(), custom_op{42}); }); } diff --git a/thrust/benchmarks/bench/adjacent_difference/in_place.cu b/thrust/benchmarks/bench/adjacent_difference/in_place.cu index e6c39748df..4e91e8471c 100644 --- a/thrust/benchmarks/bench/adjacent_difference/in_place.cu +++ b/thrust/benchmarks/bench/adjacent_difference/in_place.cu @@ -42,7 +42,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::adjacent_difference(vec.begin(), vec.end(), vec.begin()); }); } diff --git a/thrust/benchmarks/bench/copy/basic.cu b/thrust/benchmarks/bench/copy/basic.cu index 8969a7b886..1b2b96214d 100644 --- a/thrust/benchmarks/bench/copy/basic.cu +++ b/thrust/benchmarks/bench/copy/basic.cu @@ -45,7 +45,7 @@ static void basic(nvbench::state &state, state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::copy(input.cbegin(), input.cend(), output.begin()); diff --git a/thrust/benchmarks/bench/copy/if.cu b/thrust/benchmarks/bench/copy/if.cu index d8c4fd22e6..8b89e08db6 100644 --- a/thrust/benchmarks/bench/copy/if.cu +++ b/thrust/benchmarks/bench/copy/if.cu @@ -74,7 +74,7 @@ static void basic(nvbench::state &state, state.add_global_memory_reads(elements); state.add_global_memory_writes(selected_elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::copy_if(input.cbegin(), input.cend(), output.begin(), select_op); }); } diff --git 
a/thrust/benchmarks/bench/fill/basic.cu b/thrust/benchmarks/bench/fill/basic.cu index 63a24d2bc2..3c29f3c704 100644 --- a/thrust/benchmarks/bench/fill/basic.cu +++ b/thrust/benchmarks/bench/fill/basic.cu @@ -41,7 +41,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_element_count(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::fill(output.begin(), output.end(), T{42}); }); } diff --git a/thrust/benchmarks/bench/inner_product/basic.cu b/thrust/benchmarks/bench/inner_product/basic.cu index 5a60ca1cfd..aa3b5d467e 100644 --- a/thrust/benchmarks/bench/inner_product/basic.cu +++ b/thrust/benchmarks/bench/inner_product/basic.cu @@ -44,7 +44,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements * 2); state.add_global_memory_writes(1); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::inner_product(lhs.begin(), lhs.end(), rhs.begin(), T{0}); }); } diff --git a/thrust/benchmarks/bench/merge/basic.cu b/thrust/benchmarks/bench/merge/basic.cu index fb8e8f8822..854baf8ec0 100644 --- a/thrust/benchmarks/bench/merge/basic.cu +++ b/thrust/benchmarks/bench/merge/basic.cu @@ -50,7 +50,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::merge(in.cbegin(), in.cbegin() + elements_in_lhs, in.cbegin() + elements_in_lhs, diff --git a/thrust/benchmarks/bench/partition/basic.cu b/thrust/benchmarks/bench/partition/basic.cu index a04aae3128..aafdc89223 100644 --- a/thrust/benchmarks/bench/partition/basic.cu +++ b/thrust/benchmarks/bench/partition/basic.cu @@ -72,7 +72,7 @@ static void basic(nvbench::state &state, state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::partition_copy(input.cbegin(), input.cend(), output.begin(), diff --git a/thrust/benchmarks/bench/reduce/basic.cu b/thrust/benchmarks/bench/reduce/basic.cu index 97dbe5d02b..e6e31c22a0 100644 --- a/thrust/benchmarks/bench/reduce/basic.cu +++ b/thrust/benchmarks/bench/reduce/basic.cu @@ -42,7 +42,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(1); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { do_not_optimize(thrust::reduce(in.begin(), in.end())); }); } diff --git a/thrust/benchmarks/bench/reduce/by_key.cu b/thrust/benchmarks/bench/reduce/by_key.cu index 4eaaed194e..282dff7d94 100644 --- a/thrust/benchmarks/bench/reduce/by_key.cu +++ b/thrust/benchmarks/bench/reduce/by_key.cu @@ -57,7 +57,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_writes(unique_keys); 
state.add_global_memory_writes(unique_keys); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::reduce_by_key(in_keys.begin(), in_keys.end(), in_vals.begin(), diff --git a/thrust/benchmarks/bench/scan/exclusive/by_key.cu b/thrust/benchmarks/bench/scan/exclusive/by_key.cu index 76a5a0f992..df650554b3 100644 --- a/thrust/benchmarks/bench/scan/exclusive/by_key.cu +++ b/thrust/benchmarks/bench/scan/exclusive/by_key.cu @@ -45,7 +45,7 @@ static void scan(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::exclusive_scan_by_key(keys.cbegin(), keys.cend(), in_vals.cbegin(), out_vals.begin()); }); } diff --git a/thrust/benchmarks/bench/scan/exclusive/max.cu b/thrust/benchmarks/bench/scan/exclusive/max.cu index c434c537e5..a18a3c96cb 100644 --- a/thrust/benchmarks/bench/scan/exclusive/max.cu +++ b/thrust/benchmarks/bench/scan/exclusive/max.cu @@ -43,7 +43,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::exclusive_scan(input.cbegin(), input.cend(), output.begin(), T{}, max_t{}); }); } diff --git a/thrust/benchmarks/bench/scan/exclusive/sum.cu b/thrust/benchmarks/bench/scan/exclusive/sum.cu index 75ae35894d..29b82b68a8 100644 --- a/thrust/benchmarks/bench/scan/exclusive/sum.cu +++ b/thrust/benchmarks/bench/scan/exclusive/sum.cu @@ -43,7 +43,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::exclusive_scan(input.cbegin(), input.cend(), output.begin()); }); } diff --git a/thrust/benchmarks/bench/scan/inclusive/by_key.cu b/thrust/benchmarks/bench/scan/inclusive/by_key.cu index bb468ff57d..10e0cbc408 100644 --- a/thrust/benchmarks/bench/scan/inclusive/by_key.cu +++ b/thrust/benchmarks/bench/scan/inclusive/by_key.cu @@ -45,7 +45,7 @@ static void scan(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::inclusive_scan_by_key(keys.cbegin(), keys.cend(), in_vals.cbegin(), out_vals.begin()); }); } diff --git a/thrust/benchmarks/bench/scan/inclusive/max.cu b/thrust/benchmarks/bench/scan/inclusive/max.cu index affecbdd9f..40d84942ec 100644 --- a/thrust/benchmarks/bench/scan/inclusive/max.cu +++ b/thrust/benchmarks/bench/scan/inclusive/max.cu @@ -43,7 +43,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + 
state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::inclusive_scan(input.cbegin(), input.cend(), output.begin(), max_t{}); }); } diff --git a/thrust/benchmarks/bench/scan/inclusive/sum.cu b/thrust/benchmarks/bench/scan/inclusive/sum.cu index 540001a234..ea98b7bcf3 100644 --- a/thrust/benchmarks/bench/scan/inclusive/sum.cu +++ b/thrust/benchmarks/bench/scan/inclusive/sum.cu @@ -43,7 +43,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::inclusive_scan(input.cbegin(), input.cend(), output.begin()); }); } diff --git a/thrust/benchmarks/bench/set_operations/base.cuh b/thrust/benchmarks/bench/set_operations/base.cuh index c660d222f0..9f5ab563ac 100644 --- a/thrust/benchmarks/bench/set_operations/base.cuh +++ b/thrust/benchmarks/bench/set_operations/base.cuh @@ -61,7 +61,7 @@ static void basic(nvbench::state &state, nvbench::type_list, OpT op) state.add_global_memory_reads(elements); state.add_global_memory_writes(elements_in_AB); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { op(input.cbegin(), input.cbegin() + elements_in_A, input.cbegin() + elements_in_A, diff --git a/thrust/benchmarks/bench/set_operations/by_key.cuh b/thrust/benchmarks/bench/set_operations/by_key.cuh index ae19b2407e..6e71601f85 100644 --- a/thrust/benchmarks/bench/set_operations/by_key.cuh +++ b/thrust/benchmarks/bench/set_operations/by_key.cuh @@ -70,7 +70,7 @@ static void basic(nvbench::state &state, nvbench::type_list, OpT o state.add_global_memory_reads(OpT::read_all_values ? 
elements : elements_in_A); state.add_global_memory_writes(elements_in_AB); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { op(in_keys.cbegin(), in_keys.cbegin() + elements_in_A, in_keys.cbegin() + elements_in_A, diff --git a/thrust/benchmarks/bench/shuffle/basic.cu b/thrust/benchmarks/bench/shuffle/basic.cu index f70629f2a4..cc24d26785 100644 --- a/thrust/benchmarks/bench/shuffle/basic.cu +++ b/thrust/benchmarks/bench/shuffle/basic.cu @@ -44,7 +44,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_writes(elements); auto do_engine = [&](auto &&engine_constructor) { - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::shuffle(data.begin(), data.end(), engine_constructor()); }); }; diff --git a/thrust/benchmarks/bench/sort/keys.cu b/thrust/benchmarks/bench/sort/keys.cu index f086505fa2..d52dd6e7d1 100644 --- a/thrust/benchmarks/bench/sort/keys.cu +++ b/thrust/benchmarks/bench/sort/keys.cu @@ -58,4 +58,4 @@ NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types)) .set_name("base") .set_type_axes_names({"T{ct}"}) .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) - .add_string_axis("Entropy", {"1.000", "0.811", "0.544", "0.337", "0.201"}); + .add_string_axis("Entropy", {"1.000", "0.201"}); diff --git a/thrust/benchmarks/bench/sort/keys_custom.cu b/thrust/benchmarks/bench/sort/keys_custom.cu index 3728006f68..f1eb8c2fdf 100644 --- a/thrust/benchmarks/bench/sort/keys_custom.cu +++ b/thrust/benchmarks/bench/sort/keys_custom.cu @@ -58,4 +58,4 @@ NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types)) .set_name("base") .set_type_axes_names({"T{ct}"}) .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) - .add_string_axis("Entropy", {"1.000", "0.811", "0.544", "0.337", "0.201"}); + .add_string_axis("Entropy", {"1.000", "0.201"}); diff --git a/thrust/benchmarks/bench/sort/pairs.cu b/thrust/benchmarks/bench/sort/pairs.cu index a6d45e33ed..9d2f06b2f5 100644 --- a/thrust/benchmarks/bench/sort/pairs.cu +++ b/thrust/benchmarks/bench/sort/pairs.cu @@ -59,19 +59,11 @@ static void basic(nvbench::state &state, nvbench::type_list) }); } -using key_types = fundamental_types; -using value_types = nvbench::type_list; +using key_types = integral_types; +using value_types = integral_types; NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(key_types, value_types)) .set_name("base") .set_type_axes_names({"KeyT{ct}", "ValueT{ct}"}) .add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4)) - .add_string_axis("Entropy", {"1.000", "0.811", "0.544", "0.337", "0.201"}); + .add_string_axis("Entropy", {"1.000", "0.201"}); diff --git a/thrust/benchmarks/bench/sort/pairs_custom.cu b/thrust/benchmarks/bench/sort/pairs_custom.cu index 4cd7313989..bb731e03c6 100644 --- a/thrust/benchmarks/bench/sort/pairs_custom.cu +++ b/thrust/benchmarks/bench/sort/pairs_custom.cu @@ -59,19 +59,11 @@ static void basic(nvbench::state &state, nvbench::type_list) }); } -using key_types = fundamental_types; -using value_types = nvbench::type_list; +using key_types = integral_types; +using value_types = integral_types; NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(key_types, value_types)) .set_name("base") .set_type_axes_names({"KeyT{ct}", "ValueT{ct}"}) .add_int64_power_of_two_axis("Elements", 
nvbench::range(16, 28, 4)) - .add_string_axis("Entropy", {"1.000", "0.811", "0.544", "0.337", "0.201"}); + .add_string_axis("Entropy", {"1.000", "0.201"}); diff --git a/thrust/benchmarks/bench/unique/basic.cu b/thrust/benchmarks/bench/unique/basic.cu index 7bef39ecc4..2f01fb3045 100644 --- a/thrust/benchmarks/bench/unique/basic.cu +++ b/thrust/benchmarks/bench/unique/basic.cu @@ -51,7 +51,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(unique_items); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::unique_copy(input.cbegin(), input.cend(), output.begin()); }); } diff --git a/thrust/benchmarks/bench/unique/by_key.cu b/thrust/benchmarks/bench/unique/by_key.cu index e6961bc4d9..ed43e64cb9 100644 --- a/thrust/benchmarks/bench/unique/by_key.cu +++ b/thrust/benchmarks/bench/unique/by_key.cu @@ -56,7 +56,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_global_memory_reads(elements); state.add_global_memory_writes(unique_elements); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::unique_by_key_copy(in_keys.cbegin(), in_keys.cend(), in_vals.cbegin(), diff --git a/thrust/benchmarks/bench/vectorized_search/base.cu b/thrust/benchmarks/bench/vectorized_search/base.cu index a733830fa1..67e9ddedcc 100644 --- a/thrust/benchmarks/bench/vectorized_search/base.cu +++ b/thrust/benchmarks/bench/vectorized_search/base.cu @@ -46,7 +46,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_element_count(needles); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::binary_search(data.begin(), data.begin() + elements, data.begin() + elements, diff --git a/thrust/benchmarks/bench/vectorized_search/lower_bound.cu b/thrust/benchmarks/bench/vectorized_search/lower_bound.cu index 4ab9539215..e3fbd6e6cb 100644 --- a/thrust/benchmarks/bench/vectorized_search/lower_bound.cu +++ b/thrust/benchmarks/bench/vectorized_search/lower_bound.cu @@ -46,7 +46,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_element_count(needles); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::lower_bound(data.begin(), data.begin() + elements, data.begin() + elements, diff --git a/thrust/benchmarks/bench/vectorized_search/upper_bound.cu b/thrust/benchmarks/bench/vectorized_search/upper_bound.cu index 5b57ebf92a..6b412ca299 100644 --- a/thrust/benchmarks/bench/vectorized_search/upper_bound.cu +++ b/thrust/benchmarks/bench/vectorized_search/upper_bound.cu @@ -46,7 +46,7 @@ static void basic(nvbench::state &state, nvbench::type_list) state.add_element_count(needles); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { + state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch & /* launch */) { thrust::upper_bound(data.begin(), data.begin() + elements, data.begin() + elements,