Skip to content

Commit

Permalink
[2.3.x] Backport benchmarking PRs (#1168)
Browse files Browse the repository at this point in the history
* Avoid batch benchmarks

* Relax minimal benchmarking time

* Don't benchmark small problem sizes

* Reduce sort workloads

* Split benchmarks into shards

* Fewer histogram benchmarks

* Fewer seg sort benchmarks

* Think about P0 algorithms

* Extra print

* Next round of simplification

* Do not treat segmented problems as P0

* Fewer workloads for memcpy

* Better workload filter

* Do not stop on first bench failure

* Update nvbench

* Remove draft automation + reduce permissions (#1154)

* Set finer-grain workflow permissions (#1163)

* Set workflow read permissions and per-job write permissions.

* set pull-requests read at workflow level.

* Set contents read at job level.

* Explicitly set job-level read permissions.

* Add permissions to verify-devcontainers.

---------

Co-authored-by: Georgy Evtushenko <evtushenko.georgy@gmail.com>
Co-authored-by: Ben Jarmak <104460670+jarmak-nv@users.noreply.github.com>
Co-authored-by: Jake Hemstad <jhemstad@nvidia.com>
  • Loading branch information
4 people authored Dec 1, 2023
1 parent 3dd0c5b commit 4d5c181
Show file tree
Hide file tree
Showing 64 changed files with 219 additions and 140 deletions.
9 changes: 9 additions & 0 deletions .github/workflows/build-and-test-linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,15 @@ on:
container_image: {type: string, required: false}
run_tests: {type: boolean, required: false, default: true}

permissions:
contents: read

jobs:
build:
name: Build ${{inputs.test_name}}
permissions:
id-token: write
contents: read
uses: ./.github/workflows/run-as-coder.yml
with:
name: Build ${{inputs.test_name}}
Expand All @@ -27,6 +33,9 @@ jobs:
test:
needs: build
permissions:
id-token: write
contents: read
if: ${{ !cancelled() && ( needs.build.result == 'success' || needs.build.result == 'skipped' ) && inputs.run_tests}}
name: Test ${{inputs.test_name}}
uses: ./.github/workflows/run-as-coder.yml
Expand Down
9 changes: 9 additions & 0 deletions .github/workflows/dispatch-build-and-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,18 @@ on:
devcontainer_version: {type: string, required: true}
is_windows: {type: boolean, required: true}

permissions:
contents: read

jobs:
# Using a matrix to dispatch to the build-and-test reusable workflow for each build configuration
# ensures that the build/test steps can overlap across different configurations. For example,
# the build step for CUDA 12.1 + gcc 9.3 can run at the same time as the test step for CUDA 11.0 + clang 11.
build_and_test_linux:
name: build and test linux
permissions:
id-token: write
contents: read
if: ${{ !inputs.is_windows }}
uses: ./.github/workflows/build-and-test-linux.yml
strategy:
Expand All @@ -30,6 +36,9 @@ jobs:

build_and_test_windows:
name: build and test windows
permissions:
id-token: write
contents: read
if: ${{ inputs.is_windows }}
uses: ./.github/workflows/build-and-test-windows.yml
strategy:
Expand Down
27 changes: 26 additions & 1 deletion .github/workflows/pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ concurrency:
group: ${{ github.workflow }}-on-${{ github.event_name }}-from-${{ github.ref_name }}
cancel-in-progress: true

permissions:
contents: read
pull-requests: read

jobs:
compute-matrix:
name: Compute matrix
Expand All @@ -53,6 +57,9 @@ jobs:
nvrtc:
name: NVRTC CUDA${{matrix.cuda}} C++${{matrix.std}}
permissions:
id-token: write
contents: read
needs: compute-matrix
if: ${{ !contains(github.event.head_commit.message, 'skip-tests') }}
uses: ./.github/workflows/run-as-coder.yml
Expand All @@ -69,6 +76,9 @@ jobs:
thrust:
name: Thrust CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }}
permissions:
id-token: write
contents: read
needs: compute-matrix
uses: ./.github/workflows/dispatch-build-and-test.yml
strategy:
Expand All @@ -84,6 +94,9 @@ jobs:

cub:
name: CUB CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }}
permissions:
id-token: write
contents: read
needs: compute-matrix
uses: ./.github/workflows/dispatch-build-and-test.yml
strategy:
Expand All @@ -99,6 +112,9 @@ jobs:

libcudacxx:
name: libcudacxx CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }}
permissions:
id-token: write
contents: read
needs: compute-matrix
uses: ./.github/workflows/dispatch-build-and-test.yml
strategy:
Expand All @@ -114,6 +130,9 @@ jobs:

clang-cuda:
name: ${{matrix.lib}} ${{matrix.cpu}}/CTK${{matrix.cuda}}/clang-cuda
permissions:
id-token: write
contents: read
needs: compute-matrix
strategy:
fail-fast: false
Expand All @@ -129,6 +148,9 @@ jobs:
cccl-infra:
name: CCCL Infrastructure
permissions:
id-token: write
contents: read
needs: compute-matrix
if: ${{ !contains(github.event.head_commit.message, 'skip-tests') }}
strategy:
Expand All @@ -146,6 +168,9 @@ jobs:
verify-devcontainers:
name: Verify Dev Containers
permissions:
id-token: write
contents: read
uses: ./.github/workflows/verify-devcontainers.yml

# This job is the final job that runs after all other jobs and is used for branch protection status checks.
Expand All @@ -154,7 +179,7 @@ jobs:
ci:
runs-on: ubuntu-latest
name: CI
if: ${{ always() }} # need to use always() instead of !cancelled() because skipped jobs count as success
if: ${{ always() }} # need to use always() instead of !cancelled() because skipped jobs count as success
needs:
- clang-cuda
- cub
Expand Down
14 changes: 1 addition & 13 deletions .github/workflows/project_automation_set_in_progress.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,7 @@ jobs:
runs-on: ubuntu-latest

permissions:
issues: write
pull-requests: write
pull-requests: read

steps:
- name: Check if changes requested from a reviewer
Expand All @@ -56,17 +55,6 @@ jobs:
if [ ${{ github.event.review.state }} != 'changes_requested' ]; then
echo "Changes not requested, exiting"
exit 0
# If it is requesting changes, set PR to draft
# We use the default token here since we're granting write access to the PR
elif [ ${{ github.event.pull_request.draft }} == false ]; then
gh api graphql -f query='
mutation {
convertPullRequestToDraft(input: {pullRequestId: "${{ env.PR_GLOBAL_ID }}"}) {
clientMutationId
}
}'
exit 0
fi
continue-on-error: true

Expand Down
8 changes: 6 additions & 2 deletions .github/workflows/run-as-coder.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,21 @@ on:
command: {type: string, required: true}
env: { type: string, required: false, default: "" }

permissions:
contents: read

jobs:
run-as-coder:
name: ${{inputs.name}}
permissions:
id-token: write
contents: read
runs-on: ${{inputs.runner}}
container:
options: -u root
image: ${{inputs.image}}
env:
NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
permissions:
id-token: write
steps:
- name: Checkout repo
uses: actions/checkout@v3
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/verify-devcontainers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ defaults:
run:
shell: bash -euo pipefail {0}

permissions:
contents: read

jobs:
verify-make-devcontainers:
name: Verify devcontainer files are up-to-date
Expand Down
8 changes: 8 additions & 0 deletions benchmarks/scripts/cccl/bench/bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -635,6 +635,14 @@ def do_run(self, ct_point, rt_values, timeout, is_search=True):
cmd.append("--min-samples")
cmd.append("70")

# Unlike noise, minimal benchmarking time is not directly related to variance.
# Default minimal time is 0.5 seconds. For CI we want to reduce it to 0.1 seconds,
# because we have a limited time budget. Having a smaller minimal time doesn't affect
# stability of sample distribution median in a deterministic way. For small problem sizes,
# 0.1s leads to smaller variation than 0.5s. For other workloads, 0.5s leads to smaller variance.
cmd.append("--min-time")
cmd.append("0.1")

# NVBench is currently broken for multiple GPUs, use `CUDA_VISIBLE_DEVICES`
cmd.append("-d")
cmd.append("0")
Expand Down
36 changes: 29 additions & 7 deletions benchmarks/scripts/cccl/bench/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,19 +47,41 @@ def parse_arguments():
type=str, help="Parameter in the format `Param=Value`.")
parser.add_argument(
'--list-benches', action=argparse.BooleanOptionalAction, help="Show available benchmarks.")
parser.add_argument('--num-shards', type=int, default=1, help='Split benchmarks into M pieces and only run one')
parser.add_argument('--run-shard', type=int, default=0, help='Run shard N / M of benchmarks')
parser.add_argument('-P0', action=argparse.BooleanOptionalAction, help="Run P0 benchmarks (overwrites -R)")
return parser.parse_args()


def run_benches(algnames, sub_space, seeker):
    """Run the seeker over every benchmark in `algnames`.

    Exceptions from an individual benchmark are reported and swallowed so
    that one broken benchmark does not abort the rest of the sweep.

    Args:
        algnames: iterable of benchmark algorithm names to run.
        sub_space: workload sub-space restricting the axes to explore.
        seeker: callable invoked as seeker(algname, ct_space, rt_values).
    """
    for algname in algnames:
        try:
            bench = BaseBench(algname)
            ct_space = bench.ct_workload_space(sub_space)
            rt_values = bench.rt_axes_values(sub_space)
            seeker(algname, ct_space, rt_values)
        except Exception as e:
            # Do not stop on the first benchmark failure; report and continue.
            print("#### ERROR exception occured while running {}: '{}'".format(algname, e))


def filter_benchmarks(benchmarks, args):
    """Select and order the benchmarks to run for this invocation.

    Applies the `-R` regex filter (replaced by a canned pattern when `-P0`
    is set), sorts the matching names for a deterministic order, and, when
    sharding is requested, keeps only this shard's slice.

    Args:
        benchmarks: mapping of benchmark name -> benchmark description.
        args: parsed CLI arguments (uses R, P0, num_shards, run_shard).

    Returns:
        Sorted list of benchmark names belonging to this shard.

    Raises:
        ValueError: if run-shard is out of range for num-shards.
    """
    if args.run_shard >= args.num_shards:
        raise ValueError('run-shard must be less than num-shards')

    R = args.R
    if args.P0:
        # P0: core algorithms only; segmented variants are deliberately excluded.
        R = '^(?!.*segmented).*(scan|reduce|select|sort).*'

    pattern = re.compile(R)
    algnames = sorted(name for name in benchmarks.keys() if pattern.match(name))

    if args.num_shards > 1:
        # Deterministic split: every shard computes the same global ordering
        # and picks its own piece.
        algnames = np.array_split(algnames, args.num_shards)[args.run_shard].tolist()

    return algnames


def search(seeker):
args = parse_arguments()
Expand All @@ -79,8 +101,8 @@ def search(seeker):
if args.list_benches:
list_benches()
return

run_benches(config.benchmarks, workload_sub_space, args.R, seeker)
run_benches(filter_benchmarks(config.benchmarks, args), workload_sub_space, seeker)


class MedianCenterEstimator:
Expand Down
32 changes: 27 additions & 5 deletions benchmarks/scripts/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,33 +6,55 @@
import cccl.bench


def elapsed_time_looks_good(x):
    """Return True iff `x` is a usable elapsed-time sample: a finite float.

    Non-float values (including ints and strings) and NaN/inf are rejected.
    """
    return isinstance(x, float) and math.isfinite(x)


def problem_size_looks_large_enough(elements):
    """Return True for numeric strings strictly greater than 20.

    The values are power-of-two exponents (the caller applies this to
    'Elements...[pow2]' axes), so this keeps only problems above 2^20
    elements. Small problems do not utilize the entire GPU, and
    benchmarking them in environments where we do not control the
    distributions comparison (e.g. CI) is too unstable to be useful.
    """
    if not elements.isdigit():
        return False
    return int(elements) > 20


def filter_runtime_workloads_for_ci(rt_values):
    """Prune too-small problem sizes from every 'Elements...[pow2]' axis.

    Mutates `rt_values` in place and also returns it for convenience.
    """
    for axes in rt_values.values():
        for axis_name in axes:
            if axis_name.startswith('Elements') and axis_name.endswith('[pow2]'):
                axes[axis_name] = [v for v in axes[axis_name]
                                   if problem_size_looks_large_enough(v)]
    return rt_values


class BaseRunner:
    """Runs benchmarks for CI and prints results in '&&&& PERF' format."""

    def __init__(self):
        # Median is robust to the outliers expected in shared CI machines.
        self.estimator = cccl.bench.MedianCenterEstimator()

    def __call__(self, algname, ct_workload_space, rt_values):
        """Build and run `algname` for every compile-time workload.

        Prints one PERF line per valid measurement. A build failure is
        reported but does not stop the remaining workloads; the process
        exits non-zero at the end if any failure occurred.
        """
        failure_occurred = False
        # CI cannot control the measurement environment, so drop problem
        # sizes that are too small to benchmark stably.
        rt_values = filter_runtime_workloads_for_ci(rt_values)

        for ct_workload in ct_workload_space:
            bench = cccl.bench.BaseBench(algname)
            if bench.build():  # might throw
                results = bench.run(ct_workload, rt_values, self.estimator, False)
                for subbench in results:
                    for point in results[subbench]:
                        # Sanitize the name so downstream tooling can parse it.
                        bench_name = "{}.{}-{}".format(bench.algorithm_name(), subbench, point)
                        bench_name = bench_name.replace(' ', '___')
                        bench_name = "".join(c if c.isalnum() else "_" for c in bench_name)
                        elapsed_time = results[subbench][point]
                        if elapsed_time_looks_good(elapsed_time):
                            print("&&&& PERF {} {} -sec".format(bench_name, elapsed_time))
            else:
                # Do not stop on the first bench failure; keep running.
                failure_occurred = True
                print("&&&& FAILED {}".format(algname))

        if failure_occurred:
            sys.exit(1)


def main():
Expand Down
2 changes: 1 addition & 1 deletion cub/benchmarks/bench/adjacent_difference/subtract_left.cu
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ void left(nvbench::state& state, nvbench::type_list<T, OffsetT>)
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
std::uint8_t* d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(d_temp_storage,
temp_storage_bytes,
d_in,
Expand Down
9 changes: 4 additions & 5 deletions cub/benchmarks/bench/copy/memcpy.cu
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,7 @@ void copy(nvbench::state &state,
thrust::device_vector<nvbench::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(d_temp_storage,
temp_storage_bytes,
d_input_buffers,
Expand All @@ -275,8 +275,8 @@ void uniform(nvbench::state &state, nvbench::type_list<T, OffsetT> tl)
elements,
min_buffer_size,
max_buffer_size,
state.get_int64("RandomizeInput"),
state.get_int64("RandomizeOutput"));
state.get_int64("Randomize"),
state.get_int64("Randomize"));
}

template <class T, class OffsetT>
Expand Down Expand Up @@ -309,8 +309,7 @@ NVBENCH_BENCH_TYPES(uniform, NVBENCH_TYPE_AXES(types, u_offset_types))
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(25, 29, 2))
.add_int64_axis("MinBufferSizeRatio", {1, 99})
.add_int64_axis("MaxBufferSize", {8, 64, 256, 1024, 64 * 1024})
.add_int64_axis("RandomizeInput", {0, 1})
.add_int64_axis("RandomizeOutput", {0, 1});
.add_int64_axis("Randomize", {0, 1});

NVBENCH_BENCH_TYPES(large, NVBENCH_TYPE_AXES(types, u_offset_types))
.set_name("large")
Expand Down
6 changes: 3 additions & 3 deletions cub/benchmarks/bench/histogram/even.cu
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ static void even(nvbench::state &state, nvbench::type_list<SampleT, CounterT, Of
thrust::device_vector<nvbench::uint8_t> tmp(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(tmp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::DispatchEven(d_temp_storage,
temp_storage_bytes,
d_input,
Expand Down Expand Up @@ -139,5 +139,5 @@ NVBENCH_BENCH_TYPES(even, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offset
.set_name("base")
.set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_int64_axis("Bins", {32, 64, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "0.544", "1.000"});
.add_int64_axis("Bins", {32, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "1.000"});
Loading

0 comments on commit 4d5c181

Please sign in to comment.