[2.3.x] Backport benchmarking PRs #1168

Merged
merged 17 commits on Dec 1, 2023
9 changes: 9 additions & 0 deletions .github/workflows/build-and-test-linux.yml
@@ -14,9 +14,15 @@ on:
      container_image: {type: string, required: false}
      run_tests: {type: boolean, required: false, default: true}

permissions:
  contents: read

jobs:
  build:
    name: Build ${{inputs.test_name}}
    permissions:
      id-token: write
      contents: read
    uses: ./.github/workflows/run-as-coder.yml
    with:
      name: Build ${{inputs.test_name}}
@@ -27,6 +33,9 @@ jobs:
  test:
    needs: build
    permissions:
      id-token: write
      contents: read
    if: ${{ !cancelled() && ( needs.build.result == 'success' || needs.build.result == 'skipped' ) && inputs.run_tests}}
    name: Test ${{inputs.test_name}}
    uses: ./.github/workflows/run-as-coder.yml
9 changes: 9 additions & 0 deletions .github/workflows/dispatch-build-and-test.yml
@@ -8,12 +8,18 @@ on:
      devcontainer_version: {type: string, required: true}
      is_windows: {type: boolean, required: true}

permissions:
  contents: read

jobs:
  # Using a matrix to dispatch to the build-and-test reusable workflow for each build configuration
  # ensures that the build/test steps can overlap across different configurations. For example,
  # the build step for CUDA 12.1 + gcc 9.3 can run at the same time as the test step for CUDA 11.0 + clang 11.
  build_and_test_linux:
    name: build and test linux
    permissions:
      id-token: write
      contents: read
    if: ${{ !inputs.is_windows }}
    uses: ./.github/workflows/build-and-test-linux.yml
    strategy:
@@ -30,6 +36,9 @@ jobs:

  build_and_test_windows:
    name: build and test windows
    permissions:
      id-token: write
      contents: read
    if: ${{ inputs.is_windows }}
    uses: ./.github/workflows/build-and-test-windows.yml
    strategy:
27 changes: 26 additions & 1 deletion .github/workflows/pr.yml
@@ -30,6 +30,10 @@ concurrency:
  group: ${{ github.workflow }}-on-${{ github.event_name }}-from-${{ github.ref_name }}
  cancel-in-progress: true

permissions:
  contents: read
  pull-requests: read

jobs:
  compute-matrix:
    name: Compute matrix
@@ -53,6 +57,9 @@ jobs:
  nvrtc:
    name: NVRTC CUDA${{matrix.cuda}} C++${{matrix.std}}
    permissions:
      id-token: write
      contents: read
    needs: compute-matrix
    if: ${{ !contains(github.event.head_commit.message, 'skip-tests') }}
    uses: ./.github/workflows/run-as-coder.yml
@@ -69,6 +76,9 @@
  thrust:
    name: Thrust CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }}
    permissions:
      id-token: write
      contents: read
    needs: compute-matrix
    uses: ./.github/workflows/dispatch-build-and-test.yml
    strategy:
@@ -84,6 +94,9 @@

  cub:
    name: CUB CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }}
    permissions:
      id-token: write
      contents: read
    needs: compute-matrix
    uses: ./.github/workflows/dispatch-build-and-test.yml
    strategy:
@@ -99,6 +112,9 @@

  libcudacxx:
    name: libcudacxx CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }}
    permissions:
      id-token: write
      contents: read
    needs: compute-matrix
    uses: ./.github/workflows/dispatch-build-and-test.yml
    strategy:
@@ -114,6 +130,9 @@

  clang-cuda:
    name: ${{matrix.lib}} ${{matrix.cpu}}/CTK${{matrix.cuda}}/clang-cuda
    permissions:
      id-token: write
      contents: read
    needs: compute-matrix
    strategy:
      fail-fast: false
@@ -129,6 +148,9 @@
  cccl-infra:
    name: CCCL Infrastructure
    permissions:
      id-token: write
      contents: read
    needs: compute-matrix
    if: ${{ !contains(github.event.head_commit.message, 'skip-tests') }}
    strategy:
@@ -146,6 +168,9 @@
  verify-devcontainers:
    name: Verify Dev Containers
    permissions:
      id-token: write
      contents: read
    uses: ./.github/workflows/verify-devcontainers.yml

  # This job is the final job that runs after all other jobs and is used for branch protection status checks.
@@ -154,7 +179,7 @@
  ci:
    runs-on: ubuntu-latest
    name: CI
    if: ${{ always() }} # need to use always() instead of !cancelled() because skipped jobs count as success
    needs:
      - clang-cuda
      - cub
14 changes: 1 addition & 13 deletions .github/workflows/project_automation_set_in_progress.yml
@@ -45,8 +45,7 @@ jobs:
    runs-on: ubuntu-latest

    permissions:
      issues: write
      pull-requests: write
      pull-requests: read

    steps:
      - name: Check if changes requested from a reviewer
@@ -56,17 +55,6 @@
          if [ ${{ github.event.review.state }} != 'changes_requested' ]; then
            echo "Changes not requested, exiting"
            exit 0
          # If it is requesting changes, set PR to draft
          # We use the default token here since we're granting write access to the PR
          elif [ ${{ github.event.pull_request.draft }} == false ]; then
            gh api graphql -f query='
              mutation {
                convertPullRequestToDraft(input: {pullRequestId: "${{ env.PR_GLOBAL_ID }}"}) {
                  clientMutationId
                }
              }'
            exit 0
          fi
        continue-on-error: true

8 changes: 6 additions & 2 deletions .github/workflows/run-as-coder.yml
@@ -14,17 +14,21 @@ on:
      command: {type: string, required: true}
      env: { type: string, required: false, default: "" }

permissions:
  contents: read

jobs:
  run-as-coder:
    name: ${{inputs.name}}
    permissions:
      id-token: write
      contents: read
    runs-on: ${{inputs.runner}}
    container:
      options: -u root
      image: ${{inputs.image}}
      env:
        NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
    permissions:
      id-token: write
    steps:
      - name: Checkout repo
        uses: actions/checkout@v3
3 changes: 3 additions & 0 deletions .github/workflows/verify-devcontainers.yml
@@ -7,6 +7,9 @@ defaults:
  run:
    shell: bash -euo pipefail {0}

permissions:
  contents: read

jobs:
  verify-make-devcontainers:
    name: Verify devcontainer files are up-to-date
8 changes: 8 additions & 0 deletions benchmarks/scripts/cccl/bench/bench.py
@@ -635,6 +635,14 @@ def do_run(self, ct_point, rt_values, timeout, is_search=True):
cmd.append("--min-samples")
cmd.append("70")

# Unlike noise, minimal benchmarking time is not directly related to variance.
# Default minimal time is 0.5 seconds. For CI we want to reduce it to 0.1 seconds,
# becuse we have limited time budget. Having smaller minimal time doesn't affect
# stability of sample distribution median in a deterministic way. For small problem sizes,
# 0.1s leads to smaller variation than 0.5s. For other workloads, 0.5 leads to smaller variance.
cmd.append("--min-time")
cmd.append("0.1")

# NVBench is currently broken for multiple GPUs, use `CUDA_VISIBLE_DEVICES`
cmd.append("-d")
cmd.append("0")
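The two flags added above join the nvbench command line that `do_run` assembles. A minimal sketch of the resulting invocation, assuming a hypothetical benchmark binary path (the real path comes from the build tree):

import subprocess

# Sketch only: the binary path below is an assumption for illustration.
cmd = [
    "./bin/cub.bench.reduce.base",  # hypothetical nvbench benchmark binary
    "--min-samples", "70",          # at least 70 samples per measurement
    "--min-time", "0.1",            # CI override of the 0.5s default minimal time
    "-d", "0",                      # pin to device 0; multi-GPU nvbench is broken
]
subprocess.run(cmd, check=True)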
36 changes: 29 additions & 7 deletions benchmarks/scripts/cccl/bench/search.py
@@ -47,19 +47,41 @@ def parse_arguments():
        type=str, help="Parameter in the format `Param=Value`.")
    parser.add_argument(
        '--list-benches', action=argparse.BooleanOptionalAction, help="Show available benchmarks.")
    parser.add_argument('--num-shards', type=int, default=1, help='Split benchmarks into M pieces and only run one')
    parser.add_argument('--run-shard', type=int, default=0, help='Run shard N / M of benchmarks')
    parser.add_argument('-P0', action=argparse.BooleanOptionalAction, help="Run P0 benchmarks (overwrites -R)")
    return parser.parse_args()


def run_benches(benchmarks, sub_space, regex, seeker):
    pattern = re.compile(regex)

    for algname in benchmarks:
        if pattern.match(algname):
def run_benches(algnames, sub_space, seeker):
    for algname in algnames:
        try:
            bench = BaseBench(algname)
            ct_space = bench.ct_workload_space(sub_space)
            rt_values = bench.rt_axes_values(sub_space)
            seeker(algname, ct_space, rt_values)
        except Exception as e:
            print("#### ERROR exception occurred while running {}: '{}'".format(algname, e))


def filter_benchmarks(benchmarks, args):
    if args.run_shard >= args.num_shards:
        raise ValueError('run-shard must be less than num-shards')

    R = args.R
    if args.P0:
        R = '^(?!.*segmented).*(scan|reduce|select|sort).*'

    pattern = re.compile(R)
    algnames = list(filter(lambda x: pattern.match(x), benchmarks.keys()))
    algnames.sort()

    if args.num_shards > 1:
        algnames = np.array_split(algnames, args.num_shards)[args.run_shard].tolist()
        return algnames

    return algnames


def search(seeker):
    args = parse_arguments()
@@ -79,8 +101,8 @@
    if args.list_benches:
        list_benches()
        return

    run_benches(config.benchmarks, workload_sub_space, args.R, seeker)
    run_benches(filter_benchmarks(config.benchmarks, args), workload_sub_space, seeker)


class MedianCenterEstimator:
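To make the new `-P0` filtering and sharding concrete, here is a small sketch of what `filter_benchmarks` does, using made-up benchmark names:

import re

import numpy as np

# The P0 regex from filter_benchmarks: scan/reduce/select/sort benchmarks,
# excluding segmented variants.
P0 = '^(?!.*segmented).*(scan|reduce|select|sort).*'
names = sorted(['cub.bench.histogram.even',       # dropped: not a P0 algorithm
                'cub.bench.reduce.base',
                'cub.bench.scan.exclusive',
                'cub.bench.segmented_sort.keys',  # dropped: segmented variant
                'thrust.bench.sort.basic'])

pattern = re.compile(P0)
p0_names = [n for n in names if pattern.match(n)]
# ['cub.bench.reduce.base', 'cub.bench.scan.exclusive', 'thrust.bench.sort.basic']

# --num-shards 2 splits the sorted list nearly evenly; --run-shard picks one piece.
for run_shard in range(2):
    print(run_shard, np.array_split(p0_names, 2)[run_shard].tolist())
# 0 ['cub.bench.reduce.base', 'cub.bench.scan.exclusive']
# 1 ['thrust.bench.sort.basic']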
32 changes: 27 additions & 5 deletions benchmarks/scripts/run.py
@@ -6,33 +6,55 @@
import cccl.bench


def elapsed_time_look_good(x):
def elapsed_time_looks_good(x):
    if isinstance(x, float):
        if math.isfinite(x):
            return True
    return False


def problem_size_looks_large_enough(elements):
    # Small problem sizes do not utilize the entire GPU.
    # Benchmarking small problem sizes in environments where we do not control
    # the distribution comparison (e.g. CI) is not useful because of stability issues.
    return elements.isdigit() and int(elements) > 20


def filter_runtime_workloads_for_ci(rt_values):
    for subbench in rt_values:
        for axis in rt_values[subbench]:
            if axis.startswith('Elements') and axis.endswith('[pow2]'):
                rt_values[subbench][axis] = list(filter(problem_size_looks_large_enough, rt_values[subbench][axis]))

    return rt_values


class BaseRunner:
    def __init__(self):
        self.estimator = cccl.bench.MedianCenterEstimator()

    def __call__(self, algname, ct_workload_space, rt_values):
        failure_occured = False
        rt_values = filter_runtime_workloads_for_ci(rt_values)

        for ct_workload in ct_workload_space:
            bench = cccl.bench.BaseBench(algname)
            if bench.build():
            if bench.build():  # might throw
                results = bench.run(ct_workload, rt_values, self.estimator, False)
                for subbench in results:
                    for point in results[subbench]:
                        bench_name = "{}.{}-{}".format(bench.algorithm_name(), subbench, point)
                        bench_name = bench_name.replace(' ', '___')
                        bench_name = "".join(c if c.isalnum() else "_" for c in bench_name)
                        elapsed_time = results[subbench][point]
                        if elapsed_time_look_good(elapsed_time):
                        if elapsed_time_looks_good(elapsed_time):
                            print("&&&& PERF {} {} -sec".format(bench_name, elapsed_time))
            else:
                print("&&&& FAILED bench")
                sys.exit(-1)
                failure_occured = True
                print("&&&& FAILED {}".format(algname))

        if failure_occured:
            sys.exit(1)


def main():
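A usage sketch of the CI filtering above, with invented axis values: `[pow2]` axes hold exponents, so '16' and '20' (2^16 and 2^20 elements) fail `problem_size_looks_large_enough` and are dropped, while larger sizes and non-Elements axes pass through:

rt_values = {
    'base': {
        'Elements{io}[pow2]': ['16', '20', '24', '28'],  # hypothetical values
        'Entropy': ['0.201', '1.000'],
    }
}

filtered = filter_runtime_workloads_for_ci(rt_values)
assert filtered['base']['Elements{io}[pow2]'] == ['24', '28']
assert filtered['base']['Entropy'] == ['0.201', '1.000']  # untouched

Each surviving data point is then reported by BaseRunner on a '&&&& PERF <name> <seconds> -sec' line, with the benchmark name reduced to alphanumerics and underscores.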
@@ -100,7 +100,7 @@ void left(nvbench::state& state, nvbench::type_list<T, OffsetT>)
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
std::uint8_t* d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(d_temp_storage,
temp_storage_bytes,
d_in,
9 changes: 4 additions & 5 deletions cub/benchmarks/bench/copy/memcpy.cu
@@ -249,7 +249,7 @@ void copy(nvbench::state &state,
thrust::device_vector<nvbench::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(d_temp_storage,
temp_storage_bytes,
d_input_buffers,
@@ -275,8 +275,8 @@ void uniform(nvbench::state &state, nvbench::type_list<T, OffsetT> tl)
              elements,
              min_buffer_size,
              max_buffer_size,
              state.get_int64("RandomizeInput"),
              state.get_int64("RandomizeOutput"));
              state.get_int64("Randomize"),
              state.get_int64("Randomize"));
}

template <class T, class OffsetT>
@@ -309,8 +309,7 @@ NVBENCH_BENCH_TYPES(uniform, NVBENCH_TYPE_AXES(types, u_offset_types))
  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(25, 29, 2))
  .add_int64_axis("MinBufferSizeRatio", {1, 99})
  .add_int64_axis("MaxBufferSize", {8, 64, 256, 1024, 64 * 1024})
  .add_int64_axis("RandomizeInput", {0, 1})
  .add_int64_axis("RandomizeOutput", {0, 1});
  .add_int64_axis("Randomize", {0, 1});

NVBENCH_BENCH_TYPES(large, NVBENCH_TYPE_AXES(types, u_offset_types))
  .set_name("large")
6 changes: 3 additions & 3 deletions cub/benchmarks/bench/histogram/even.cu
@@ -110,7 +110,7 @@ static void even(nvbench::state &state, nvbench::type_list<SampleT, CounterT, Of
thrust::device_vector<nvbench::uint8_t> tmp(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(tmp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::DispatchEven(d_temp_storage,
temp_storage_bytes,
d_input,
@@ -139,5 +139,5 @@ NVBENCH_BENCH_TYPES(even, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offset
.set_name("base")
.set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_int64_axis("Bins", {32, 64, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "0.544", "1.000"});
.add_int64_axis("Bins", {32, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "1.000"});