[2.3.x] Backport benchmarking PRs #1168

Merged
merged 17 commits on Dec 1, 2023
9 changes: 9 additions & 0 deletions .github/workflows/build-and-test-linux.yml
@@ -14,9 +14,15 @@ on:
      container_image: {type: string, required: false}
      run_tests: {type: boolean, required: false, default: true}

permissions:
  contents: read

jobs:
  build:
    name: Build ${{inputs.test_name}}
    permissions:
      id-token: write
      contents: read
    uses: ./.github/workflows/run-as-coder.yml
    with:
      name: Build ${{inputs.test_name}}
@@ -27,6 +33,9 @@ jobs:
  test:
    needs: build
    permissions:
      id-token: write
      contents: read
    if: ${{ !cancelled() && ( needs.build.result == 'success' || needs.build.result == 'skipped' ) && inputs.run_tests}}
    name: Test ${{inputs.test_name}}
    uses: ./.github/workflows/run-as-coder.yml
9 changes: 9 additions & 0 deletions .github/workflows/dispatch-build-and-test.yml
@@ -8,12 +8,18 @@ on:
      devcontainer_version: {type: string, required: true}
      is_windows: {type: boolean, required: true}

permissions:
  contents: read

jobs:
  # Using a matrix to dispatch to the build-and-test reusable workflow for each build configuration
  # ensures that the build/test steps can overlap across different configurations. For example,
  # the build step for CUDA 12.1 + gcc 9.3 can run at the same time as the test step for CUDA 11.0 + clang 11.
  build_and_test_linux:
    name: build and test linux
    permissions:
      id-token: write
      contents: read
    if: ${{ !inputs.is_windows }}
    uses: ./.github/workflows/build-and-test-linux.yml
    strategy:
@@ -30,6 +36,9 @@ jobs:

  build_and_test_windows:
    name: build and test windows
    permissions:
      id-token: write
      contents: read
    if: ${{ inputs.is_windows }}
    uses: ./.github/workflows/build-and-test-windows.yml
    strategy:
27 changes: 26 additions & 1 deletion .github/workflows/pr.yml
@@ -30,6 +30,10 @@ concurrency:
  group: ${{ github.workflow }}-on-${{ github.event_name }}-from-${{ github.ref_name }}
  cancel-in-progress: true

permissions:
  contents: read
  pull-requests: read

jobs:
  compute-matrix:
    name: Compute matrix
@@ -53,6 +57,9 @@ jobs:
  nvrtc:
    name: NVRTC CUDA${{matrix.cuda}} C++${{matrix.std}}
    permissions:
      id-token: write
      contents: read
    needs: compute-matrix
    if: ${{ !contains(github.event.head_commit.message, 'skip-tests') }}
    uses: ./.github/workflows/run-as-coder.yml
@@ -69,6 +76,9 @@
  thrust:
    name: Thrust CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }}
    permissions:
      id-token: write
      contents: read
    needs: compute-matrix
    uses: ./.github/workflows/dispatch-build-and-test.yml
    strategy:
@@ -84,6 +94,9 @@

  cub:
    name: CUB CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }}
    permissions:
      id-token: write
      contents: read
    needs: compute-matrix
    uses: ./.github/workflows/dispatch-build-and-test.yml
    strategy:
@@ -99,6 +112,9 @@

  libcudacxx:
    name: libcudacxx CUDA${{ matrix.cuda_version }} ${{ matrix.compiler }}
    permissions:
      id-token: write
      contents: read
    needs: compute-matrix
    uses: ./.github/workflows/dispatch-build-and-test.yml
    strategy:
@@ -114,6 +130,9 @@

  clang-cuda:
    name: ${{matrix.lib}} ${{matrix.cpu}}/CTK${{matrix.cuda}}/clang-cuda
    permissions:
      id-token: write
      contents: read
    needs: compute-matrix
    strategy:
      fail-fast: false
@@ -129,6 +148,9 @@
  cccl-infra:
    name: CCCL Infrastructure
    permissions:
      id-token: write
      contents: read
    needs: compute-matrix
    if: ${{ !contains(github.event.head_commit.message, 'skip-tests') }}
    strategy:
@@ -146,6 +168,9 @@
  verify-devcontainers:
    name: Verify Dev Containers
    permissions:
      id-token: write
      contents: read
    uses: ./.github/workflows/verify-devcontainers.yml

  # This job is the final job that runs after all other jobs and is used for branch protection status checks.
@@ -154,7 +179,7 @@
  ci:
    runs-on: ubuntu-latest
    name: CI
    if: ${{ always() }} # need to use always() instead of !cancelled() because skipped jobs count as success
    needs:
      - clang-cuda
      - cub
14 changes: 1 addition & 13 deletions .github/workflows/project_automation_set_in_progress.yml
@@ -45,8 +45,7 @@ jobs:
    runs-on: ubuntu-latest

    permissions:
      issues: write
      pull-requests: write
      pull-requests: read

    steps:
      - name: Check if changes requested from a reviewer
@@ -56,17 +55,6 @@
          if [ ${{ github.event.review.state }} != 'changes_requested' ]; then
            echo "Changes not requested, exiting"
            exit 0
          # If it is requesting changes, set PR to draft
          # We use the default token here since we're granting write access to the PR
          elif [ ${{ github.event.pull_request.draft }} == false ]; then
            gh api graphql -f query='
              mutation {
                convertPullRequestToDraft(input: {pullRequestId: "${{ env.PR_GLOBAL_ID }}"}) {
                  clientMutationId
                }
              }'
            exit 0
          fi
        continue-on-error: true

8 changes: 6 additions & 2 deletions .github/workflows/run-as-coder.yml
@@ -14,17 +14,21 @@ on:
      command: {type: string, required: true}
      env: { type: string, required: false, default: "" }

permissions:
  contents: read

jobs:
  run-as-coder:
    name: ${{inputs.name}}
    permissions:
      id-token: write
      contents: read
    runs-on: ${{inputs.runner}}
    container:
      options: -u root
      image: ${{inputs.image}}
      env:
        NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
    permissions:
      id-token: write
    steps:
      - name: Checkout repo
        uses: actions/checkout@v3
3 changes: 3 additions & 0 deletions .github/workflows/verify-devcontainers.yml
@@ -7,6 +7,9 @@ defaults:
  run:
    shell: bash -euo pipefail {0}

permissions:
  contents: read

jobs:
  verify-make-devcontainers:
    name: Verify devcontainer files are up-to-date
8 changes: 8 additions & 0 deletions benchmarks/scripts/cccl/bench/bench.py
@@ -635,6 +635,14 @@ def do_run(self, ct_point, rt_values, timeout, is_search=True):
cmd.append("--min-samples")
cmd.append("70")

# Unlike noise, minimal benchmarking time is not directly related to variance.
# Default minimal time is 0.5 seconds. For CI we want to reduce it to 0.1 seconds,
# becuse we have limited time budget. Having smaller minimal time doesn't affect
# stability of sample distribution median in a deterministic way. For small problem sizes,
# 0.1s leads to smaller variation than 0.5s. For other workloads, 0.5 leads to smaller variance.
cmd.append("--min-time")
cmd.append("0.1")

# NVBench is currently broken for multiple GPUs, use `CUDA_VISIBLE_DEVICES`
cmd.append("-d")
cmd.append("0")
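The two flags added above join the nvbench command line that `do_run` assembles. A minimal sketch of the resulting invocation, assuming a hypothetical benchmark binary path (the real path comes from the build tree):

import subprocess

# Sketch only: the binary path below is an assumption for illustration.
cmd = [
    "./bin/cub.bench.reduce.base",  # hypothetical nvbench benchmark binary
    "--min-samples", "70",          # at least 70 samples per measurement
    "--min-time", "0.1",            # CI override of the 0.5s default minimal time
    "-d", "0",                      # pin to device 0; multi-GPU nvbench is broken
]
subprocess.run(cmd, check=True)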
36 changes: 29 additions & 7 deletions benchmarks/scripts/cccl/bench/search.py
@@ -47,19 +47,41 @@ def parse_arguments():
        type=str, help="Parameter in the format `Param=Value`.")
    parser.add_argument(
        '--list-benches', action=argparse.BooleanOptionalAction, help="Show available benchmarks.")
    parser.add_argument('--num-shards', type=int, default=1, help='Split benchmarks into M pieces and only run one')
    parser.add_argument('--run-shard', type=int, default=0, help='Run shard N / M of benchmarks')
    parser.add_argument('-P0', action=argparse.BooleanOptionalAction, help="Run P0 benchmarks (overwrites -R)")
    return parser.parse_args()


def run_benches(benchmarks, sub_space, regex, seeker):
    pattern = re.compile(regex)

    for algname in benchmarks:
        if pattern.match(algname):
def run_benches(algnames, sub_space, seeker):
    for algname in algnames:
        try:
            bench = BaseBench(algname)
            ct_space = bench.ct_workload_space(sub_space)
            rt_values = bench.rt_axes_values(sub_space)
            seeker(algname, ct_space, rt_values)
        except Exception as e:
            print("#### ERROR exception occurred while running {}: '{}'".format(algname, e))


def filter_benchmarks(benchmarks, args):
    if args.run_shard >= args.num_shards:
        raise ValueError('run-shard must be less than num-shards')

    R = args.R
    if args.P0:
        R = '^(?!.*segmented).*(scan|reduce|select|sort).*'

    pattern = re.compile(R)
    algnames = list(filter(lambda x: pattern.match(x), benchmarks.keys()))
    algnames.sort()

    if args.num_shards > 1:
        algnames = np.array_split(algnames, args.num_shards)[args.run_shard].tolist()
        return algnames

    return algnames


def search(seeker):
    args = parse_arguments()
@@ -79,8 +101,8 @@
    if args.list_benches:
        list_benches()
        return

    run_benches(config.benchmarks, workload_sub_space, args.R, seeker)
    run_benches(filter_benchmarks(config.benchmarks, args), workload_sub_space, seeker)


class MedianCenterEstimator:
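To make the new `-P0` filtering and sharding concrete, here is a small sketch of what `filter_benchmarks` does, using made-up benchmark names:

import re

import numpy as np

# The P0 regex from filter_benchmarks: scan/reduce/select/sort benchmarks,
# excluding segmented variants.
P0 = '^(?!.*segmented).*(scan|reduce|select|sort).*'
names = sorted(['cub.bench.histogram.even',       # dropped: not a P0 algorithm
                'cub.bench.reduce.base',
                'cub.bench.scan.exclusive',
                'cub.bench.segmented_sort.keys',  # dropped: segmented variant
                'thrust.bench.sort.basic'])

pattern = re.compile(P0)
p0_names = [n for n in names if pattern.match(n)]
# ['cub.bench.reduce.base', 'cub.bench.scan.exclusive', 'thrust.bench.sort.basic']

# --num-shards 2 splits the sorted list nearly evenly; --run-shard picks one piece.
for run_shard in range(2):
    print(run_shard, np.array_split(p0_names, 2)[run_shard].tolist())
# 0 ['cub.bench.reduce.base', 'cub.bench.scan.exclusive']
# 1 ['thrust.bench.sort.basic']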
32 changes: 27 additions & 5 deletions benchmarks/scripts/run.py
@@ -6,33 +6,55 @@
import cccl.bench


def elapsed_time_look_good(x):
def elapsed_time_looks_good(x):
    if isinstance(x, float):
        if math.isfinite(x):
            return True
    return False


def problem_size_looks_large_enough(elements):
    # Small problem sizes do not utilize the entire GPU.
    # Benchmarking small problem sizes in environments where we do not control
    # the distribution comparison (e.g. CI) is not useful because of stability issues.
    return elements.isdigit() and int(elements) > 20


def filter_runtime_workloads_for_ci(rt_values):
    for subbench in rt_values:
        for axis in rt_values[subbench]:
            if axis.startswith('Elements') and axis.endswith('[pow2]'):
                rt_values[subbench][axis] = list(filter(problem_size_looks_large_enough, rt_values[subbench][axis]))

    return rt_values


class BaseRunner:
    def __init__(self):
        self.estimator = cccl.bench.MedianCenterEstimator()

    def __call__(self, algname, ct_workload_space, rt_values):
        failure_occured = False
        rt_values = filter_runtime_workloads_for_ci(rt_values)

        for ct_workload in ct_workload_space:
            bench = cccl.bench.BaseBench(algname)
            if bench.build():
            if bench.build():  # might throw
                results = bench.run(ct_workload, rt_values, self.estimator, False)
                for subbench in results:
                    for point in results[subbench]:
                        bench_name = "{}.{}-{}".format(bench.algorithm_name(), subbench, point)
                        bench_name = bench_name.replace(' ', '___')
                        bench_name = "".join(c if c.isalnum() else "_" for c in bench_name)
                        elapsed_time = results[subbench][point]
                        if elapsed_time_look_good(elapsed_time):
                        if elapsed_time_looks_good(elapsed_time):
                            print("&&&& PERF {} {} -sec".format(bench_name, elapsed_time))
            else:
                print("&&&& FAILED bench")
                sys.exit(-1)
                failure_occured = True
                print("&&&& FAILED {}".format(algname))

        if failure_occured:
            sys.exit(1)


def main():
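A usage sketch of the CI filtering above, with invented axis values: `[pow2]` axes hold exponents, so '16' and '20' (2^16 and 2^20 elements) fail `problem_size_looks_large_enough` and are dropped, while larger sizes and non-Elements axes pass through:

rt_values = {
    'base': {
        'Elements{io}[pow2]': ['16', '20', '24', '28'],  # hypothetical values
        'Entropy': ['0.201', '1.000'],
    }
}

filtered = filter_runtime_workloads_for_ci(rt_values)
assert filtered['base']['Elements{io}[pow2]'] == ['24', '28']
assert filtered['base']['Entropy'] == ['0.201', '1.000']  # untouched

Each surviving data point is then reported by BaseRunner on a '&&&& PERF <name> <seconds> -sec' line, with the benchmark name reduced to alphanumerics and underscores.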
@@ -100,7 +100,7 @@ void left(nvbench::state& state, nvbench::type_list<T, OffsetT>)
thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
std::uint8_t* d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(d_temp_storage,
temp_storage_bytes,
d_in,
9 changes: 4 additions & 5 deletions cub/benchmarks/bench/copy/memcpy.cu
@@ -249,7 +249,7 @@ void copy(nvbench::state &state,
thrust::device_vector<nvbench::uint8_t> temp_storage(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch | nvbench::exec_tag::sync, [&](nvbench::launch &launch) {
dispatch_t::Dispatch(d_temp_storage,
temp_storage_bytes,
d_input_buffers,
@@ -275,8 +275,8 @@ void uniform(nvbench::state &state, nvbench::type_list<T, OffsetT> tl)
              elements,
              min_buffer_size,
              max_buffer_size,
              state.get_int64("RandomizeInput"),
              state.get_int64("RandomizeOutput"));
              state.get_int64("Randomize"),
              state.get_int64("Randomize"));
}

template <class T, class OffsetT>
@@ -309,8 +309,7 @@ NVBENCH_BENCH_TYPES(uniform, NVBENCH_TYPE_AXES(types, u_offset_types))
  .add_int64_power_of_two_axis("Elements{io}", nvbench::range(25, 29, 2))
  .add_int64_axis("MinBufferSizeRatio", {1, 99})
  .add_int64_axis("MaxBufferSize", {8, 64, 256, 1024, 64 * 1024})
  .add_int64_axis("RandomizeInput", {0, 1})
  .add_int64_axis("RandomizeOutput", {0, 1});
  .add_int64_axis("Randomize", {0, 1});

NVBENCH_BENCH_TYPES(large, NVBENCH_TYPE_AXES(types, u_offset_types))
  .set_name("large")
6 changes: 3 additions & 3 deletions cub/benchmarks/bench/histogram/even.cu
@@ -110,7 +110,7 @@ static void even(nvbench::state &state, nvbench::type_list<SampleT, CounterT, Of
thrust::device_vector<nvbench::uint8_t> tmp(temp_storage_bytes);
d_temp_storage = thrust::raw_pointer_cast(tmp.data());

state.exec([&](nvbench::launch &launch) {
state.exec(nvbench::exec_tag::no_batch, [&](nvbench::launch &launch) {
dispatch_t::DispatchEven(d_temp_storage,
temp_storage_bytes,
d_input,
@@ -139,5 +139,5 @@ NVBENCH_BENCH_TYPES(even, NVBENCH_TYPE_AXES(sample_types, bin_types, some_offset
.set_name("base")
.set_type_axes_names({"SampleT{ct}", "BinT{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_int64_axis("Bins", {32, 64, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "0.544", "1.000"});
.add_int64_axis("Bins", {32, 128, 2048, 2097152})
.add_string_axis("Entropy", {"0.201", "1.000"});