Merge remote-tracking branch 'upstream/branch-23.08' into feat/cuda12…

…_nvidia_build
rapidsai · Jul 18, 2023 · 86ebcdc · 86ebcdc
2 parents ad91c38 + 32e6e51
commit 86ebcdc
Show file tree

Hide file tree

Showing 106 changed files with 7,276 additions and 1,162 deletions.
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
@@ -30,6 +30,7 @@ jobs:
     secrets: inherit
     uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@cuda-120
     with:
+      matrix_filter: map(select(.CUDA_VER | startswith("11")))
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
       date: ${{ inputs.date }}
@@ -39,6 +40,7 @@ jobs:
     secrets: inherit
     uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-120
     with:
+      matrix_filter: map(select(.CUDA_VER | startswith("11")))
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
       date: ${{ inputs.date }}
@@ -61,7 +63,11 @@ jobs:
       build_type: branch
       node_type: "gpu-latest-1"
       arch: "amd64"
+      branch: ${{ inputs.branch }}
+      build_type: ${{ inputs.build_type || 'branch' }}
       container_image: "rapidsai/ci:latest"
+      date: ${{ inputs.date }}
+      node_type: "gpu-v100-latest-1"
       run_script: "ci/build_docs.sh"
   wheel-build-pylibcugraph:
     secrets: inherit

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
@@ -35,25 +35,29 @@ jobs:
     secrets: inherit
     uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@cuda-120
     with:
+      matrix_filter: map(select(.CUDA_VER | startswith("11")))
       build_type: pull-request
       node_type: cpu16
   conda-cpp-tests:
     needs: conda-cpp-build
     secrets: inherit
     uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-120
     with:
+      matrix_filter: map(select(.CUDA_VER | startswith("11")))
       build_type: pull-request
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
     uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-120
     with:
+      matrix_filter: map(select(.CUDA_VER | startswith("11")))
       build_type: pull-request
   conda-python-tests:
     needs: conda-python-build
     secrets: inherit
     uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120
     with:
+      matrix_filter: map(select(.CUDA_VER | startswith("11")))
       build_type: pull-request
   conda-notebook-tests:
     needs: conda-python-build
@@ -63,7 +67,7 @@ jobs:
       build_type: pull-request
       node_type: "gpu-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci:latest"
+      container_image: "rapidsai/ci:cuda11.8.0-ubuntu22.04-py3.10"
       run_script: "ci/test_notebooks.sh"
   docs-build:
     needs: conda-python-build
@@ -73,7 +77,7 @@ jobs:
       build_type: pull-request
       node_type: "gpu-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci:latest"
+      container_image: "rapidsai/ci:cuda11.8.0-ubuntu22.04-py3.10"
       run_script: "ci/build_docs.sh"
   wheel-build-pylibcugraph:
     needs: checks
@@ -97,7 +101,7 @@ jobs:
       package-name: pylibcugraph
       # On arm also need to install cupy from the specific webpage.
       test-before-arm64: "pip install 'cupy-cuda11x<12.0.0' -f https://pip.cupy.dev/aarch64"
-      test-unittest: "RAPIDS_DATASET_ROOT_DIR=./datasets pytest -v ./python/pylibcugraph/pylibcugraph/tests"
+      test-unittest: "RAPIDS_DATASET_ROOT_DIR=./datasets python -m pytest -v ./python/pylibcugraph/pylibcugraph/tests"
       test-smoketest: "python ci/wheel_smoke_test_pylibcugraph.py"
   wheel-build-cugraph:
     needs: wheel-tests-pylibcugraph
@@ -124,6 +128,6 @@ jobs:
       test-before-amd64: "cd ./datasets && bash ./get_test_data.sh && cd - && RAPIDS_PY_WHEEL_NAME=pylibcugraph_cu11 rapids-download-wheels-from-s3 ./local-pylibcugraph-dep && pip install --no-deps ./local-pylibcugraph-dep/*.whl && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.06"
       # Skip dataset downloads on arm to save CI time -- arm only runs smoke tests.
       # On arm also need to install cupy from the specific site.
-      test-before-arm64: "RAPIDS_PY_WHEEL_NAME=pylibcugraph_cu11 rapids-download-wheels-from-s3 ./local-pylibcugraph-dep && pip install --no-deps ./local-pylibcugraph-dep/*.whl && pip install 'cupy-cuda11x<12.0.0' -f https://pip.cupy.dev/aarch64 && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.06"
-      test-unittest: "RAPIDS_DATASET_ROOT_DIR=/__w/cugraph/cugraph/datasets pytest -v -m sg ./python/cugraph/cugraph/tests"
+      test-before-arm64: "RAPIDS_PY_WHEEL_NAME=pylibcugraph_${{ '${PIP_CU_VERSION}' }} rapids-download-wheels-from-s3 ./local-pylibcugraph-dep && pip install --no-deps ./local-pylibcugraph-dep/*.whl && pip install 'cupy-cuda11x<12.0.0' -f https://pip.cupy.dev/aarch64 && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.08"
+      test-unittest: "RAPIDS_DATASET_ROOT_DIR=/__w/cugraph/cugraph/datasets python -m pytest -v -m sg ./python/cugraph/cugraph/tests"
       test-smoketest: "python ci/wheel_smoke_test_cugraph.py"
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
@@ -18,6 +18,7 @@ jobs:
     secrets: inherit
     uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-120
     with:
+      matrix_filter: map(select(.CUDA_VER | startswith("11")))
       build_type: nightly
       branch: ${{ inputs.branch }}
       date: ${{ inputs.date }}
@@ -26,6 +27,7 @@ jobs:
     secrets: inherit
     uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120
     with:
+      matrix_filter: map(select(.CUDA_VER | startswith("11")))
       build_type: nightly
       branch: ${{ inputs.branch }}
       date: ${{ inputs.date }}
@@ -41,7 +43,7 @@ jobs:
       package-name: pylibcugraph
       # On arm also need to install cupy from the specific webpage.
       test-before-arm64: "pip install 'cupy-cuda11x<12.0.0' -f https://pip.cupy.dev/aarch64"
-      test-unittest: "RAPIDS_DATASET_ROOT_DIR=./datasets pytest -v ./python/pylibcugraph/pylibcugraph/tests"
+      test-unittest: "RAPIDS_DATASET_ROOT_DIR=./datasets python -m pytest -v ./python/pylibcugraph/pylibcugraph/tests"
   wheel-tests-cugraph:
     secrets: inherit
     uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@cuda-120
@@ -52,7 +54,7 @@ jobs:
       sha: ${{ inputs.sha }}
       package-name: cugraph
       # Always want to test against latest dask/distributed.
-      test-before-amd64: "cd ./datasets && bash ./get_test_data.sh && cd - && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.06"
+      test-before-amd64: "cd ./datasets && bash ./get_test_data.sh && cd - && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.08"
       # On arm also need to install cupy from the specific webpage.
-      test-before-arm64: "cd ./datasets && bash ./get_test_data.sh && cd - && pip install 'cupy-cuda11x<12.0.0' -f https://pip.cupy.dev/aarch64 && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.06"
-      test-unittest: "RAPIDS_DATASET_ROOT_DIR=/__w/cugraph/cugraph/datasets pytest -v -m sg ./python/cugraph/cugraph/tests"
+      test-before-arm64: "cd ./datasets && bash ./get_test_data.sh && cd - && pip install 'cupy-cuda11x<12.0.0' -f https://pip.cupy.dev/aarch64 && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.08"
+      test-unittest: "RAPIDS_DATASET_ROOT_DIR=/__w/cugraph/cugraph/datasets python -m pytest -v -m sg ./python/cugraph/cugraph/tests"
diff --git a/benchmarks/cugraph/standalone/bulk_sampling/README.md b/benchmarks/cugraph/standalone/bulk_sampling/README.md
@@ -0,0 +1,116 @@
+# cuGraph Bulk Sampling
+
+## Overview
+The `cugraph_bulk_sampling.py` script runs the bulk sampler for a variety of datasets, including
+both generated (rmat) datasets and disk (ogbn_papers100M, etc.) datasets.  It can also load
+replicas of these datasets to create a larger benchmark (e.g. ogbn_papers100M x2).
+
+## Arguments
+The script takes a variety of arguments to control sampling behavior.
+Required:
+    --output_root
+        The output root directory.  File/folder names are auto-generated.
+        For instance, if the output root directory is /home/samples,
+        the samples will be written to a new folder in /home/samples that
+        contains information about the sampling run as well as the time
+        of the run.
+
+    --dataset_root
+        The folder where datasets are stored.  Uses the format described
+        in the input format section.
+
+    --datasets
+        Comma-separated list of datasets; can specify ogb or rmat (e.g. ogb_papers100M[2],rmat_22_16).
+        For ogb datasets, a replication factor can be provided in brackets.
+        Will attempt to read from dataset_root/<dataset_name>.
+
+Optional:
+    --fanouts
+        Comma-separated list of fanout values (i.e. [10, 25]).
+        The default fanout is [10, 25].
+
+    --batch_sizes
+        Comma-separated list of batch sizes (i.e. 500, 1000).
+        Defaults to "512,1024"
+
+    --seeds_per_call_opts
+        Comma-separated list of seeds per call.  Controls the number of input seed vertices processed
+        in a single sampling call.
+        Defaults to 524288
+
+    --reverse_edges
+        Whether to reverse the edges of the input edgelist. Should be set to False for PyG and True for DGL.
+        Defaults to False (PyG).
+
+    --dask_worker_devices
+        Comma-separated list of the GPUs to assign to dask (i.e. "0,1,2").
+        Defaults to just the default GPU (0).
+        Changing this is strongly recommended in order to take advantage of all GPUs on the system.
+
+    --random_seed
+        Seed for random number generation.
+        Defaults to '62'
+
+    --persist
+        Whether to aggressively use persist() in dask to make the ETL steps (NOT PART OF SAMPLING) faster.
+        Will probably make this script finish sooner at the expense of memory usage, but won't affect
+        sampling time.
+        Changing this is not recommended unless you know what you are doing.
+        Defaults to False.
+
+## Input Format
+The script expects its input data in the following format:
+```
+<top level directory>
+|
+|------ meta.json
+|------ parquet
+|------ |---------- <node type 0 (i.e. paper)>
+|------ |---------- |---------------------------- [node_label.parquet]
+|------ |---------- <node type 1 (i.e. author)>
+|------ |---------- |---------------------------- [node_label.parquet]
+...
+|------ |---------- <edge type 0 (i.e. paper__cites__paper)>
+|------ |---------- |------------------------------------------ edge_index.parquet
+|------ |---------- <edge type 1 (i.e. author__writes__paper)>
+|------ |---------- |------------------------------------------ edge_index.parquet
+...
+
+```
+
+`node_label.parquet` only needs to be present for vertex types that have labeled
+nodes. It consists of two columns: "node", which contains node ids, and "label",
+which contains the labeled class of the node.
+
+`edge_index.parquet` is required for all edge types.  It has two columns, `src`
+and `dst`, representing the source and destination vertices of the edges in that
+edge type's COO edge index.
+
+`meta.json` is a json file containing metadata needed to properly process
+the parquet files.  It must have the following format:
+```
+{
+    "num_nodes": {
+        "<node type 0 (e.g. paper)>": <# nodes of node type 0>,
+        "<node type 1 (e.g. author)>": <# nodes of node type 1>,
+        ...
+    },
+    "num_edges": {
+        "<edge type 0 (e.g. paper__cites__paper)>": <# edges of edge type 0>,
+        "<edge type 1 (e.g. author__writes__paper)>": <# edges of edge type 1>,
+        ...
+    }
+}
+```
+
+## Output Meta
+The script, in addition to the samples, will also output a file named `output_meta.json`.
+This file contains various statistics about the sampling run, including the runtime,
+as well as information about the dataset and system that the samples were produced from.
+
+This metadata file can be used to gather the results from the sampling and training stages
+together.
+
+## Other Notes
+For rmat datasets, you will need to generate your own bogus features in the training stage.
+Because doing so is trivial, it is not done in this sampling script.