Commit 86ebcdc

Merge remote-tracking branch 'upstream/branch-23.08' into feat/cuda12_nvidia_build

rlratzel committed Jul 18, 2023
2 parents ad91c38 + 32e6e51
Showing 106 changed files with 7,276 additions and 1,162 deletions.
6 changes: 6 additions & 0 deletions .github/workflows/build.yaml
@@ -30,6 +30,7 @@ jobs:
secrets: inherit
uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@cuda-120
with:
matrix_filter: map(select(.CUDA_VER | startswith("11")))
build_type: ${{ inputs.build_type || 'branch' }}
branch: ${{ inputs.branch }}
date: ${{ inputs.date }}
@@ -39,6 +40,7 @@ jobs:
secrets: inherit
uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-120
with:
matrix_filter: map(select(.CUDA_VER | startswith("11")))
build_type: ${{ inputs.build_type || 'branch' }}
branch: ${{ inputs.branch }}
date: ${{ inputs.date }}
@@ -61,7 +63,11 @@ jobs:
build_type: branch
node_type: "gpu-latest-1"
arch: "amd64"
branch: ${{ inputs.branch }}
build_type: ${{ inputs.build_type || 'branch' }}
container_image: "rapidsai/ci:latest"
date: ${{ inputs.date }}
node_type: "gpu-v100-latest-1"
run_script: "ci/build_docs.sh"
wheel-build-pylibcugraph:
secrets: inherit
14 changes: 9 additions & 5 deletions .github/workflows/pr.yaml
@@ -35,25 +35,29 @@ jobs:
secrets: inherit
uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-build.yaml@cuda-120
with:
matrix_filter: map(select(.CUDA_VER | startswith("11")))
build_type: pull-request
node_type: cpu16
conda-cpp-tests:
needs: conda-cpp-build
secrets: inherit
uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-120
with:
matrix_filter: map(select(.CUDA_VER | startswith("11")))
build_type: pull-request
conda-python-build:
needs: conda-cpp-build
secrets: inherit
uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-build.yaml@cuda-120
with:
matrix_filter: map(select(.CUDA_VER | startswith("11")))
build_type: pull-request
conda-python-tests:
needs: conda-python-build
secrets: inherit
uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120
with:
matrix_filter: map(select(.CUDA_VER | startswith("11")))
build_type: pull-request
conda-notebook-tests:
needs: conda-python-build
@@ -63,7 +67,7 @@ jobs:
build_type: pull-request
node_type: "gpu-latest-1"
arch: "amd64"
container_image: "rapidsai/ci:latest"
container_image: "rapidsai/ci:cuda11.8.0-ubuntu22.04-py3.10"
run_script: "ci/test_notebooks.sh"
docs-build:
needs: conda-python-build
@@ -73,7 +77,7 @@ jobs:
build_type: pull-request
node_type: "gpu-latest-1"
arch: "amd64"
container_image: "rapidsai/ci:latest"
container_image: "rapidsai/ci:cuda11.8.0-ubuntu22.04-py3.10"
run_script: "ci/build_docs.sh"
wheel-build-pylibcugraph:
needs: checks
@@ -97,7 +101,7 @@ jobs:
package-name: pylibcugraph
# On arm also need to install cupy from the specific webpage.
test-before-arm64: "pip install 'cupy-cuda11x<12.0.0' -f https://pip.cupy.dev/aarch64"
test-unittest: "RAPIDS_DATASET_ROOT_DIR=./datasets pytest -v ./python/pylibcugraph/pylibcugraph/tests"
test-unittest: "RAPIDS_DATASET_ROOT_DIR=./datasets python -m pytest -v ./python/pylibcugraph/pylibcugraph/tests"
test-smoketest: "python ci/wheel_smoke_test_pylibcugraph.py"
wheel-build-cugraph:
needs: wheel-tests-pylibcugraph
@@ -124,6 +128,6 @@ jobs:
test-before-amd64: "cd ./datasets && bash ./get_test_data.sh && cd - && RAPIDS_PY_WHEEL_NAME=pylibcugraph_cu11 rapids-download-wheels-from-s3 ./local-pylibcugraph-dep && pip install --no-deps ./local-pylibcugraph-dep/*.whl && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.06"
# Skip dataset downloads on arm to save CI time -- arm only runs smoke tests.
# On arm also need to install cupy from the specific site.
test-before-arm64: "RAPIDS_PY_WHEEL_NAME=pylibcugraph_cu11 rapids-download-wheels-from-s3 ./local-pylibcugraph-dep && pip install --no-deps ./local-pylibcugraph-dep/*.whl && pip install 'cupy-cuda11x<12.0.0' -f https://pip.cupy.dev/aarch64 && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.06"
test-unittest: "RAPIDS_DATASET_ROOT_DIR=/__w/cugraph/cugraph/datasets pytest -v -m sg ./python/cugraph/cugraph/tests"
test-before-arm64: "RAPIDS_PY_WHEEL_NAME=pylibcugraph_${{ '${PIP_CU_VERSION}' }} rapids-download-wheels-from-s3 ./local-pylibcugraph-dep && pip install --no-deps ./local-pylibcugraph-dep/*.whl && pip install 'cupy-cuda11x<12.0.0' -f https://pip.cupy.dev/aarch64 && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.08"
test-unittest: "RAPIDS_DATASET_ROOT_DIR=/__w/cugraph/cugraph/datasets python -m pytest -v -m sg ./python/cugraph/cugraph/tests"
test-smoketest: "python ci/wheel_smoke_test_cugraph.py"
10 changes: 6 additions & 4 deletions .github/workflows/test.yaml
@@ -18,6 +18,7 @@ jobs:
secrets: inherit
uses: rapidsai/shared-action-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-120
with:
matrix_filter: map(select(.CUDA_VER | startswith("11")))
build_type: nightly
branch: ${{ inputs.branch }}
date: ${{ inputs.date }}
@@ -26,6 +27,7 @@ jobs:
secrets: inherit
uses: rapidsai/shared-action-workflows/.github/workflows/conda-python-tests.yaml@cuda-120
with:
matrix_filter: map(select(.CUDA_VER | startswith("11")))
build_type: nightly
branch: ${{ inputs.branch }}
date: ${{ inputs.date }}
@@ -41,7 +43,7 @@ jobs:
package-name: pylibcugraph
# On arm also need to install cupy from the specific webpage.
test-before-arm64: "pip install 'cupy-cuda11x<12.0.0' -f https://pip.cupy.dev/aarch64"
test-unittest: "RAPIDS_DATASET_ROOT_DIR=./datasets pytest -v ./python/pylibcugraph/pylibcugraph/tests"
test-unittest: "RAPIDS_DATASET_ROOT_DIR=./datasets python -m pytest -v ./python/pylibcugraph/pylibcugraph/tests"
wheel-tests-cugraph:
secrets: inherit
uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux-test.yml@cuda-120
@@ -52,7 +54,7 @@ jobs:
sha: ${{ inputs.sha }}
package-name: cugraph
# Always want to test against latest dask/distributed.
test-before-amd64: "cd ./datasets && bash ./get_test_data.sh && cd - && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.06"
test-before-amd64: "cd ./datasets && bash ./get_test_data.sh && cd - && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.08"
# On arm also need to install cupy from the specific webpage.
test-before-arm64: "cd ./datasets && bash ./get_test_data.sh && cd - && pip install 'cupy-cuda11x<12.0.0' -f https://pip.cupy.dev/aarch64 && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.06"
test-unittest: "RAPIDS_DATASET_ROOT_DIR=/__w/cugraph/cugraph/datasets pytest -v -m sg ./python/cugraph/cugraph/tests"
test-before-arm64: "cd ./datasets && bash ./get_test_data.sh && cd - && pip install 'cupy-cuda11x<12.0.0' -f https://pip.cupy.dev/aarch64 && pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.08"
test-unittest: "RAPIDS_DATASET_ROOT_DIR=/__w/cugraph/cugraph/datasets python -m pytest -v -m sg ./python/cugraph/cugraph/tests"
116 changes: 116 additions & 0 deletions benchmarks/cugraph/standalone/bulk_sampling/README.md
@@ -0,0 +1,116 @@
# cuGraph Bulk Sampling

## Overview
The `cugraph_bulk_sampling.py` script runs the bulk sampler on a variety of datasets, including
both generated (rmat) datasets and on-disk (ogbn_papers100M, etc.) datasets. It can also load
replicas of these datasets to create a larger benchmark (e.g. ogbn_papers100M x2).

## Arguments
The script takes a variety of arguments to control sampling behavior.
Required:
--output_root
The output root directory. File/folder names are auto-generated.
For instance, if the output root directory is /home/samples,
the samples will be written to a new folder in /home/samples that
contains information about the sampling run as well as the time
of the run.

--dataset_root
The folder where datasets are stored. Uses the format described
in the input format section.

--datasets
Comma-separated list of datasets; can specify ogb or rmat datasets (e.g. ogbn_papers100M[2],rmat_22_16).
For ogb datasets, a replication factor can be provided using brackets (see the parsing sketch
after this argument list). Will attempt to read from dataset_root/<dataset_name>.

Optional:
--fanouts
Comma-separated list of fanout values (e.g. [10, 25]).
The default fanout is [10, 25].

--batch_sizes
Comma-separated list of batch sizes (e.g. 500,1000).
Defaults to "512,1024".

--seeds_per_call_opts
Comma-separated list of seeds per call. Controls the number of input seed vertices processed
in a single sampling call.
Defaults to 524288.

--reverse_edges
Whether to reverse the edges of the input edgelist. Should be set to False for PyG and True for DGL.
Defaults to False (PyG).

--dask_worker_devices
Comma-separated list of the GPUs to assign to Dask (e.g. "0,1,2").
Defaults to just the default GPU (0).
Changing this is strongly recommended in order to take advantage of all GPUs on the system.

--random_seed
Seed for random number generation.
Defaults to 62.

--persist
Whether to aggressively use persist() in Dask to make the ETL steps (NOT PART OF SAMPLING) faster.
This will likely make the script finish sooner at the expense of higher memory usage, but won't
affect the measured sampling time.
Changing this is not recommended unless you know what you are doing.
Defaults to False.
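
As a concrete illustration of the `--datasets` spec format (not the benchmark script's actual
parser), here is a minimal Python sketch that splits a spec string into dataset names and
replication factors; `parse_dataset_specs` is a hypothetical helper:

```python
import re

def parse_dataset_specs(specs: str):
    """Split a --datasets string such as "ogbn_papers100M[2],rmat_22_16"
    into (name, replication_factor) pairs. Illustrative only; the
    benchmark script's own parsing may differ."""
    parsed = []
    for spec in specs.split(","):
        m = re.fullmatch(r"(?P<name>[^\[\]]+)(?:\[(?P<reps>\d+)\])?", spec.strip())
        if m is None:
            raise ValueError(f"Malformed dataset spec: {spec!r}")
        # No bracket suffix means a single (unreplicated) copy of the dataset.
        parsed.append((m.group("name"), int(m.group("reps") or 1)))
    return parsed

print(parse_dataset_specs("ogbn_papers100M[2],rmat_22_16"))
# [('ogbn_papers100M', 2), ('rmat_22_16', 1)]
```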

## Input Format
The script expects its input data in the following format:
```
<top level directory>
|
|------ meta.json
|------ parquet
|------ |---------- <node type 0 (e.g. paper)>
|------ |---------- |---------------------------- [node_label.parquet]
|------ |---------- <node type 1 (e.g. author)>
|------ |---------- |---------------------------- [node_label.parquet]
...
|------ |---------- <edge type 0 (e.g. paper__cites__paper)>
|------ |---------- |------------------------------------------ edge_index.parquet
|------ |---------- <edge type 1 (e.g. author__writes__paper)>
|------ |---------- |------------------------------------------ edge_index.parquet
...
```

`node_label.parquet` only needs to be present for vertex types that have labeled
nodes. It consists of two columns: "node", which contains node ids, and "label",
which contains the class label of each node.

`edge_index.parquet` is required for all edge types. It has two columns, `src`
and `dst`, representing the source and destination vertices of the edges in that
edge type's COO edge index.
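
To make the layout concrete, here is a minimal sketch that writes a tiny dataset in this
format. It assumes pandas with pyarrow installed, and the node/edge types and counts are
made up for illustration:

```python
import pathlib

import pandas as pd  # .to_parquet() below assumes pyarrow is installed

root = pathlib.Path("my_dataset")

# One labeled node type ("paper") and one edge type ("paper__cites__paper"),
# chosen only for illustration.
paper_dir = root / "parquet" / "paper"
edge_dir = root / "parquet" / "paper__cites__paper"
paper_dir.mkdir(parents=True, exist_ok=True)
edge_dir.mkdir(parents=True, exist_ok=True)

# node_label.parquet: "node" ids and their "label" classes.
pd.DataFrame({"node": [0, 1, 2, 3], "label": [0, 1, 0, 1]}).to_parquet(
    paper_dir / "node_label.parquet"
)

# edge_index.parquet: COO edge list with "src" and "dst" columns.
pd.DataFrame({"src": [0, 1, 2], "dst": [1, 2, 3]}).to_parquet(
    edge_dir / "edge_index.parquet"
)
```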

`meta.json` is a JSON file containing the metadata needed to properly process
the parquet files. It must have the following format:
```
{
    "num_nodes": {
        "<node type 0 (e.g. paper)>": <# nodes of node type 0>,
        "<node type 1 (e.g. author)>": <# nodes of node type 1>,
        ...
    },
    "num_edges": {
        "<edge type 0 (e.g. paper__cites__paper)>": <# edges of edge type 0>,
        "<edge type 1 (e.g. author__writes__paper)>": <# edges of edge type 1>,
        ...
    }
}
```
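
A matching `meta.json` for the toy two-file dataset sketched above could be written as
follows; the counts must agree with the contents of the parquet files:

```python
import json
import pathlib

# Counts for the illustrative dataset: 4 "paper" nodes and 3
# "paper__cites__paper" edges.
meta = {
    "num_nodes": {"paper": 4},
    "num_edges": {"paper__cites__paper": 3},
}

# meta.json lives at the top level, next to the parquet/ directory.
with open(pathlib.Path("my_dataset") / "meta.json", "w") as f:
    json.dump(meta, f, indent=4)
```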

## Output Meta
In addition to the samples, the script also writes a file named `output_meta.json`.
This file contains various statistics about the sampling run, including the runtime,
as well as information about the dataset and the system that the samples were produced on.

This metadata file can be used to gather the results from the sampling and training stages
together.
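
Since this README does not enumerate the keys of `output_meta.json`, a downstream consumer
can simply load and inspect it; a minimal sketch, with a hypothetical run-folder path:

```python
import json
import pathlib

# The actual folder name is auto-generated under --output_root; this path
# is a placeholder for illustration.
run_dir = pathlib.Path("/home/samples") / "example_run"

with (run_dir / "output_meta.json").open() as f:
    output_meta = json.load(f)

# Pretty-print whatever statistics the sampling run recorded.
print(json.dumps(output_meta, indent=2))
```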

## Other Notes
For rmat datasets, you will need to generate your own bogus features in the training stage.
Since that is trivial, it is not done in this sampling script.