Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

test future tensorrt version in windows wf #3290

Merged
merged 35 commits into from
Nov 14, 2024
Merged
Show file tree
Hide file tree
Changes from 28 commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
a0b1989
add test tensorrt workflow
lanluo-nvidia Oct 28, 2024
09689ee
test
lanluo-nvidia Oct 28, 2024
3206da7
test
lanluo-nvidia Oct 28, 2024
0a64986
test
lanluo-nvidia Oct 28, 2024
a02d944
test
lanluo-nvidia Oct 28, 2024
9c0ca36
test
lanluo-nvidia Oct 28, 2024
67cbaf3
test
lanluo-nvidia Nov 1, 2024
7765146
add some test
lanluo-nvidia Nov 3, 2024
8a199aa
test
lanluo-nvidia Nov 4, 2024
546a574
test
lanluo-nvidia Nov 4, 2024
41aec8b
test
lanluo-nvidia Nov 4, 2024
a554285
test
lanluo-nvidia Nov 4, 2024
7102fa5
test
lanluo-nvidia Nov 5, 2024
0fd94e6
test
lanluo-nvidia Nov 5, 2024
6cc2faa
resolve comments
lanluo-nvidia Nov 5, 2024
e9af038
add more tests
lanluo-nvidia Nov 7, 2024
646a515
Merge branch 'main' into lluo/tensorrt_test_workflow
lanluo-nvidia Nov 8, 2024
1bf5673
merge main into the branch
lanluo-nvidia Nov 8, 2024
1f92a78
add comments
lanluo-nvidia Nov 8, 2024
f047aa1
add future tensorrt test workflow on windows
lanluo-nvidia Nov 12, 2024
80a4a7f
test
lanluo-nvidia Nov 12, 2024
78878f6
test
lanluo-nvidia Nov 12, 2024
a624610
test
lanluo-nvidia Nov 12, 2024
5b7352c
test with other windows nodes
lanluo-nvidia Nov 12, 2024
d984c90
test
lanluo-nvidia Nov 12, 2024
1928e8a
test
lanluo-nvidia Nov 12, 2024
8fc3482
change decomposition default table due to upstream torch change
lanluo-nvidia Nov 12, 2024
4ed8e28
test
lanluo-nvidia Nov 12, 2024
7e3c1fa
test
lanluo-nvidia Nov 12, 2024
229bdf1
Merge branch 'lluo/decomposition_upstream_change' into lluo/tensorrt_…
lanluo-nvidia Nov 12, 2024
bd69526
Merge branch 'lluo/tensorrt_test_workflow' into lluo/tensorrt_test_wi…
lanluo-nvidia Nov 12, 2024
dbd740b
test
lanluo-nvidia Nov 13, 2024
69649b3
Merge branch 'main' into lluo/tensorrt_test_windows_wf
lanluo-nvidia Nov 14, 2024
1fdb135
test
lanluo-nvidia Nov 14, 2024
bb9b2b6
test
lanluo-nvidia Nov 14, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 123 additions & 0 deletions .github/scripts/generate-tensorrt-test-matrix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#!/usr/bin/env python3

import argparse
import copy
import json
import sys

# CUDA builds to test against the future TensorRT versions, keyed by channel.
# Edit the lists below to change which CUDA versions are covered.
#   nightly: the workflow was triggered from the main branch or a personal branch
#   test:    the workflow was triggered from a release branch (release/2.5, etc.)
CUDA_VERSIONS_DICT = {
    "nightly": ["cu124"],
    "test": ["cu121", "cu124"],
    "release": ["cu121", "cu124"],
}

# Python versions to test against the future TensorRT versions, keyed by channel.
# Edit the lists below to change which interpreters are covered.
#   nightly: the workflow was triggered from the main branch or a personal branch
#   test:    the workflow was triggered from a release branch (release/2.5, etc.)
PYTHON_VERSIONS_DICT = {
    "nightly": ["3.9"],
    "test": ["3.9", "3.10", "3.11", "3.12"],
    "release": ["3.9", "3.10", "3.11", "3.12"],
}

# Future TensorRT versions to test, keyed first by platform and then by version.
# Each entry records where to download the archive ("urls"), the top-level
# directory inside it ("strip_prefix") and the archive checksum ("sha256").
# Add or remove versions here to change what the workflow tests.
TENSORRT_VERSIONS_DICT = {
    "windows": {
        "10.4.0": {
            "urls": "https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.4.0/zip/TensorRT-10.4.0.26.Windows.win10.cuda-12.6.zip",
            "strip_prefix": "TensorRT-10.4.0.26",
            "sha256": "3a7de83778b9e9f812fd8901e07e0d7d6fc54ce633fcff2e340f994df2c6356c",
        },
        "10.5.0": {
            "urls": "https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.5.0/zip/TensorRT-10.5.0.18.Windows.win10.cuda-12.6.zip",
            "strip_prefix": "TensorRT-10.5.0.18",
            "sha256": "e6436f4164db4e44d727354dccf7d93755efb70d6fbfd6fa95bdfeb2e7331b24",
        },
        "10.6.0": {
            "urls": "https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.6.0/zip/TensorRT-10.6.0.26.Windows.win10.cuda-12.6.zip",
            "strip_prefix": "TensorRT-10.6.0.26",
            "sha256": "6c6d92c108a1b3368423e8f69f08d31269830f1e4c9da43b37ba34a176797254",
        },
    },
    "linux": {
        "10.4.0": {
            "urls": "https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.4.0/tars/TensorRT-10.4.0.26.Linux.x86_64-gnu.cuda-12.6.tar.gz",
            "strip_prefix": "TensorRT-10.4.0.26",
            "sha256": "cb0273ecb3ba4db8993a408eedd354712301a6c7f20704c52cdf9f78aa97bbdb",
        },
        "10.5.0": {
            "urls": "https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.5.0/tars/TensorRT-10.5.0.18.Linux.x86_64-gnu.cuda-12.6.tar.gz",
            "strip_prefix": "TensorRT-10.5.0.18",
            "sha256": "f404d379d639552a3e026cd5267213bd6df18a4eb899d6e47815bbdb34854958",
        },
        "10.6.0": {
            "urls": "https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.6.0/tars/TensorRT-10.6.0.26.Linux.x86_64-gnu.cuda-12.6.tar.gz",
            "strip_prefix": "TensorRT-10.6.0.26",
            "sha256": "33d3c2f3f4c84dc7991a4337a6fde9ed33f5c8e5c4f03ac2eb6b994a382b03a0",
        },
    },
}


def main(args: list[str]) -> None:
    """Expand a build matrix with the future TensorRT versions and print it.

    Parses ``--matrix`` (a JSON string holding an ``include`` list), keeps the
    entries whose ``desired_cuda`` and ``python_version`` are enabled for the
    matrix's channel, and emits one copy of every kept entry per TensorRT
    version configured for the runner's platform. Each copy gets a
    ``tensorrt`` object (``urls``, ``strip_prefix``, ``sha256``, ``version``).
    The resulting ``{"include": [...]}`` object is printed to stdout as JSON.

    The channel and the platform are read from the first ``include`` entry
    only and applied to the whole matrix.

    Args:
        args: Command-line arguments, excluding the program name.

    Raises:
        ValueError: If ``--matrix`` is empty, has no ``include`` entries,
            lacks the ``channel`` or ``validation_runner`` field, or names an
            unsupported channel or runner platform.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--matrix",
        help="matrix",
        type=str,
        default="",
    )

    options = parser.parse_args(args)
    if options.matrix == "":
        raise ValueError("--matrix is empty, please provide the matrix json str")

    matrix_dict = json.loads(options.matrix)
    includes = matrix_dict.get("include") if isinstance(matrix_dict, dict) else None
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not includes:
        raise ValueError(
            f"include field is missing or empty in the matrix: {options.matrix}"
        )

    first = includes[0]
    if "channel" not in first:
        raise ValueError(f"channel field is missing from the matrix: {options.matrix}")
    channel = first["channel"]
    if channel not in ("nightly", "test", "release"):
        raise ValueError(
            f"channel field: {channel} is not supported, currently supported value: nightly, test, release"
        )

    if "validation_runner" not in first:
        raise ValueError(
            f"validation_runner field is missing from the matrix: {options.matrix}"
        )
    runner = first["validation_runner"]
    if "windows" in runner:
        arch = "windows"
    elif "linux" in runner:
        arch = "linux"
    else:
        # The entry is a dict, so the runner must be read by key; attribute
        # access here would raise AttributeError and hide this message.
        raise ValueError(
            f"{runner} is not the supported arch, currently only support windows and linux"
        )

    cuda_versions = CUDA_VERSIONS_DICT[channel]
    python_versions = PYTHON_VERSIONS_DICT[channel]
    tensorrt_versions = TENSORRT_VERSIONS_DICT[arch]

    filtered_includes = []
    for item in includes:
        if (
            item["desired_cuda"] not in cuda_versions
            or item["python_version"] not in python_versions
        ):
            continue
        for tensorrt_version, tensorrt_json in tensorrt_versions.items():
            new_item = copy.deepcopy(item)
            # Build a fresh dict per item so the module-level version table is
            # never mutated and emitted items do not share one object.
            new_item["tensorrt"] = {**tensorrt_json, "version": tensorrt_version}
            filtered_includes.append(new_item)

    print(json.dumps({"include": filtered_includes}))


if __name__ == "__main__":
    # Hand everything after the program name to the matrix generator.
    cli_args = sys.argv[1:]
    main(cli_args)
222 changes: 222 additions & 0 deletions .github/workflows/build-tensorrt-linux.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
name: Build Torch-TensorRT wheel on Linux with Future TensorRT Versions

# Reusable workflow (workflow_call only). The caller passes the build matrix as
# a JSON string in `build-matrix`; the build job reads python_version,
# desired_cuda, validation_runner, container_image, gpu_arch_type and the
# `tensorrt` object (version / urls / strip_prefix / sha256) from each entry.
on:
  workflow_call:
    inputs:
      repository:
        description: 'Repository to checkout, defaults to ""'
        default: ""
        type: string
      ref:
        description: 'Reference to checkout, defaults to "nightly"'
        default: "nightly"
        type: string
      test-infra-repository:
        description: "Test infra repository to use"
        default: "pytorch/test-infra"
        type: string
      test-infra-ref:
        description: "Test infra reference to use"
        default: ""
        type: string
      build-matrix:
        description: "Build matrix to utilize"
        default: ""
        type: string
      pre-script:
        description: "Pre script to run prior to build"
        default: ""
        type: string
      post-script:
        # Runs in the "Run Post-Script" step, which comes after the wheel build.
        description: "Post script to run after build"
        default: ""
        type: string
      smoke-test-script:
        description: "Script for Smoke Test for a specific domain"
        default: ""
        type: string
      env-var-script:
        description: "Script that sets Domain-Specific Environment Variables"
        default: ""
        type: string
      package-name:
        description: "Name of the actual python package that is imported"
        default: ""
        type: string
      trigger-event:
        description: "Trigger Event in caller that determines whether or not to upload"
        default: ""
        type: string
      cache-path:
        description: "The path(s) on the runner to cache or restore. The path is relative to repository."
        default: ""
        type: string
      cache-key:
        description: "The key created when saving a cache and the key used to search for a cache."
        default: ""
        type: string
      architecture:
        description: Architecture to build for x86_64 for default Linux, or aarch64 for Linux aarch64 builds
        required: false
        type: string
        default: x86_64
      submodules:
        description: Works as stated in actions/checkout, but the default value is recursive
        required: false
        type: string
        default: recursive
      setup-miniconda:
        description: Set to true if setup-miniconda is needed
        required: false
        type: boolean
        default: true

permissions:
  id-token: write
  contents: read

# Builds the Torch-TensorRT wheel once per build-matrix entry. Each entry
# supplies the Python/CUDA versions, the runner and container to use, and the
# TensorRT archive to build against (matrix.tensorrt.*).
jobs:
  build:
    strategy:
      # Let the remaining matrix entries finish even if one of them fails.
      fail-fast: false
      matrix: ${{ fromJSON(inputs.build-matrix) }}
    env:
      PYTHON_VERSION: ${{ matrix.python_version }}
      PACKAGE_TYPE: wheel
      REPOSITORY: ${{ inputs.repository }}
      REF: ${{ inputs.ref }}
      CU_VERSION: ${{ matrix.desired_cuda }}
      UPLOAD_TO_BASE_BUCKET: ${{ matrix.upload_to_base_bucket }}
      ARCH: ${{ inputs.architecture }}
      # TensorRT archive metadata from the matrix entry. Nothing in this file
      # reads these variables directly; presumably the build scripts do
      # (TODO confirm).
      TENSORRT_STRIP_PREFIX: ${{ matrix.tensorrt.strip_prefix }}
      TENSORRT_VERSION: ${{ matrix.tensorrt.version }}
      TENSORRT_URLS: ${{ matrix.tensorrt.urls }}
      TENSORRT_SHA256: ${{ matrix.tensorrt.sha256 }}
      # Artifact name combines TensorRT version, Python version, CUDA version and architecture.
      UPLOAD_ARTIFACT_NAME: pytorch_tensorrt_${{ matrix.tensorrt.version }}_${{ matrix.python_version }}_${{ matrix.desired_cuda }}_${{ inputs.architecture }}
    name: build_tensorrt${{ matrix.tensorrt.version }}_py${{matrix.python_version}}_${{matrix.desired_cuda}}
    runs-on: ${{ matrix.validation_runner }}
    container:
      image: ${{ matrix.container_image }}
      # GPUs are passed to the container only for CUDA matrix entries.
      options: ${{ matrix.gpu_arch_type == 'cuda' && '--gpus all' || ' ' }}
    # If a build is taking longer than 120 minutes on these runners we need
    # to have a conversation
    timeout-minutes: 120

    steps:
      # Recreates an empty workspace directory before anything is checked out.
      # NOTE(review): in the aarch64 branch the `*` of "${RUNNER_TEMP}/*" is
      # inside double quotes, so the shell does not expand it and the `rm`
      # targets a literal path named `*` (effectively a no-op). Confirm whether
      # cleaning RUNNER_TEMP is really wanted before changing it; the runner
      # may keep files it still needs in that directory.
      - name: Clean workspace
        shell: bash -l {0}
        run: |
          set -x
          echo "::group::Cleanup debug output"
          rm -rf "${GITHUB_WORKSPACE}"
          mkdir -p "${GITHUB_WORKSPACE}"
          if [[ "${{ inputs.architecture }}" = "aarch64" ]]; then
            rm -rf "${RUNNER_TEMP}/*"
          fi
          echo "::endgroup::"
      - uses: actions/checkout@v3
        with:
          # Support the use case where we need to checkout someone's fork
          repository: ${{ inputs.test-infra-repository }}
          ref: ${{ inputs.test-infra-ref }}
          path: test-infra
      - uses: actions/checkout@v3
        if: ${{ env.ARCH == 'aarch64' }}
        with:
          # Repository and ref are fixed here; this checkout provides the
          # aarch64 setup script used by the next step.
          repository: "pytorch/builder"
          ref: "main"
          path: builder
      - name: Set linux aarch64 CI
        if: ${{ inputs.architecture == 'aarch64' }}
        shell: bash -l {0}
        env:
          DESIRED_PYTHON: ${{ matrix.python_version }}
        # errexit is switched off while the setup script runs, so a failing
        # setup script does not stop this step.
        run: |
          set +e
          # TODO: This is temporary aarch64 setup script, this should be integrated into aarch64 docker.
          ${GITHUB_WORKSPACE}/builder/aarch64_linux/aarch64_ci_setup.sh
          echo "/opt/conda/bin" >> $GITHUB_PATH
          set -e
      - uses: ./test-infra/.github/actions/set-channel
      - name: Set PYTORCH_VERSION
        if: ${{ env.CHANNEL == 'test' }}
        run: |
          # When building RC, set the version to be the current candidate version,
          # otherwise, leave it alone so nightly will pick up the latest
          echo "PYTORCH_VERSION=${{ matrix.stable_version }}" >> "${GITHUB_ENV}"
      - uses: ./test-infra/.github/actions/setup-binary-builds
        env:
          PLATFORM: ${{ inputs.architecture == 'aarch64' && 'linux-aarch64' || ''}}
        with:
          repository: ${{ inputs.repository }}
          ref: ${{ inputs.ref }}
          submodules: ${{ inputs.submodules }}
          setup-miniconda: ${{ inputs.setup-miniconda }}
          python-version: ${{ env.PYTHON_VERSION }}
          cuda-version: ${{ env.CU_VERSION }}
          arch: ${{ env.ARCH }}
      # Appends the caller's env-var script to the build env file that the
      # later steps source.
      - name: Combine Env Var and Build Env Files
        if: ${{ inputs.env-var-script != '' }}
        working-directory: ${{ inputs.repository }}
        shell: bash -l {0}
        run: |
          cat "${{ inputs.env-var-script }}" >> "${BUILD_ENV_FILE}"
      - name: Install torch dependency
        shell: bash -l {0}
        run: |
          set -x
          # shellcheck disable=SC1090
          source "${BUILD_ENV_FILE}"
          # shellcheck disable=SC2086
          ${CONDA_RUN} ${PIP_INSTALL_TORCH}
      - name: Run Pre-Script with Caching
        if: ${{ inputs.pre-script != '' }}
        uses: ./test-infra/.github/actions/run-script-with-cache
        with:
          cache-path: ${{ inputs.cache-path }}
          cache-key: ${{ inputs.cache-key }}
          repository: ${{ inputs.repository }}
          script: ${{ inputs.pre-script }}
      - name: Build clean
        working-directory: ${{ inputs.repository }}
        shell: bash -l {0}
        run: |
          set -x
          source "${BUILD_ENV_FILE}"
          ${CONDA_RUN} python setup.py clean
      - name: Build the wheel (bdist_wheel)
        working-directory: ${{ inputs.repository }}
        shell: bash -l {0}
        run: |
          set -x
          source "${BUILD_ENV_FILE}"
          ${CONDA_RUN} python setup.py bdist_wheel

      - name: Run Post-Script
        if: ${{ inputs.post-script != '' }}
        uses: ./test-infra/.github/actions/run-script-with-cache
        with:
          repository: ${{ inputs.repository }}
          script: ${{ inputs.post-script }}
      # Currently this step only sources the build environment; no smoke test
      # is executed yet (see the TODO in the script), so PACKAGE_NAME and
      # SMOKE_TEST_SCRIPT are set but unused.
      - name: Smoke Test
        shell: bash -l {0}
        env:
          PACKAGE_NAME: ${{ inputs.package-name }}
          SMOKE_TEST_SCRIPT: ${{ inputs.smoke-test-script }}
        run: |
          set -x
          source "${BUILD_ENV_FILE}"
          # TODO: add smoke test for the auditwheel tarball built

      # NB: Only upload to GitHub after passing smoke tests
      # continue-on-error: a failed upload does not fail the build job.
      - name: Upload wheel to GitHub
        continue-on-error: true
        uses: actions/upload-artifact@v3
        with:
          name: ${{ env.UPLOAD_ARTIFACT_NAME }}
          path: ${{ inputs.repository }}/dist

# Runs are grouped by workflow, PR number (or ref name), repository and whether
# the run was manually dispatched; a newer run in the same group cancels the
# one in progress. The group no longer references `inputs.job-name`: this
# workflow declares no such input, so that segment always expanded to nothing.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true
Loading
Loading