From 4fd0a36638615ba9f220fe934eaa4bd2607af266 Mon Sep 17 00:00:00 2001 From: "Christian M. Todie" Date: Thu, 20 Jul 2023 11:43:00 -0400 Subject: [PATCH 01/63] chore(git): Add .gitignore --- .gitignore | 164 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9d90afc --- /dev/null +++ b/.gitignore @@ -0,0 +1,164 @@ +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +# Swap +[._]*.s[a-v][a-z] +!*.svg # comment out if you don't need vector files +[._]*.sw[a-p] +[._]s[a-rt-v][a-z] +[._]ss[a-gi-z] +[._]sw[a-p] + +# Session +Session.vim +Sessionx.vim + +# Temporary +.netrwhist +*~ +# Auto-generated tag files +tags +# Persistent undo +[._]*.un~ + +# -*- mode: gitignore; -*- +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +*.elc +auto-save-list +tramp +.\#* + +# Org-mode +.org-id-locations +*_archive + +# flymake-mode +*_flymake.* + +# eshell files +/eshell/history +/eshell/lastdir + +# elpa packages +/elpa/ + +# reftex files +*.rel + +# AUCTeX auto folder +/auto/ + +# cask packages +.cask/ +dist/ + +# Flycheck +flycheck_*.el + +# server auth directory +/server/ + +# projectiles files +.projectile + +# directory configuration 
+.dir-locals.el + +# network security +/network-security.data + +# -*- mode: gitignore; -*- +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +*.elc +auto-save-list +tramp +.\#* + +# Org-mode +.org-id-locations +*_archive + +# flymake-mode +*_flymake.* + +# eshell files +/eshell/history +/eshell/lastdir + +# elpa packages +/elpa/ + +# reftex files +*.rel + +# AUCTeX auto folder +/auto/ + +# cask packages +.cask/ +dist/ + +# Flycheck +flycheck_*.el + +# server auth directory +/server/ + +# projectiles files +.projectile + +# directory configuration +.dir-locals.el + +# network security +/network-security.data + +# local environment files +.env +.env* +.environment +.environment* From 1eb71f6af6fb8b590dfc1ce1a8401d1fd2b6140f Mon Sep 17 00:00:00 2001 From: "Christian M. Todie" Date: Thu, 20 Jul 2023 11:47:24 -0400 Subject: [PATCH 02/63] ci(build): Add DockerHub login and secret inheritance --- .github/workflows/bloom.yml | 5 +++-- .github/workflows/build.yml | 18 ++++++++++++------ .github/workflows/cuda-ssh.yml | 1 + .github/workflows/gpt-neox-determined.yml | 5 +++-- .github/workflows/gpt-neox-mpi.yml | 1 + .github/workflows/sd-finetuner.yml | 5 +++-- .github/workflows/sd-inference.yml | 5 +++-- .github/workflows/sd-serializer.yml | 5 +++-- .github/workflows/slurm.yml | 3 ++- .github/workflows/tensorizer.yml | 5 +++-- .github/workflows/torch-base.yml | 1 + .github/workflows/torch-extras.yml | 1 + .github/workflows/torch-nccl.yml | 1 + .github/workflows/torch.yml | 2 ++ 14 files changed, 39 insertions(+), 19 deletions(-) diff --git a/.github/workflows/bloom.yml b/.github/workflows/bloom.yml index 1aeaa41..8e169ff 100644 --- a/.github/workflows/bloom.yml +++ b/.github/workflows/bloom.yml @@ -10,7 +10,8 @@ on: jobs: build: uses: ./.github/workflows/build.yml - with: + secrets: inherit + with: image-name: bloom folder: bloom - build-args: "" \ No newline at end of file + build-args: "" diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 
7eb4616..1c376ae 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -38,12 +38,17 @@ jobs: - uses: actions/checkout@v3 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2.2.1 - - name: Login to container registry - uses: docker/login-action@v2.1.0 + - name: Login to GitHub container registry + uses: docker/login-action@v2.2.0 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} + - name: Login to DockerHub container registry + uses: docker/login-action@v2.2.0 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} - name: Get base registry run: | echo "REGISTRY=ghcr.io/${GITHUB_REPOSITORY,,}" >> $GITHUB_ENV @@ -67,7 +72,8 @@ jobs: uses: docker/build-push-action@v3.2.0 with: context: ${{ inputs.folder }} - build-args: ${{ inputs.build-args }} + build-args: |- + ${{ inputs.build-args }} push: true tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} @@ -80,10 +86,10 @@ jobs: - name: Comment if: steps.PR.outputs.number uses: peter-evans/create-or-update-comment@v2.1.0 - with: + with: issue-number: ${{ steps.PR.outputs.number }} body: > - @${{ github.triggering_actor }} Build complete, ${{ steps.docker-build.outcome }}: + @${{ github.triggering_actor }} Build complete, ${{ steps.docker-build.outcome }}: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} - Image: `${{ fromJSON(steps.docker-build.outputs.metadata)['image.name'] }}` \ No newline at end of file + Image: `${{ fromJSON(steps.docker-build.outputs.metadata)['image.name'] }}` diff --git a/.github/workflows/cuda-ssh.yml b/.github/workflows/cuda-ssh.yml index 892e6c3..6ba26ea 100644 --- a/.github/workflows/cuda-ssh.yml +++ b/.github/workflows/cuda-ssh.yml @@ -16,6 +16,7 @@ jobs: - ceeb8c2-nccl-cuda11.8.0-nccl2.16.2-1-torch2.0.1-vision0.15.2-audio2.0.2 uses: ./.github/workflows/build.yml + secrets: inherit with: image-name: cuda-ssh 
folder: cuda-ssh diff --git a/.github/workflows/gpt-neox-determined.yml b/.github/workflows/gpt-neox-determined.yml index 2ae03f7..4e0a6f4 100644 --- a/.github/workflows/gpt-neox-determined.yml +++ b/.github/workflows/gpt-neox-determined.yml @@ -10,7 +10,8 @@ on: jobs: build: uses: ./.github/workflows/build.yml - with: + secrets: inherit + with: image-name: gpt-neox-determined folder: gpt-neox-determined - build-args: "" \ No newline at end of file + build-args: "" diff --git a/.github/workflows/gpt-neox-mpi.yml b/.github/workflows/gpt-neox-mpi.yml index f2c6c0c..aec2a5f 100644 --- a/.github/workflows/gpt-neox-mpi.yml +++ b/.github/workflows/gpt-neox-mpi.yml @@ -10,6 +10,7 @@ on: jobs: build: uses: ./.github/workflows/build.yml + secrets: inherit with: image-name: gpt-neox-mpi folder: gpt-neox-mpi diff --git a/.github/workflows/sd-finetuner.yml b/.github/workflows/sd-finetuner.yml index 165a2a5..6e6203c 100644 --- a/.github/workflows/sd-finetuner.yml +++ b/.github/workflows/sd-finetuner.yml @@ -15,7 +15,8 @@ on: jobs: build: uses: ./.github/workflows/build.yml - with: + secrets: inherit + with: image-name: sd-finetuner folder: sd-finetuner - build-args: "--build-arg COMMIT=${{ github.event.inputs.commit }}" \ No newline at end of file + build-args: "--build-arg COMMIT=${{ github.event.inputs.commit }}" diff --git a/.github/workflows/sd-inference.yml b/.github/workflows/sd-inference.yml index 06d9052..22167bb 100644 --- a/.github/workflows/sd-inference.yml +++ b/.github/workflows/sd-inference.yml @@ -15,7 +15,8 @@ on: jobs: build: uses: ./.github/workflows/build.yml - with: + secrets: inherit + with: image-name: sd-inference folder: sd-inference - build-args: "--build-arg COMMIT=${{ github.event.inputs.commit }}" \ No newline at end of file + build-args: "--build-arg COMMIT=${{ github.event.inputs.commit }}" diff --git a/.github/workflows/sd-serializer.yml b/.github/workflows/sd-serializer.yml index e964807..fc11bfa 100644 --- a/.github/workflows/sd-serializer.yml 
+++ b/.github/workflows/sd-serializer.yml @@ -15,7 +15,8 @@ on: jobs: build: uses: ./.github/workflows/build.yml - with: + secrets: inherit + with: image-name: sd-serializer folder: sd-serializer - build-args: "--build-arg COMMIT=${{ github.event.inputs.commit }}" \ No newline at end of file + build-args: "--build-arg COMMIT=${{ github.event.inputs.commit }}" diff --git a/.github/workflows/slurm.yml b/.github/workflows/slurm.yml index fef7d0a..1c63a49 100644 --- a/.github/workflows/slurm.yml +++ b/.github/workflows/slurm.yml @@ -21,7 +21,8 @@ jobs: BASE_IMAGE=registry.gitlab.com/coreweave/sunk/slurmd-cw-cu117-extras:bc5a133d uses: ./.github/workflows/build.yml - with: + secrets: inherit + with: image-name: ${{ matrix.image.name }} folder: ${{ matrix.image.folder }} build-args: ${{ matrix.image.build-args }} diff --git a/.github/workflows/tensorizer.yml b/.github/workflows/tensorizer.yml index 5778b9b..a9a870f 100644 --- a/.github/workflows/tensorizer.yml +++ b/.github/workflows/tensorizer.yml @@ -15,7 +15,8 @@ on: jobs: build: uses: ./.github/workflows/build.yml - with: + secrets: inherit + with: image-name: tensorizer folder: tensorizer - build-args: "--build-arg COMMIT=${{ github.event.inputs.commit }}" \ No newline at end of file + build-args: "--build-arg COMMIT=${{ github.event.inputs.commit }}" diff --git a/.github/workflows/torch-base.yml b/.github/workflows/torch-base.yml index 64b3af9..1aedab9 100644 --- a/.github/workflows/torch-base.yml +++ b/.github/workflows/torch-base.yml @@ -19,6 +19,7 @@ jobs: audio: 2.0.2 uses: ./.github/workflows/torch.yml + secrets: inherit with: tag: ${{ format('base-cuda{0}-torch{1}-vision{2}-audio{3}', matrix.cuda, matrix.torch, matrix.vision, matrix.audio) }} builder-base-image: nvidia/cuda:${{ matrix.cuda }}-devel-ubuntu20.04 diff --git a/.github/workflows/torch-extras.yml b/.github/workflows/torch-extras.yml index 9892af2..a9460b5 100644 --- a/.github/workflows/torch-extras.yml +++ b/.github/workflows/torch-extras.yml @@ 
-21,6 +21,7 @@ on: jobs: build: uses: ./.github/workflows/build.yml + secrets: inherit with: image-name: torch-extras folder: torch-extras diff --git a/.github/workflows/torch-nccl.yml b/.github/workflows/torch-nccl.yml index fb84489..a0660b6 100644 --- a/.github/workflows/torch-nccl.yml +++ b/.github/workflows/torch-nccl.yml @@ -28,6 +28,7 @@ jobs: audio: 2.0.2 uses: ./.github/workflows/torch.yml + secrets: inherit with: tag: ${{ format('nccl-cuda{0}-nccl{1}-torch{2}-vision{3}-audio{4}', matrix.image.cuda, matrix.image.nccl, matrix.torch, matrix.vision, matrix.audio) }} builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-cudnn8-devel-ubuntu20.04-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }} diff --git a/.github/workflows/torch.yml b/.github/workflows/torch.yml index f2b553a..681cf06 100644 --- a/.github/workflows/torch.yml +++ b/.github/workflows/torch.yml @@ -60,6 +60,7 @@ on: jobs: build: uses: ./.github/workflows/build.yml + secrets: inherit with: image-name: torch folder: torch @@ -76,6 +77,7 @@ jobs: if: inputs.build-extras needs: build uses: ./.github/workflows/torch-extras.yml + secrets: inherit with: tag: ${{ inputs.tag }} base-image: ${{ needs.build.outputs.tags }} From 639915eb7089e9fa68b2ae7b96c0c48a36ecc889 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 31 Jul 2023 21:00:46 -0500 Subject: [PATCH 03/63] fix(torch-extras): Fix Apex build options --- torch-extras/Dockerfile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile index a57ef7a..3ad578c 100644 --- a/torch-extras/Dockerfile +++ b/torch-extras/Dockerfile @@ -157,7 +157,8 @@ RUN --mount=type=bind,from=apex-downloader,source=/git/apex,target=apex/,rw \ packaging setuptools wheel pip && \ export CC=$(realpath -e ./compiler) && \ export MAX_JOBS=$(($(./effective_cpu_count.sh) + 2)) && \ - EXTENSIONS=$(printf -- '--config-settings "--build-option=%s" ' $( \ + export 
NVCC_APPEND_FLAGS='-diag-suppress 186,177' && \ + printf -- '--config-settings="--build-option=%s" ' $( \ echo \ --cpp_ext \ --cuda_ext \ @@ -180,11 +181,10 @@ RUN --mount=type=bind,from=apex-downloader,source=/git/apex,target=apex/,rw \ --cudnn_gbn \ --fused_conv_bias_relu; \ fi; \ - )) && \ + ) > ./apex-extensions.conf && \ + echo "Extensions: $(cat ./apex-extensions.conf)" && \ cd apex && \ - python3 -m pip wheel -w /wheels -v \ - --no-cache-dir --no-build-isolation --no-deps \ - $EXTENSIONS ./ + xargs -a ../apex-extensions.conf python3 -m pip wheel -w /wheels -v --no-cache-dir --no-build-isolation --no-deps ./ WORKDIR /wheels From 52bbae0399c2edb7b504c6b28a22b0ece5c2cfd5 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 31 Jul 2023 21:04:23 -0500 Subject: [PATCH 04/63] ci: Add option to `build.yml` to select a bigger runner [skip ci] --- .github/workflows/build.yml | 12 +++++++++++- .github/workflows/torch-extras.yml | 2 ++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 7eb4616..cfe6f5d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,6 +15,16 @@ on: tag-suffix: required: false type: string + runner: + required: false + type: choice + default: | + ["self-hosted", "Linux"] + options: + - | + ["self-hosted", "Linux"] + - | + ["self-hosted", "Linux", "chunky"] outputs: outcome: description: "The outcome of the build" @@ -29,7 +39,7 @@ on: jobs: build: name: Build Images - runs-on: [self-hosted, Linux] + runs-on: ${{ fromJSON(inputs.runner) }} outputs: outcome: ${{ steps.docker-build.outcome }} tags: ${{ steps.meta.outputs.tags }} diff --git a/.github/workflows/torch-extras.yml b/.github/workflows/torch-extras.yml index bd1decf..065e0d4 100644 --- a/.github/workflows/torch-extras.yml +++ b/.github/workflows/torch-extras.yml @@ -30,6 +30,8 @@ jobs: image-name: torch-extras folder: torch-extras tag-suffix: ${{ inputs.tag }}-flash_attn${{ matrix.flash-attn }} + 
runner: | + ["self-hosted", "Linux", "chunky"] build-args: | BASE_IMAGE=${{ inputs.base-image }} FLASH_ATTN_VERSION=${{ matrix.flash-attn }} From 750fc34f7bc842e22210b4efb63aed2899134499 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 31 Jul 2023 21:34:52 -0500 Subject: [PATCH 05/63] ci: Fix CI runner selection [skip ci] --- .github/workflows/build.yml | 14 ++++---------- .github/workflows/torch-extras.yml | 3 +-- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index cfe6f5d..603f56f 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,16 +15,10 @@ on: tag-suffix: required: false type: string - runner: + large-runner: required: false - type: choice - default: | - ["self-hosted", "Linux"] - options: - - | - ["self-hosted", "Linux"] - - | - ["self-hosted", "Linux", "chunky"] + type: boolean + default: false outputs: outcome: description: "The outcome of the build" @@ -39,7 +33,7 @@ on: jobs: build: name: Build Images - runs-on: ${{ fromJSON(inputs.runner) }} + runs-on: ${{ fromJSON(inputs.large-runner == 'true' && '["self-hosted", "Linux", "chunky"]' || '["self-hosted", "Linux"]') }} outputs: outcome: ${{ steps.docker-build.outcome }} tags: ${{ steps.meta.outputs.tags }} diff --git a/.github/workflows/torch-extras.yml b/.github/workflows/torch-extras.yml index 065e0d4..f18dde0 100644 --- a/.github/workflows/torch-extras.yml +++ b/.github/workflows/torch-extras.yml @@ -30,8 +30,7 @@ jobs: image-name: torch-extras folder: torch-extras tag-suffix: ${{ inputs.tag }}-flash_attn${{ matrix.flash-attn }} - runner: | - ["self-hosted", "Linux", "chunky"] + large-runner: true build-args: | BASE_IMAGE=${{ inputs.base-image }} FLASH_ATTN_VERSION=${{ matrix.flash-attn }} From 5fbc6bbb6dc4a50da442dbbc0f6676da425ce639 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 31 Jul 2023 21:56:06 -0500 Subject: [PATCH 06/63] ci: Fix CI runner selection ternary to use correct type Since this uses 
GitHub's inputs context rather than github.event.inputs, boolean arguments remain booleans in expressions, so inputs.large-runner == 'true' is not needed. [skip ci] --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 603f56f..53d8d87 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -33,7 +33,7 @@ on: jobs: build: name: Build Images - runs-on: ${{ fromJSON(inputs.large-runner == 'true' && '["self-hosted", "Linux", "chunky"]' || '["self-hosted", "Linux"]') }} + runs-on: ${{ fromJSON(inputs.large-runner && '["self-hosted", "Linux", "chunky"]' || '["self-hosted", "Linux"]') }} outputs: outcome: ${{ steps.docker-build.outcome }} tags: ${{ steps.meta.outputs.tags }} From 34f0098f1df5fa87352c202e16efa84077f00f6e Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 14 Aug 2023 20:09:50 -0500 Subject: [PATCH 07/63] feat(torch-extras): Add `--distributed_*` and `--group_norm` to Apex --- torch-extras/Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile index 3ad578c..ac27d45 100644 --- a/torch-extras/Dockerfile +++ b/torch-extras/Dockerfile @@ -162,9 +162,12 @@ RUN --mount=type=bind,from=apex-downloader,source=/git/apex,target=apex/,rw \ echo \ --cpp_ext \ --cuda_ext \ + --distributed_adam \ + --distributed_lamb \ --permutation_search \ --xentropy \ --focal_loss \ + --group_norm \ --index_mul_2d \ --deprecated_fused_adam \ --deprecated_fused_lamb \ From b090697fe4d87f8fa9200a308f16486b68b224b8 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 14 Aug 2023 20:11:06 -0500 Subject: [PATCH 08/63] ci: Trigger `torch-extras` builds on pushes to their own source --- .github/workflows/torch-extras.yml | 96 +++++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 3 deletions(-) diff --git a/.github/workflows/torch-extras.yml b/.github/workflows/torch-extras.yml index f18dde0..8d24573 100644 
--- a/.github/workflows/torch-extras.yml +++ b/.github/workflows/torch-extras.yml @@ -7,21 +7,95 @@ on: base-image: required: true type: string + skip-bases-check: + required: false + type: boolean + default: true workflow_dispatch: inputs: tag: - required: true + required: false description: "Tag suffix to identify the build" type: string base-image: - required: true + required: false description: "Base image for the build" type: string + skip-bases-check: + required: false + type: boolean + default: true + + push: + paths: + - "torch-extras/**" + - ".github/workflows/torch-extras.yml" + - ".github/workflows/build.yml" jobs: - build: + get-required-bases: + if: !inputs.skip-bases-check && github.event.action == 'push' + runs-on: ["self-hosted", "Linux"] + outputs: + bases-list: ${{ steps.choose-bases.outputs.list }} + steps: + - name: check-changed + run: | + CHANGED_FILES="$(git diff --name-only '${{ github.event.before }}' '${{ github.event.after }}')" && \ + { \ + echo "$CHANGED_FILES" \ + | grep -P '^(\./)?(torch/|\.github/workflows/torch(-base)?\.yml|\.github/workflows/build\.yml)' > /dev/null \ + && echo "BASE_PROVIDED=true" >> $GITHUB_OUTPUT \ + || echo "BASE_PROVIDED=false" >> $GITHUB_OUTPUT; \ + } && { \ + echo "$CHANGED_FILES" \ + | grep -P '^(\./)?(torch/|\.github/workflows/torch(-nccl)?\.yml|\.github/workflows/build\.yml)' > /dev/null \ + && echo "NCCL_PROVIDED=true" >> $GITHUB_OUTPUT \ + || echo "NCCL_PROVIDED=false" >> $GITHUB_OUTPUT; \ + } + - name: get-latest + run: | + RELEASES="$( \ + /bin/curl -f -s --oauth2-bearer "$(echo "$BEARER_TOKEN" | base64 -w 0)" \ + https://ghcr.io/v2/coreweave/ml-containers%2Ftorch/tags/list \ + | jq -r '.["tags"][]' \ + | grep -E '^[0-9a-f]{7}-(base|nccl)-' \ + )" && \ + BASE_RELEASES="$(echo "$RELEASES" | grep -E '^[0-9a-f]{7}-base-')" && \ + NCCL_RELEASES="$(echo "$RELEASES" | grep -E '^[0-9a-f]{7}-nccl-')" && \ + LATEST_BASE_COMMIT="$(echo "$BASE_RELEASES" | tail -1 | cut -c1-7)" && \ + LATEST_NCCL_COMMIT="$(echo 
"$NCCL_RELEASES" | tail -1 | cut -c1-7)" && \ + LATEST_BASE_IMAGES="$(echo "$BASE_RELEASES" | grep -F "${LATEST_BASE_COMMIT}-")" && \ + LATEST_NCCL_IMAGES="$(echo "$NCCL_RELEASES" | grep -F "${LATEST_NCCL_COMMIT}-")" && \ + echo "LATEST_BASE_IMAGES=\"$LATEST_BASE_IMAGES\"" >> $GITHUB_OUTPUT && \ + echo "LATEST_NCCL_IMAGES=\"$LATEST_NCCL_IMAGES\"" >> $GITHUB_OUTPUT + env: + BEARER_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: choose-bases + run: | + echo "list=[ $( \ + echo $( \ + if [ "$BASE_PROVIDED" = 'false' ]; then \ + echo "$LATEST_BASE_IMAGES"; \ + fi && \ + if [ "$NCCL_PROVIDED" = 'false' ]; then \ + echo "$LATEST_NCCL_IMAGES"; \ + fi; \ + ) | tr ' ' '\n' \ + | sed -E -e 's/^[0-9a-f]{7}-(.*)$/{"tag":"\1","image":"\0"}/g' \ + | tr '[:space:]' ',' \ + | sed -e 's/,$//' \ + ) ]" >> $GITHUB_OUTPUT; + env: + BASE_PROVIDED: ${{ steps.check-changed.outputs.BASE_PROVIDED }} + NCCL_PROVIDED: ${{ steps.check-changed.outputs.NCCL_PROVIDED }} + LATEST_BASE_IMAGES: ${{ steps.get-latest.outputs.LATEST_BASE_IMAGES }} + LATEST_NCCL_IMAGES: ${{ steps.get-latest.outputs.LATEST_NCCL_IMAGES }} + + build-call: + if: inputs.skip-bases-check strategy: matrix: flash-attn: [ 2.0.2, 1.0.9 ] @@ -34,3 +108,19 @@ jobs: build-args: | BASE_IMAGE=${{ inputs.base-image }} FLASH_ATTN_VERSION=${{ matrix.flash-attn }} + + build-self: + needs: get-required-bases + strategy: + matrix: + flash-attn: [ 2.0.2, 1.0.9 ] + bases: ${{ fromJSON(needs.get-required-bases.outputs.bases-list) }} + uses: ./.github/workflows/build.yml + with: + image-name: torch-extras + folder: torch-extras + tag-suffix: ${{ matrix.bases.tag }}-flash_attn${{ matrix.flash-attn }} + large-runner: true + build-args: | + BASE_IMAGE=${{ matrix.bases.image }} + FLASH_ATTN_VERSION=${{ matrix.flash-attn }} From 7d5ae3842f30f8e28131f4bbeadaf415b69c743f Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 14 Aug 2023 20:13:51 -0500 Subject: [PATCH 09/63] docs(torch-extras): Add comment about undocumented Apex build options --- 
torch-extras/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile index ac27d45..eeae8ae 100644 --- a/torch-extras/Dockerfile +++ b/torch-extras/Dockerfile @@ -152,6 +152,8 @@ RUN LIBNCCL2_VERSION=$(dpkg-query --showformat='${Version}' --show libnccl2) && libnccl-dev=$LIBNCCL2_VERSION && \ apt-get clean +# --distributed_adam, --distributed_lamb, and --group_norm aren't documented +# in the Apex README, but are defined in its setup.py config. RUN --mount=type=bind,from=apex-downloader,source=/git/apex,target=apex/,rw \ python3 -m pip install -U --no-cache-dir \ packaging setuptools wheel pip && \ From 0f0935023dc2e40c41b7f0484f83acd2922fffc3 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 14 Aug 2023 20:18:32 -0500 Subject: [PATCH 10/63] ci(torch-extras): Fix syntax error in `get-required-bases` --- .github/workflows/torch-extras.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/torch-extras.yml b/.github/workflows/torch-extras.yml index 8d24573..2ec8386 100644 --- a/.github/workflows/torch-extras.yml +++ b/.github/workflows/torch-extras.yml @@ -36,14 +36,14 @@ on: jobs: get-required-bases: - if: !inputs.skip-bases-check && github.event.action == 'push' + if: github.event.action == 'push' && !inputs.skip-bases-check runs-on: ["self-hosted", "Linux"] outputs: bases-list: ${{ steps.choose-bases.outputs.list }} steps: - name: check-changed run: | - CHANGED_FILES="$(git diff --name-only '${{ github.event.before }}' '${{ github.event.after }}')" && \ + CHANGED_FILES="$(git diff --name-only "$BEFORE_HASH" "$AFTER_HASH")" && \ { \ echo "$CHANGED_FILES" \ | grep -P '^(\./)?(torch/|\.github/workflows/torch(-base)?\.yml|\.github/workflows/build\.yml)' > /dev/null \ @@ -73,6 +73,8 @@ jobs: echo "LATEST_NCCL_IMAGES=\"$LATEST_NCCL_IMAGES\"" >> $GITHUB_OUTPUT env: BEARER_TOKEN: ${{ secrets.GITHUB_TOKEN }} + BEFORE_HASH: ${{ github.event.before }} + AFTER_HASH: ${{ 
github.event.after }} - name: choose-bases run: | echo "list=[ $( \ From c27874f0f00f5b0a586117aa610255142e9c4483 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 14 Aug 2023 20:23:03 -0500 Subject: [PATCH 11/63] ci(torch-extras): Attempt to appease GitHub Actions type coercion rules --- .github/workflows/torch-extras.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/torch-extras.yml b/.github/workflows/torch-extras.yml index 2ec8386..1bbd0a0 100644 --- a/.github/workflows/torch-extras.yml +++ b/.github/workflows/torch-extras.yml @@ -36,7 +36,7 @@ on: jobs: get-required-bases: - if: github.event.action == 'push' && !inputs.skip-bases-check + if: github.event.action == 'push' && inputs.skip-bases-check != true runs-on: ["self-hosted", "Linux"] outputs: bases-list: ${{ steps.choose-bases.outputs.list }} From 041f7fb9edbdb3277f6ba62e3f0f2099e07d7dfc Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 14 Aug 2023 20:25:08 -0500 Subject: [PATCH 12/63] ci(torch-extras): Use correct name for `github.event_name` --- .github/workflows/torch-extras.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/torch-extras.yml b/.github/workflows/torch-extras.yml index 1bbd0a0..1edb719 100644 --- a/.github/workflows/torch-extras.yml +++ b/.github/workflows/torch-extras.yml @@ -36,7 +36,7 @@ on: jobs: get-required-bases: - if: github.event.action == 'push' && inputs.skip-bases-check != true + if: github.event_name == 'push' && !inputs.skip-bases-check runs-on: ["self-hosted", "Linux"] outputs: bases-list: ${{ steps.choose-bases.outputs.list }} From dc6bb1d324b2cdb8948cd25eaeaf5c910864ee6f Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 16 Aug 2023 13:35:34 -0500 Subject: [PATCH 13/63] ci(torch-extras): Disable large runner flags, fix syntax --- .github/workflows/torch-extras.yml | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/.github/workflows/torch-extras.yml 
b/.github/workflows/torch-extras.yml index 1edb719..51df55a 100644 --- a/.github/workflows/torch-extras.yml +++ b/.github/workflows/torch-extras.yml @@ -38,6 +38,8 @@ jobs: get-required-bases: if: github.event_name == 'push' && !inputs.skip-bases-check runs-on: ["self-hosted", "Linux"] + permissions: + packages: read outputs: bases-list: ${{ steps.choose-bases.outputs.list }} steps: @@ -55,6 +57,9 @@ jobs: && echo "NCCL_PROVIDED=true" >> $GITHUB_OUTPUT \ || echo "NCCL_PROVIDED=false" >> $GITHUB_OUTPUT; \ } + env: + BEFORE_HASH: ${{ github.event.before }} + AFTER_HASH: ${{ github.event.after }} - name: get-latest run: | RELEASES="$( \ @@ -73,23 +78,23 @@ jobs: echo "LATEST_NCCL_IMAGES=\"$LATEST_NCCL_IMAGES\"" >> $GITHUB_OUTPUT env: BEARER_TOKEN: ${{ secrets.GITHUB_TOKEN }} - BEFORE_HASH: ${{ github.event.before }} - AFTER_HASH: ${{ github.event.after }} - name: choose-bases run: | - echo "list=[ $( \ - echo $( \ + TAG_PATTERN='^[0-9a-f]{7}-(.*)' && \ + JSON_REPLACE='{"tag":"\1","image":"ghcr.io/coreweave/ml-containers/torch:\0"}' && \ + TAG_TO_JSON() { sed -E -e "s@${TAG_PATTERN}@${JSON_REPLACE}@g"; } + SPLIT_TO_LINES() { xargs -n 1; } && \ + JOIN_LINES() { tr '[:space:]' ',' | sed -e 's/,$//'; } && \ + echo "list=[$( \ + ( \ if [ "$BASE_PROVIDED" = 'false' ]; then \ echo "$LATEST_BASE_IMAGES"; \ fi && \ if [ "$NCCL_PROVIDED" = 'false' ]; then \ echo "$LATEST_NCCL_IMAGES"; \ fi; \ - ) | tr ' ' '\n' \ - | sed -E -e 's/^[0-9a-f]{7}-(.*)$/{"tag":"\1","image":"\0"}/g' \ - | tr '[:space:]' ',' \ - | sed -e 's/,$//' \ - ) ]" >> $GITHUB_OUTPUT; + ) | SPLIT_TO_LINES | TAG_TO_JSON | JOIN_LINES \ + )]" >> $GITHUB_OUTPUT; env: BASE_PROVIDED: ${{ steps.check-changed.outputs.BASE_PROVIDED }} NCCL_PROVIDED: ${{ steps.check-changed.outputs.NCCL_PROVIDED }} @@ -106,13 +111,14 @@ jobs: image-name: torch-extras folder: torch-extras tag-suffix: ${{ inputs.tag }}-flash_attn${{ matrix.flash-attn }} - large-runner: true + large-runner: false build-args: | BASE_IMAGE=${{ inputs.base-image 
}} FLASH_ATTN_VERSION=${{ matrix.flash-attn }} build-self: needs: get-required-bases + if: needs.get-required-bases.outputs.bases-list && needs.get-required-bases.outputs.bases-list != '[]' strategy: matrix: flash-attn: [ 2.0.2, 1.0.9 ] @@ -122,7 +128,7 @@ jobs: image-name: torch-extras folder: torch-extras tag-suffix: ${{ matrix.bases.tag }}-flash_attn${{ matrix.flash-attn }} - large-runner: true + large-runner: false build-args: | BASE_IMAGE=${{ matrix.bases.image }} FLASH_ATTN_VERSION=${{ matrix.flash-attn }} From 614b8161b0b25a4d99a13bbe7473fcdb3c2ddb4c Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 16 Aug 2023 13:40:44 -0500 Subject: [PATCH 14/63] ci(torch-extras): Check out repo before using `git diff` --- .github/workflows/torch-extras.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/torch-extras.yml b/.github/workflows/torch-extras.yml index 51df55a..905b2a5 100644 --- a/.github/workflows/torch-extras.yml +++ b/.github/workflows/torch-extras.yml @@ -43,6 +43,7 @@ jobs: outputs: bases-list: ${{ steps.choose-bases.outputs.list }} steps: + - uses: actions/checkout@v3 - name: check-changed run: | CHANGED_FILES="$(git diff --name-only "$BEFORE_HASH" "$AFTER_HASH")" && \ From aba8d6a0e1609c8a5675b2f1488ce9664f53d3c2 Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 16 Aug 2023 13:44:12 -0500 Subject: [PATCH 15/63] ci(torch-extras): Fetch more history for diff --- .github/workflows/torch-extras.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/torch-extras.yml b/.github/workflows/torch-extras.yml index 905b2a5..3317d4e 100644 --- a/.github/workflows/torch-extras.yml +++ b/.github/workflows/torch-extras.yml @@ -44,6 +44,8 @@ jobs: bases-list: ${{ steps.choose-bases.outputs.list }} steps: - uses: actions/checkout@v3 + with: + fetch-depth: 0 - name: check-changed run: | CHANGED_FILES="$(git diff --name-only "$BEFORE_HASH" "$AFTER_HASH")" && \ From 17bec641fe27eba9921e7f12f3692e39650ce367 Mon Sep 17 00:00:00 2001 From: Eta 
Date: Wed, 16 Aug 2023 13:48:54 -0500 Subject: [PATCH 16/63] ci(torch-extras): Store only single-line strings in `$GITHUB_OUTPUT` --- .github/workflows/torch-extras.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/torch-extras.yml b/.github/workflows/torch-extras.yml index 3317d4e..7e8275e 100644 --- a/.github/workflows/torch-extras.yml +++ b/.github/workflows/torch-extras.yml @@ -77,8 +77,8 @@ jobs: LATEST_NCCL_COMMIT="$(echo "$NCCL_RELEASES" | tail -1 | cut -c1-7)" && \ LATEST_BASE_IMAGES="$(echo "$BASE_RELEASES" | grep -F "${LATEST_BASE_COMMIT}-")" && \ LATEST_NCCL_IMAGES="$(echo "$NCCL_RELEASES" | grep -F "${LATEST_NCCL_COMMIT}-")" && \ - echo "LATEST_BASE_IMAGES=\"$LATEST_BASE_IMAGES\"" >> $GITHUB_OUTPUT && \ - echo "LATEST_NCCL_IMAGES=\"$LATEST_NCCL_IMAGES\"" >> $GITHUB_OUTPUT + echo "LATEST_BASE_IMAGES=\"$(echo $LATEST_BASE_IMAGES)\"" >> $GITHUB_OUTPUT && \ + echo "LATEST_NCCL_IMAGES=\"$(echo $LATEST_NCCL_IMAGES)\"" >> $GITHUB_OUTPUT env: BEARER_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: choose-bases From 5014d248f490a57393c62c19b0f86ba91be88d5a Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 16 Aug 2023 13:53:19 -0500 Subject: [PATCH 17/63] ci(torch-extras): Split step IDs and names --- .github/workflows/torch-extras.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/torch-extras.yml b/.github/workflows/torch-extras.yml index 7e8275e..3a3aa3e 100644 --- a/.github/workflows/torch-extras.yml +++ b/.github/workflows/torch-extras.yml @@ -46,7 +46,8 @@ jobs: - uses: actions/checkout@v3 with: fetch-depth: 0 - - name: check-changed + - name: Check changed files + id: check-changed run: | CHANGED_FILES="$(git diff --name-only "$BEFORE_HASH" "$AFTER_HASH")" && \ { \ @@ -63,7 +64,8 @@ jobs: env: BEFORE_HASH: ${{ github.event.before }} AFTER_HASH: ${{ github.event.after }} - - name: get-latest + - name: Get latest torch container releases + id: get-latest run: | RELEASES="$( \ 
/bin/curl -f -s --oauth2-bearer "$(echo "$BEARER_TOKEN" | base64 -w 0)" \ @@ -81,7 +83,8 @@ jobs: echo "LATEST_NCCL_IMAGES=\"$(echo $LATEST_NCCL_IMAGES)\"" >> $GITHUB_OUTPUT env: BEARER_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: choose-bases + - name: Choose which torch containers to use as a build base + id: choose-bases run: | TAG_PATTERN='^[0-9a-f]{7}-(.*)' && \ JSON_REPLACE='{"tag":"\1","image":"ghcr.io/coreweave/ml-containers/torch:\0"}' && \ From 78a3b00eb63e41c9cc5ea270f06bccbdfe4c182e Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 16 Aug 2023 13:56:03 -0500 Subject: [PATCH 18/63] ci(torch-extras): Use single quotes around `$GITHUB_OUTPUT` values --- .github/workflows/torch-extras.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/torch-extras.yml b/.github/workflows/torch-extras.yml index 3a3aa3e..e37edc3 100644 --- a/.github/workflows/torch-extras.yml +++ b/.github/workflows/torch-extras.yml @@ -91,7 +91,7 @@ jobs: TAG_TO_JSON() { sed -E -e "s@${TAG_PATTERN}@${JSON_REPLACE}@g"; } SPLIT_TO_LINES() { xargs -n 1; } && \ JOIN_LINES() { tr '[:space:]' ',' | sed -e 's/,$//'; } && \ - echo "list=[$( \ + echo "list='[$( \ ( \ if [ "$BASE_PROVIDED" = 'false' ]; then \ echo "$LATEST_BASE_IMAGES"; \ @@ -100,7 +100,7 @@ jobs: echo "$LATEST_NCCL_IMAGES"; \ fi; \ ) | SPLIT_TO_LINES | TAG_TO_JSON | JOIN_LINES \ - )]" >> $GITHUB_OUTPUT; + )]'" >> $GITHUB_OUTPUT; env: BASE_PROVIDED: ${{ steps.check-changed.outputs.BASE_PROVIDED }} NCCL_PROVIDED: ${{ steps.check-changed.outputs.NCCL_PROVIDED }} From e31df1fa0d505feec08ca55782637af53cbd409a Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 16 Aug 2023 14:32:17 -0500 Subject: [PATCH 19/63] ci(torch-extras): Fix `$GITHUB_OUTPUT` quoting again --- .github/workflows/torch-extras.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/torch-extras.yml 
a/.github/workflows/torch-extras.yml +++ b/.github/workflows/torch-extras.yml @@ -79,8 +79,8 @@ jobs: LATEST_NCCL_COMMIT="$(echo "$NCCL_RELEASES" | tail -1 | cut -c1-7)" && \ LATEST_BASE_IMAGES="$(echo "$BASE_RELEASES" | grep -F "${LATEST_BASE_COMMIT}-")" && \ LATEST_NCCL_IMAGES="$(echo "$NCCL_RELEASES" | grep -F "${LATEST_NCCL_COMMIT}-")" && \ - echo "LATEST_BASE_IMAGES=\"$(echo $LATEST_BASE_IMAGES)\"" >> $GITHUB_OUTPUT && \ - echo "LATEST_NCCL_IMAGES=\"$(echo $LATEST_NCCL_IMAGES)\"" >> $GITHUB_OUTPUT + echo "LATEST_BASE_IMAGES=$(echo $LATEST_BASE_IMAGES)" >> $GITHUB_OUTPUT && \ + echo "LATEST_NCCL_IMAGES=$(echo $LATEST_NCCL_IMAGES)" >> $GITHUB_OUTPUT env: BEARER_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Choose which torch containers to use as a build base @@ -91,7 +91,7 @@ jobs: TAG_TO_JSON() { sed -E -e "s@${TAG_PATTERN}@${JSON_REPLACE}@g"; } SPLIT_TO_LINES() { xargs -n 1; } && \ JOIN_LINES() { tr '[:space:]' ',' | sed -e 's/,$//'; } && \ - echo "list='[$( \ + echo "list=[$( \ ( \ if [ "$BASE_PROVIDED" = 'false' ]; then \ echo "$LATEST_BASE_IMAGES"; \ @@ -100,7 +100,7 @@ jobs: echo "$LATEST_NCCL_IMAGES"; \ fi; \ ) | SPLIT_TO_LINES | TAG_TO_JSON | JOIN_LINES \ - )]'" >> $GITHUB_OUTPUT; + )]" >> $GITHUB_OUTPUT; env: BASE_PROVIDED: ${{ steps.check-changed.outputs.BASE_PROVIDED }} NCCL_PROVIDED: ${{ steps.check-changed.outputs.NCCL_PROVIDED }} From 8802f432ce8c017e1203f2876a4c075001d12622 Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 16 Aug 2023 14:43:40 -0500 Subject: [PATCH 20/63] build(torch-extras): Update Apex to `38a1269` for `--group_norm` --- torch-extras/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile index eeae8ae..8d39bd8 100644 --- a/torch-extras/Dockerfile +++ b/torch-extras/Dockerfile @@ -3,7 +3,7 @@ ARG BASE_IMAGE ARG DEEPSPEED_VERSION="0.9.4" ARG FLASH_ATTN_VERSION="2.0.2" -ARG APEX_COMMIT="7b2e71b0d4013f8e2f9f1c8dd21980ff1d76f1b6" +ARG 
APEX_COMMIT="38a12698bc3cc95987bca270bcd6d025bb0be346" FROM alpine/git:2.36.3 as flash-attn-downloader WORKDIR /git From 6671b446335dd0d9a5a7f90d057527017d1b1d38 Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 16 Aug 2023 15:44:05 -0500 Subject: [PATCH 21/63] ci(torch-extras): Fix `workflow_dispatch` for auto-detected bases --- .github/workflows/torch-extras.yml | 48 ++++++++++++++++++------------ 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/.github/workflows/torch-extras.yml b/.github/workflows/torch-extras.yml index 9f87ac4..96b8c71 100644 --- a/.github/workflows/torch-extras.yml +++ b/.github/workflows/torch-extras.yml @@ -24,6 +24,7 @@ on: type: string skip-bases-check: required: false + description: "If false, build from the most recent releases from the main branch rather than a specific image" type: boolean default: true @@ -36,7 +37,7 @@ on: jobs: get-required-bases: - if: github.event_name == 'push' && !inputs.skip-bases-check + if: !inputs.skip-bases-check runs-on: ["self-hosted", "Linux"] permissions: packages: read @@ -46,25 +47,32 @@ jobs: - uses: actions/checkout@v3 with: fetch-depth: 0 - - name: Check changed files + - name: Check if torch-extras needs to be rebuilt from previous bases id: check-changed run: | - CHANGED_FILES="$(git diff --name-only "$BEFORE_HASH" "$AFTER_HASH")" && \ - { \ - echo "$CHANGED_FILES" \ - | grep -P '^(\./)?(torch/|\.github/workflows/torch(-base)?\.yml|\.github/workflows/build\.yml)' > /dev/null \ - && echo "BASE_PROVIDED=true" >> $GITHUB_OUTPUT \ - || echo "BASE_PROVIDED=false" >> $GITHUB_OUTPUT; \ - } && { \ - echo "$CHANGED_FILES" \ - | grep -P '^(\./)?(torch/|\.github/workflows/torch(-nccl)?\.yml|\.github/workflows/build\.yml)' > /dev/null \ - && echo "NCCL_PROVIDED=true" >> $GITHUB_OUTPUT \ - || echo "NCCL_PROVIDED=false" >> $GITHUB_OUTPUT; \ - } + if [ "$EVENT_NAME" = 'push' ]; then \ + CHANGED_FILES="$(git diff --name-only "$BEFORE_HASH" "$AFTER_HASH")" && \ + { \ + echo "$CHANGED_FILES" \ + | grep 
-P '^(\./)?(torch/|\.github/workflows/torch(-base)?\.yml|\.github/workflows/build\.yml)' > /dev/null \ + && echo "BASE_PROVIDED=true" >> "$GITHUB_OUTPUT" \ + || echo "BASE_PROVIDED=false" >> "$GITHUB_OUTPUT"; \ + } && { \ + echo "$CHANGED_FILES" \ + | grep -P '^(\./)?(torch/|\.github/workflows/torch(-nccl)?\.yml|\.github/workflows/build\.yml)' > /dev/null \ + && echo "NCCL_PROVIDED=true" >> "$GITHUB_OUTPUT" \ + || echo "NCCL_PROVIDED=false" >> "$GITHUB_OUTPUT"; \ + }; \ + else \ + echo "BASE_PROVIDED=false" >> "$GITHUB_OUTPUT" && \ + echo "NCCL_PROVIDED=false" >> "$GITHUB_OUTPUT"; + fi env: + EVENT_NAME: ${{ github.event_name }} BEFORE_HASH: ${{ github.event.before }} AFTER_HASH: ${{ github.event.after }} - name: Get latest torch container releases + if: steps.check-changed.outputs.BASE_PROVIDED != 'true' || steps.check-changed.outputs.NCCL_PROVIDED != 'true' id: get-latest run: | RELEASES="$( \ @@ -79,11 +87,12 @@ jobs: LATEST_NCCL_COMMIT="$(echo "$NCCL_RELEASES" | tail -1 | cut -c1-7)" && \ LATEST_BASE_IMAGES="$(echo "$BASE_RELEASES" | grep -F "${LATEST_BASE_COMMIT}-")" && \ LATEST_NCCL_IMAGES="$(echo "$NCCL_RELEASES" | grep -F "${LATEST_NCCL_COMMIT}-")" && \ - echo "LATEST_BASE_IMAGES=$(echo $LATEST_BASE_IMAGES)" >> $GITHUB_OUTPUT && \ - echo "LATEST_NCCL_IMAGES=$(echo $LATEST_NCCL_IMAGES)" >> $GITHUB_OUTPUT + echo "LATEST_BASE_IMAGES=$(echo $LATEST_BASE_IMAGES)" >> "$GITHUB_OUTPUT" && \ + echo "LATEST_NCCL_IMAGES=$(echo $LATEST_NCCL_IMAGES)" >> "$GITHUB_OUTPUT" env: BEARER_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Choose which torch containers to use as a build base + if: steps.check-changed.outputs.BASE_PROVIDED != 'true' || steps.check-changed.outputs.NCCL_PROVIDED != 'true' id: choose-bases run: | TAG_PATTERN='^[0-9a-f]{7}-(.*)' && \ @@ -91,16 +100,19 @@ jobs: TAG_TO_JSON() { sed -E -e "s@${TAG_PATTERN}@${JSON_REPLACE}@g"; } SPLIT_TO_LINES() { xargs -n 1; } && \ JOIN_LINES() { tr '[:space:]' ',' | sed -e 's/,$//'; } && \ + echo '## Pre-existing 
`ghcr.io/coreweave/ml-containers/torch` images to build from' >> "$GITHUB_STEP_SUMMARY" && \ echo "list=[$( \ ( \ if [ "$BASE_PROVIDED" = 'false' ]; then \ + echo "$LATEST_BASE_IMAGES" | xargs -n 1 echo '-' >> "$GITHUB_STEP_SUMMARY" && \ echo "$LATEST_BASE_IMAGES"; \ fi && \ if [ "$NCCL_PROVIDED" = 'false' ]; then \ + echo "$LATEST_NCCL_IMAGES" | xargs -n 1 echo '-' >> "$GITHUB_STEP_SUMMARY" && \ echo "$LATEST_NCCL_IMAGES"; \ fi; \ ) | SPLIT_TO_LINES | TAG_TO_JSON | JOIN_LINES \ - )]" >> $GITHUB_OUTPUT; + )]" >> "$GITHUB_OUTPUT"; env: BASE_PROVIDED: ${{ steps.check-changed.outputs.BASE_PROVIDED }} NCCL_PROVIDED: ${{ steps.check-changed.outputs.NCCL_PROVIDED }} @@ -117,7 +129,6 @@ jobs: image-name: torch-extras folder: torch-extras tag-suffix: ${{ inputs.tag }}-flash_attn${{ matrix.flash-attn }} - large-runner: false build-args: | BASE_IMAGE=${{ inputs.base-image }} FLASH_ATTN_VERSION=${{ matrix.flash-attn }} @@ -134,7 +145,6 @@ jobs: image-name: torch-extras folder: torch-extras tag-suffix: ${{ matrix.bases.tag }}-flash_attn${{ matrix.flash-attn }} - large-runner: false build-args: | BASE_IMAGE=${{ matrix.bases.image }} FLASH_ATTN_VERSION=${{ matrix.flash-attn }} From 9e2b503d38f188831da294aabd6d68be64442a68 Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 16 Aug 2023 15:44:40 -0500 Subject: [PATCH 22/63] ci: Remove `large-runner` label distinction [skip ci] --- .github/workflows/build.yml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 53d8d87..b7d4599 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,10 +15,6 @@ on: tag-suffix: required: false type: string - large-runner: - required: false - type: boolean - default: false outputs: outcome: description: "The outcome of the build" @@ -33,7 +29,7 @@ on: jobs: build: name: Build Images - runs-on: ${{ fromJSON(inputs.large-runner && '["self-hosted", "Linux", "chunky"]' || '["self-hosted", "Linux"]') }} + 
runs-on: [ self-hosted, Linux ] outputs: outcome: ${{ steps.docker-build.outcome }} tags: ${{ steps.meta.outputs.tags }} From c5e7997b203822ca0dfd2361309035e20d32118f Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 16 Aug 2023 15:54:35 -0500 Subject: [PATCH 23/63] ci(torch-extras): Fix `get-required-bases` conditional syntax [skip ci] --- .github/workflows/torch-extras.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/torch-extras.yml b/.github/workflows/torch-extras.yml index 96b8c71..8ed83cf 100644 --- a/.github/workflows/torch-extras.yml +++ b/.github/workflows/torch-extras.yml @@ -24,7 +24,7 @@ on: type: string skip-bases-check: required: false - description: "If false, build from the most recent releases from the main branch rather than a specific image" + description: "Build from one specific image rather than the most recent releases from the main branch" type: boolean default: true @@ -37,7 +37,7 @@ on: jobs: get-required-bases: - if: !inputs.skip-bases-check + if: inputs.skip-bases-check != true runs-on: ["self-hosted", "Linux"] permissions: packages: read From 6ba377d79fe08d3dba8def39f7228af2d3c78318 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 21 Aug 2023 17:35:11 -0500 Subject: [PATCH 24/63] feat(torch-nightly): Add `nightly/torch` containers --- .github/workflows/torch-base.yml | 58 ++++++++++++++++-- .github/workflows/torch-extras.yml | 17 ++++-- .github/workflows/torch-nccl.yml | 58 ++++++++++++++++-- .github/workflows/torch-nightly.yml | 95 +++++++++++++++++++++++++++++ .github/workflows/torch.yml | 18 +++++- torch/Dockerfile | 73 ++++++++++++++++------ 6 files changed, 289 insertions(+), 30 deletions(-) create mode 100644 .github/workflows/torch-nightly.yml diff --git a/.github/workflows/torch-base.yml b/.github/workflows/torch-base.yml index 64b3af9..39bde3f 100644 --- a/.github/workflows/torch-base.yml +++ b/.github/workflows/torch-base.yml @@ -1,5 +1,52 @@ +name: torch-base + on: + workflow_call: + 
inputs: + torch-version: + required: false + type: string + torchvision-version: + required: false + type: string + torchaudio-version: + required: false + type: string + triton-version: + required: false + type: string + image-name: + required: false + type: string + image-tag-suffix: + required: false + type: string workflow_dispatch: + inputs: + torch-version: + required: false + description: "Tagged version number from pytorch/pytorch to build" + type: string + torchvision-version: + required: false + description: "Tagged version number from pytorch/vision to build" + type: string + torchaudio-version: + required: false + description: "Tagged version number from pytorch/audio to build" + type: string + triton-version: + required: false + description: "Tagged version number from openai/triton to build" + type: string + image-name: + required: false + description: "Custom name under which to publish the resulting container" + type: string + image-tag-suffix: + required: false + description: "Custom tag suffix listing library versions under which to publish the resulting container" + type: string push: paths: - "torch/**" @@ -14,16 +61,19 @@ jobs: matrix: cuda: [12.1.1, 12.0.1, 11.8.0] include: - - torch: 2.0.1 - vision: 0.15.2 - audio: 2.0.2 + - torch: ${{ inputs.torch-version || '2.0.1' }} + vision: ${{ inputs.torchvision-version || '0.15.2' }} + audio: ${{ inputs.torchaudio-version || '2.0.2' }} + triton: ${{ inputs.triton-version }} uses: ./.github/workflows/torch.yml with: - tag: ${{ format('base-cuda{0}-torch{1}-vision{2}-audio{3}', matrix.cuda, matrix.torch, matrix.vision, matrix.audio) }} + image-name: ${{ inputs.image-name }} + tag: ${{ format('{0}-{1}', format('base-cuda{0}', matrix.cuda), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}', matrix.torch, matrix.vision, matrix.audio)) }} builder-base-image: nvidia/cuda:${{ matrix.cuda }}-devel-ubuntu20.04 base-image: nvidia/cuda:${{ matrix.cuda }}-base-ubuntu20.04 torch-version: ${{ 
matrix.torch }} torchvision-version: ${{ matrix.vision }} torchaudio-version: ${{ matrix.audio }} + triton-version: ${{ matrix.triton }} build-extras: true diff --git a/.github/workflows/torch-extras.yml b/.github/workflows/torch-extras.yml index 8ed83cf..ff88d19 100644 --- a/.github/workflows/torch-extras.yml +++ b/.github/workflows/torch-extras.yml @@ -7,6 +7,9 @@ on: base-image: required: true type: string + image-name: + required: false + type: string skip-bases-check: required: false type: boolean @@ -22,6 +25,10 @@ on: required: false description: "Base image for the build" type: string + image-name: + required: false + description: "Custom name under which to publish the resulting container" + type: string skip-bases-check: required: false description: "Build from one specific image rather than the most recent releases from the main branch" @@ -95,9 +102,11 @@ jobs: if: steps.check-changed.outputs.BASE_PROVIDED != 'true' || steps.check-changed.outputs.NCCL_PROVIDED != 'true' id: choose-bases run: | - TAG_PATTERN='^[0-9a-f]{7}-(.*)' && \ - JSON_REPLACE='{"tag":"\1","image":"ghcr.io/coreweave/ml-containers/torch:\0"}' && \ - TAG_TO_JSON() { sed -E -e "s@${TAG_PATTERN}@${JSON_REPLACE}@g"; } + TAG_TO_JSON() { + TAG_PATTERN='^[0-9a-f]{7}-(.*)'; + JSON_REPLACE='{"tag":"\1","image":"ghcr.io/coreweave/ml-containers/torch:\0"}'; + sed -E -e "s@${TAG_PATTERN}@${JSON_REPLACE}@g"; + } && \ SPLIT_TO_LINES() { xargs -n 1; } && \ JOIN_LINES() { tr '[:space:]' ',' | sed -e 's/,$//'; } && \ echo '## Pre-existing `ghcr.io/coreweave/ml-containers/torch` images to build from' >> "$GITHUB_STEP_SUMMARY" && \ @@ -142,7 +151,7 @@ jobs: bases: ${{ fromJSON(needs.get-required-bases.outputs.bases-list) }} uses: ./.github/workflows/build.yml with: - image-name: torch-extras + image-name: ${{ inputs.image-name || 'torch-extras' }} folder: torch-extras tag-suffix: ${{ matrix.bases.tag }}-flash_attn${{ matrix.flash-attn }} build-args: | diff --git a/.github/workflows/torch-nccl.yml 
b/.github/workflows/torch-nccl.yml index 7c68903..76d1387 100644 --- a/.github/workflows/torch-nccl.yml +++ b/.github/workflows/torch-nccl.yml @@ -1,5 +1,52 @@ +name: torch-nccl + on: + workflow_call: + inputs: + torch-version: + required: false + type: string + torchvision-version: + required: false + type: string + torchaudio-version: + required: false + type: string + triton-version: + required: false + type: string + image-name: + required: false + type: string + image-tag-suffix: + required: false + type: string workflow_dispatch: + inputs: + torch-version: + required: false + description: "Tagged version number from pytorch/pytorch to build" + type: string + torchvision-version: + required: false + description: "Tagged version number from pytorch/vision to build" + type: string + torchaudio-version: + required: false + description: "Tagged version number from pytorch/audio to build" + type: string + triton-version: + required: false + description: "Tagged version number from openai/triton to build" + type: string + image-name: + required: false + description: "Custom name under which to publish the resulting container" + type: string + image-tag-suffix: + required: false + description: "Custom tag suffix listing library versions under which to publish the resulting container" + type: string push: paths: - "torch/**" @@ -23,16 +70,19 @@ jobs: nccl: 2.16.2-1 nccl-tests-hash: 471f0db include: - - torch: 2.0.1 - vision: 0.15.2 - audio: 2.0.2 + - torch: ${{ inputs.torch-version || '2.0.1' }} + vision: ${{ inputs.torchvision-version || '0.15.2' }} + audio: ${{ inputs.torchaudio-version || '2.0.2' }} + triton: ${{ inputs.triton-version }} uses: ./.github/workflows/torch.yml with: - tag: ${{ format('nccl-cuda{0}-nccl{1}-torch{2}-vision{3}-audio{4}', matrix.image.cuda, matrix.image.nccl, matrix.torch, matrix.vision, matrix.audio) }} + image-name: ${{ inputs.image-name }} + tag: ${{ format('{0}-{1}', format('nccl-cuda{0}-nccl{1}', matrix.image.cuda, matrix.image.nccl), 
inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}', matrix.torch, matrix.vision, matrix.audio)) }} builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-cudnn8-devel-ubuntu20.04-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }} base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-cudnn8-devel-ubuntu20.04-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }} torch-version: ${{ matrix.torch }} torchvision-version: ${{ matrix.vision }} torchaudio-version: ${{ matrix.audio }} + triton-version: ${{ matrix.triton }} build-extras: true diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml new file mode 100644 index 0000000..d0dc47d --- /dev/null +++ b/.github/workflows/torch-nightly.yml @@ -0,0 +1,95 @@ +name: torch-nightly + +on: + workflow_dispatch: + schedule: + # At 05:00 UTC (midnight EST) + - cron: "0 5 * * *" + push: + paths: +# - "torch/**" + - ".github/workflows/torch-nightly.yml" +# - ".github/workflows/torch.yml" +# - ".github/workflows/build.yml" + + +jobs: + get-nightly-commit-hash: + runs-on: [ self-hosted, Linux ] + outputs: + pytorch-commit: ${{ steps.get-hash.outputs.pytorch-commit }} + triton-commit: ${{ steps.get-hash.outputs.triton-commit }} + torchvision-commit: ${{ steps.get-hash.outputs.torchvision-commit }} + torchaudio-commit: ${{ steps.get-hash.outputs.torchaudio-commit }} + version-string: ${{ steps.get-hash.outputs.version-string }} + steps: + - name: Get latest commit hashes + id: get-hash + run: | + set -e; + + FORMAT_COMMIT_LINK() { + echo "[$(echo "$2" | cut -c1-7)](https://github.com/$1/tree/$2)"; + }; + + CLONE() { + git clone --filter=blob:none --no-checkout --depth=1 \ + "https://github.com/$1" \ + "$2" > /dev/null 2> /dev/null && \ + local COMMIT=$(git -C "$2" rev-parse HEAD) && \ + echo "Latest $1 commit: $( + FORMAT_COMMIT_LINK $1 $COMMIT + )" >> "$GITHUB_STEP_SUMMARY" && \ + echo $COMMIT; + }; + + GET_VERSION() { + git -C "$1" 
show HEAD:version.txt 2> /dev/null; + }; + + PYTORCH_COMMIT="$(CLONE pytorch/pytorch pytorch-git)"; + PYTORCH_VERSION="$(GET_VERSION pytorch-git)"; + TRITON_COMMIT_FILE=".ci/docker/ci_commit_pins/triton.txt"; + TRITON_COMMIT="$(git -C pytorch-git show "HEAD:$TRITON_COMMIT_FILE" 2> /dev/null)"; + rm -rf pytorch-git; + + echo "Corresponding Triton commit: $( + FORMAT_COMMIT_LINK openai/triton $TRITON_COMMIT + )" >> "$GITHUB_STEP_SUMMARY"; + + TORCHVISION_COMMIT="$(CLONE pytorch/vision torchvision-git)"; + TORCHVISION_VERSION="$(GET_VERSION torchvision-git)"; + rm -rf torchvision-git; + + TORCHAUDIO_COMMIT="$(CLONE pytorch/audio torchaudio-git)"; + TORCHAUDIO_VERSION="$(GET_VERSION torchaudio-git)"; + rm -rf torchaudio-git; + + echo "pytorch-commit=$PYTORCH_COMMIT" >> "$GITHUB_OUTPUT"; + echo "triton-commit=$TRITON_COMMIT" >> "$GITHUB_OUTPUT"; + echo "torchvision-commit=$TORCHVISION_COMMIT" >> "$GITHUB_OUTPUT"; + echo "torchaudio-commit=$TORCHAUDIO_COMMIT" >> "$GITHUB_OUTPUT"; + + printf -- 'version-string=torch%s-vision%s-audio%s\n' \ + "$PYTORCH_VERSION" "$TORCHVISION_VERSION" "$TORCHAUDIO_VERSION" \ + >> "$GITHUB_OUTPUT"; + build-base: + needs: get-nightly-commit-hash + uses: ./.github/workflows/torch-base.yml + with: + image-name: nightly/torch + image-tag-suffix: ${{ needs.get-nightly-commit-hash.outputs.version-string }} + torch-version: ${{ needs.get-nightly-commit-hash.outputs.pytorch-commit }} + torchvision-version: ${{ needs.get-nightly-commit-hash.outputs.torchvision-commit }} + torchaudio-version: ${{ needs.get-nightly-commit-hash.outputs.torchaudio-commit }} + triton-version: ${{ needs.get-nightly-commit-hash.outputs.triton-commit }} + build-nccl: + needs: get-nightly-commit-hash + uses: ./.github/workflows/torch-nccl.yml + with: + image-name: nightly/torch + image-tag-suffix: ${{ needs.get-nightly-commit-hash.outputs.version-string }} + torch-version: ${{ needs.get-nightly-commit-hash.outputs.pytorch-commit }} + torchvision-version: ${{ 
needs.get-nightly-commit-hash.outputs.torchvision-commit }} + torchaudio-version: ${{ needs.get-nightly-commit-hash.outputs.torchaudio-commit }} + triton-version: ${{ needs.get-nightly-commit-hash.outputs.triton-commit }} diff --git a/.github/workflows/torch.yml b/.github/workflows/torch.yml index d61fd03..5c995bb 100644 --- a/.github/workflows/torch.yml +++ b/.github/workflows/torch.yml @@ -19,10 +19,16 @@ on: torchaudio-version: required: true type: string + triton-version: + required: false + type: string cuda-arch-support: required: false type: string default: "7.0 7.5 8.0 8.6 8.9 9.0+PTX" + image-name: + required: false + type: string build-extras: required: false type: boolean @@ -54,11 +60,19 @@ on: required: true description: "Tagged version number from pytorch/audio to build" type: string + triton-version: + required: false + description: "Tagged version number from openai/triton to build" + type: string cuda-arch-support: required: false description: "Space-separated list of CUDA architectures to support" type: string default: "7.0 7.5 8.0 8.6 8.9 9.0+PTX" + image-name: + required: false + description: "Custom name under which to publish the resulting container" + type: string build-extras: required: false description: "Whether to build and push a torch-extras container as well" @@ -69,7 +83,7 @@ jobs: build: uses: ./.github/workflows/build.yml with: - image-name: torch + image-name: ${{ inputs.image-name || 'torch' }} folder: torch tag-suffix: ${{ inputs.tag }} build-args: | @@ -80,6 +94,7 @@ jobs: BUILD_TORCH_VISION_VERSION=${{ inputs.torchvision-version }} BUILD_TORCH_AUDIO_VERSION=${{ inputs.torchaudio-version }} ${{ inputs.cuda-arch-support && format('BUILD_TORCH_CUDA_ARCH_LIST={0}', inputs.cuda-arch-support) || '' }} + ${{ inputs.triton-version && format('BUILD_TRITON_VERSION={0}', inputs.triton-version) || '' }} build-extras: if: inputs.build-extras needs: build @@ -87,3 +102,4 @@ jobs: with: tag: ${{ inputs.tag }} base-image: ${{ 
needs.build.outputs.tags }} + image-name: ${{ inputs.image-name && format('{0}-extras', inputs.image-name) || '' }} diff --git a/torch/Dockerfile b/torch/Dockerfile index 9f2f2d2..ae2f805 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -1,37 +1,67 @@ -# syntax=docker/dockerfile:1.2 +# syntax=docker/dockerfile:1.4 ARG BUILDER_BASE_IMAGE="nvidia/cuda:12.0.1-devel-ubuntu20.04" ARG FINAL_BASE_IMAGE="nvidia/cuda:12.0.1-base-ubuntu20.04" ARG BUILD_TORCH_VERSION="2.0.1" ARG BUILD_TORCH_VISION_VERSION="0.15.2" ARG BUILD_TORCH_AUDIO_VERSION="2.0.2" +ARG BUILD_TRITON_VERSION="" ARG BUILD_TORCH_CUDA_ARCH_LIST="6.0 6.1 6.2 7.0 7.2 7.5 8.0 8.6 8.9 9.0+PTX" # 8.7 is supported in the PyTorch main branch, but not 2.0.0 # Clone PyTorch repositories independently from all other build steps # for cache-friendliness and parallelization -FROM alpine/git:2.36.3 as pytorch-downloader +FROM alpine/git:2.40.1 as downloader-base WORKDIR /git +RUN git config --global advice.detachedHead false + +COPY < Date: Mon, 21 Aug 2023 17:52:45 -0500 Subject: [PATCH 25/63] fix(torch): Fix here-document syntax --- torch/Dockerfile | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index ae2f805..00da01d 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -15,23 +15,22 @@ FROM alpine/git:2.40.1 as downloader-base WORKDIR /git RUN git config --global advice.detachedHead false -COPY < Date: Wed, 23 Aug 2023 14:29:06 -0500 Subject: [PATCH 26/63] build(torch): Increase `ccache` size, parallelize builds better [skip ci] --- .github/workflows/torch.yml | 2 +- torch/Dockerfile | 19 ++++++++++++++++--- torch/effective_cpu_count.sh | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 4 deletions(-) create mode 100755 torch/effective_cpu_count.sh diff --git a/.github/workflows/torch.yml b/.github/workflows/torch.yml index 5c995bb..f84a173 100644 --- a/.github/workflows/torch.yml +++ 
b/.github/workflows/torch.yml @@ -87,7 +87,7 @@ jobs: folder: torch tag-suffix: ${{ inputs.tag }} build-args: | - BUILD_CCACHE_SIZE=1Gi + BUILD_CCACHE_SIZE=20Gi BUILDER_BASE_IMAGE=${{ inputs.builder-base-image }} FINAL_BASE_IMAGE=${{ inputs.base-image }} BUILD_TORCH_VERSION=${{ inputs.torch-version }} diff --git a/torch/Dockerfile b/torch/Dockerfile index 00da01d..8a0951d 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -71,17 +71,25 @@ ARG BUILD_CCACHE_SIZE="1Gi" RUN apt-get -qq update && apt-get -qq install -y \ libncurses5 python3 python3-pip git apt-utils ssh ca-certificates \ libpng-dev libjpeg-dev pkg-config python3-distutils python3-numpy \ - build-essential ninja-build ccache gcc-10 g++-10 lld && \ + build-essential ninja-build gcc-10 g++-10 lld && \ update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \ update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \ update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 10 && \ update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 10 && \ update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld 1 && \ - ccache -M "${BUILD_CCACHE_SIZE}" && \ - ccache -F 0 && \ pip3 install --no-cache-dir --upgrade pip && \ apt-get clean +RUN mkdir /tmp/ccache-install && \ + pushd /tmp/ccache-install && \ + CCACHE_URL='https://github.com/ccache/ccache/releases/download/v4.8.2/ccache-4.8.2-linux-x86_64.tar.xz' && \ + wget -qO - $CCACHE_URL | tar --strip-components 1 -xJf - && \ + make install && \ + popd && \ + rm -rf /tmp/ccache-install && \ + ccache -M "${BUILD_CCACHE_SIZE}" && \ + ccache -F 0 + # Build-time environment variables ENV CCACHE_DIR=/ccache \ CMAKE_C_COMPILER_LAUNCHER=ccache \ @@ -98,6 +106,7 @@ RUN apt-get -qq update && apt-get -qq install -y \ RUN mkdir /build /build/dist WORKDIR /build +COPY --chmod=755 effective_cpu_count.sh . 
## Build torch RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch/ \ @@ -114,6 +123,7 @@ ARG BUILD_TRITON_VERSION RUN --mount=type=bind,from=triton-downloader,source=/git/triton,target=triton/,rw \ --mount=type=cache,target=/ccache \ if [ -n "$BUILD_TRITON_VERSION" ]; then \ + export MAX_JOBS="$(./effective_cpu_count.sh)" && \ cd triton/python && \ python -m pip wheel -w wheels/ --no-build-isolation --no-deps -vv . && \ pip install wheels/*.whl; \ @@ -139,6 +149,7 @@ ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST # remain the same. RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch/,rw \ --mount=type=cache,target=/ccache \ + export MAX_JOBS="$(./effective_cpu_count.sh)" && \ cd pytorch && \ mkdir build && \ ln -s /usr/bin/cc build/cc && \ @@ -174,6 +185,7 @@ RUN pip3 install --no-cache-dir --upgrade \ matplotlib numpy typing_extensions requests pillow RUN --mount=type=bind,from=torchvision-downloader,source=/git/vision,target=vision/,rw \ + export MAX_JOBS="$(./effective_cpu_count.sh)" && \ cd vision && \ mkdir build && \ ln -s /usr/bin/cc build/cc && \ @@ -208,6 +220,7 @@ RUN pip3 install --no-cache-dir --upgrade \ matplotlib numpy typing_extensions requests pillow RUN --mount=type=bind,from=torchaudio-downloader,source=/git/audio,target=audio/,rw \ + export MAX_JOBS="$(./effective_cpu_count.sh)" && \ cd audio && \ mkdir build && \ ln -s /usr/bin/cc build/cc && \ diff --git a/torch/effective_cpu_count.sh b/torch/effective_cpu_count.sh new file mode 100755 index 0000000..029ecbc --- /dev/null +++ b/torch/effective_cpu_count.sh @@ -0,0 +1,32 @@ +#!/bin/sh + +CPU_QUOTA() ( + CGROUP='/sys/fs/cgroup'; + CGROUP_V1="$CGROUP/cpu,cpuacct"; + CGROUP_V1_QUOTA="$CGROUP_V1/cpu.cfs_quota_us"; + CGROUP_V1_PERIOD="$CGROUP_V1/cpu.cfs_period_us"; + CGROUP_V2="$CGROUP/user.slice/cpu.max"; + if [ ! 
-d "$CGROUP" ]; then + return 1; + elif [ -f "$CGROUP_V1_QUOTA" ] && [ -f "$CGROUP_V1_PERIOD" ]; then + IFS='' read -r QUOTA 2> /dev/null < "$CGROUP_V1_QUOTA" || return 1; + IFS='' read -r PERIOD 2> /dev/null < "$CGROUP_V1_PERIOD" || return 1; + elif [ -f "$CGROUP_V2" ]; then + IFS=' ' read -r QUOTA PERIOD 2> /dev/null < "$CGROUP_V2" || return 1; + else + return 1; + fi; + + if [ "$QUOTA" -gt 0 ] 2> /dev/null && [ "$PERIOD" -gt 0 ] 2> /dev/null; then + echo $((QUOTA / PERIOD)); + return 0; + else + return 1; + fi; +) + +EFFECTIVE_CPU_COUNT() { + CPU_QUOTA || getconf _NPROCESSORS_ONLN; +} + +EFFECTIVE_CPU_COUNT; From 0e0456e8cd4a9825bb6069ef93650702b5bb6f68 Mon Sep 17 00:00:00 2001 From: Eta Date: Wed, 23 Aug 2023 14:31:55 -0500 Subject: [PATCH 27/63] ci(torch-nightly): Change name from `nightly/torch` to `nightly-torch` --- .github/workflows/torch-nightly.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml index d0dc47d..40a7991 100644 --- a/.github/workflows/torch-nightly.yml +++ b/.github/workflows/torch-nightly.yml @@ -77,7 +77,7 @@ jobs: needs: get-nightly-commit-hash uses: ./.github/workflows/torch-base.yml with: - image-name: nightly/torch + image-name: nightly-torch image-tag-suffix: ${{ needs.get-nightly-commit-hash.outputs.version-string }} torch-version: ${{ needs.get-nightly-commit-hash.outputs.pytorch-commit }} torchvision-version: ${{ needs.get-nightly-commit-hash.outputs.torchvision-commit }} @@ -87,7 +87,7 @@ jobs: needs: get-nightly-commit-hash uses: ./.github/workflows/torch-nccl.yml with: - image-name: nightly/torch + image-name: nightly-torch image-tag-suffix: ${{ needs.get-nightly-commit-hash.outputs.version-string }} torch-version: ${{ needs.get-nightly-commit-hash.outputs.pytorch-commit }} torchvision-version: ${{ needs.get-nightly-commit-hash.outputs.torchvision-commit }} From 563ced8a7d28ce584a81ddaae1baaebf3199f01f Mon Sep 17 00:00:00 2001 From: 
Eta Date: Thu, 24 Aug 2023 14:46:42 -0500 Subject: [PATCH 28/63] docs: Explain `torch` and `torch-extras` containers in the README --- README.md | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 8780cfb..10678b9 100644 --- a/README.md +++ b/README.md @@ -2,21 +2,73 @@ Repository for building ML images at CoreWeave + +## Index + +See the [list of all published images](https://github.com/orgs/coreweave/packages?repo_name=ml-containers). + +### PyTorch Base Images + +- [`ghcr.io/coreweave/ml-containers/torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch) + +CoreWeave provides custom builds of +[PyTorch](https://github.com/pytorch/pytorch), +[`torchvision`](https://github.com/pytorch/vision) +and [`torchaudio`](https://github.com/pytorch/audio) +tuned for our platform in a single container image, [`ml-containers/torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch). + +Versions compiled against CUDA 11.8.0, 12.0.1, and 12.1.1 are available in this repository, with two variants: + +1. `base`: Tagged as `ml-containers/torch:a1b2c3d-base-...`. + 1. Built from [`nvidia/cuda:...-base-ubuntu20.04`](https://hub.docker.com/r/nvidia/cuda/tags?name=base-ubuntu20.04) as a base. + 2. Only includes essentials (CUDA, `torch`, `torchvision`, `torchaudio`), + so it has a small image size, making it fast to launch. +2. `nccl`: Tagged as `ml-containers/torch:a1b2c3d-nccl-...`. + 1. Built from [`ghcr.io/coreweave/nccl-tests`](https://github.com/coreweave/nccl-tests/pkgs/container/nccl-tests) as a base. + 2. Ultimately inherits from [`nvidia/cuda:...-cudnn8-devel-ubuntu20.04`](https://hub.docker.com/r/nvidia/cuda/tags?name=cudnn8-devel-ubuntu20.04). + 3. Larger, but includes development libraries and build tools such as `nvcc` necessary for compiling other PyTorch extensions. + 4. 
These PyTorch builds are built on component libraries optimized for the CoreWeave cloud—see + [`coreweave/nccl-tests`](https://github.com/coreweave/nccl-tests/blob/master/README.md). + +### PyTorch Extras + +- [`ghcr.io/coreweave/ml-containers/torch-extras`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch-extras) + +[`ml-containers/torch-extras`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch-extras) +extends the [`ml-containers/torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch) +images with a set of common PyTorch extensions: + +1. [DeepSpeed](https://github.com/microsoft/DeepSpeed) +2. [FlashAttention](https://github.com/Dao-AILab/flash-attention) +3. [NVIDIA Apex](https://github.com/NVIDIA/apex) + +Each one is compiled specially against the custom PyTorch builds in [`ml-containers/torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch). + +Both `base` and `nccl` editions are available for +[`ml-containers/torch-extras`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch-extras) +matching those for +[`ml-containers/torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch). +The `base` edition retains a small size, as a multi-stage build is used to avoid including +CUDA development libraries in it, despite those libraries being required to build +the extensions themselves. + + ## Organization This repository contains multiple container image Dockerfiles, each is expected to be within its own folder along with any other needed files for the build. + ## CI Builds (Actions) -The current CI builds are setup to run when changes to files in the respective -folders are detected so that only the changed container images are built. The -actions are setup with an action per image utilizing a reusable base action -[build.yml](.github/workflows/build.yml). 
The reusable action accepts several inputs: +The current CI builds are set up to run when changes to files in the respective +folders are detected so that only the changed container images are built. The +actions are set up with an action per image utilizing a reusable base action +[build.yml](.github/workflows/build.yml). The reusable action accepts several inputs: - `folder` - the folder containing the dockerfile for the image - `image-name` - the name to use for the image - `build-args` - arguments to pass to the docker build Images built using the same source can utilize one action as the main reason for -the multiple actions is to handle only building the changed images. A build +the multiple actions is to handle only building the changed images. A build matrix can be helpful for these cases https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs. From 05cb7b9336d397498450cf8bb93fb2ab20ec92b7 Mon Sep 17 00:00:00 2001 From: Eta Date: Thu, 24 Aug 2023 17:43:53 -0500 Subject: [PATCH 29/63] fix(torch): Use local version identifiers for builds from commits --- torch/Dockerfile | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index 8a0951d..45baf14 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -33,7 +33,7 @@ COPY <<-"EOT" /git/clone.sh }; EOT -RUN chmod +x /git/clone.sh +RUN chmod 755 /git/clone.sh FROM downloader-base as pytorch-downloader @@ -108,6 +108,25 @@ RUN mkdir /build /build/dist WORKDIR /build COPY --chmod=755 effective_cpu_count.sh . 
+COPY <<-"EOT" /build/version-string.sh + #!/bin/sh + set -x; + VERSION="$1"; + + IS_HASH() { + echo "$1" | grep -qxiEe '[0-9a-f]{40}'; + }; + + if IS_HASH "$VERSION"; then + REAL_VERSION="$(cat ./version.txt)"; + SHORT_HASH="$(echo "$VERSION" | cut -c1-7)"; + echo "$REAL_VERSION+$SHORT_HASH"; + else + echo "$VERSION"; + fi; +EOT +RUN chmod 755 /build/version-string.sh + ## Build torch RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch/ \ cd pytorch && pip3 install --no-cache-dir -r requirements.txt @@ -172,7 +191,7 @@ RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch CXX=c++ \ USE_EIGEN_FOR_BLAS=ON \ USE_MKL=OFF \ - PYTORCH_BUILD_VERSION="${TORCH_VERSION}" \ + PYTORCH_BUILD_VERSION="$(../version-string.sh "$TORCH_VERSION")" \ PYTORCH_BUILD_NUMBER=0 \ TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ python3 setup.py bdist_wheel --dist-dir ../dist @@ -209,7 +228,7 @@ RUN --mount=type=bind,from=torchvision-downloader,source=/git/vision,target=visi CXX=c++ \ USE_EIGEN_FOR_BLAS=ON \ USE_MKL=OFF \ - BUILD_VERSION="${TORCH_VISION_VERSION}" \ + BUILD_VERSION="$(../version-string.sh "$TORCH_VISION_VERSION")" \ TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ python3 setup.py bdist_wheel --dist-dir ../dist @@ -244,7 +263,7 @@ RUN --mount=type=bind,from=torchaudio-downloader,source=/git/audio,target=audio/ CXX=c++ \ USE_EIGEN_FOR_BLAS=ON \ USE_MKL=OFF \ - BUILD_VERSION="${TORCH_AUDIO_VERSION}" \ + BUILD_VERSION="$(../version-string.sh "$TORCH_AUDIO_VERSION")" \ TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ python3 setup.py bdist_wheel --dist-dir ../dist From 3174ba157412a3ddc23f6def4c5f1291cf01bd57 Mon Sep 17 00:00:00 2001 From: Eta Date: Thu, 24 Aug 2023 17:49:55 -0500 Subject: [PATCH 30/63] ci(torch): Separate configuration settings from workflow definitions --- .github/configurations/torch-base.yml | 5 +++ .github/configurations/torch-nccl.yml | 14 +++++++ .github/workflows/read-configuration.yml | 33 +++++++++++++++ 
.github/workflows/torch-base.yml | 52 ++++------------------- .github/workflows/torch-nccl.yml | 53 ++++-------------------- 5 files changed, 66 insertions(+), 91 deletions(-) create mode 100644 .github/configurations/torch-base.yml create mode 100644 .github/configurations/torch-nccl.yml create mode 100644 .github/workflows/read-configuration.yml diff --git a/.github/configurations/torch-base.yml b/.github/configurations/torch-base.yml new file mode 100644 index 0000000..328a01b --- /dev/null +++ b/.github/configurations/torch-base.yml @@ -0,0 +1,5 @@ +cuda: [ 12.1.1, 12.0.1, 11.8.0 ] +include: + - torch: 2.0.1 + vision: 0.15.2 + audio: 2.0.2 diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml new file mode 100644 index 0000000..9ae1ed0 --- /dev/null +++ b/.github/configurations/torch-nccl.yml @@ -0,0 +1,14 @@ +image: + - cuda: 12.1.1 + nccl: 2.18.3-1 + nccl-tests-hash: 471f0db + - cuda: 12.0.1 + nccl: 2.18.3-1 + nccl-tests-hash: 471f0db + - cuda: 11.8.0 + nccl: 2.16.2-1 + nccl-tests-hash: 471f0db +include: + - torch: 2.0.1 + vision: 0.15.2 + audio: 2.0.2 diff --git a/.github/workflows/read-configuration.yml b/.github/workflows/read-configuration.yml new file mode 100644 index 0000000..3104715 --- /dev/null +++ b/.github/workflows/read-configuration.yml @@ -0,0 +1,33 @@ +name: read-configuration + +on: + workflow_call: + inputs: + path: + required: true + type: string + filter: + required: false + type: string + +jobs: + get-required-bases: + runs-on: ["self-hosted", "Linux"] + permissions: {} + outputs: + config: ${{ steps.read.outputs.contents }} + steps: + - uses: actions/checkout@v3 + - name: Read configuration + id: read + env: + FILE_PATH: ${{ inputs.path }} + FILTER: ${{ inputs.filter }} + run: | + set -x; + if [ -n "$FILTER" ]; then + CONTENTS="$(yq e "$FILE_PATH" --expression "$FILTER" -oj -I0)"; + else + CONTENTS="$(yq e "$FILE_PATH" -oj -I0)"; + fi; + echo "contents=$CONTENTS" >> "$GITHUB_OUTPUT"; diff --git 
a/.github/workflows/torch-base.yml b/.github/workflows/torch-base.yml index 39bde3f..085213a 100644 --- a/.github/workflows/torch-base.yml +++ b/.github/workflows/torch-base.yml @@ -1,44 +1,8 @@ name: torch-base on: - workflow_call: - inputs: - torch-version: - required: false - type: string - torchvision-version: - required: false - type: string - torchaudio-version: - required: false - type: string - triton-version: - required: false - type: string - image-name: - required: false - type: string - image-tag-suffix: - required: false - type: string workflow_dispatch: inputs: - torch-version: - required: false - description: "Tagged version number from pytorch/pytorch to build" - type: string - torchvision-version: - required: false - description: "Tagged version number from pytorch/vision to build" - type: string - torchaudio-version: - required: false - description: "Tagged version number from pytorch/audio to build" - type: string - triton-version: - required: false - description: "Tagged version number from openai/triton to build" - type: string image-name: required: false description: "Custom name under which to publish the resulting container" @@ -50,22 +14,21 @@ on: push: paths: - "torch/**" + - ".github/configurations/torch-base.yml" - ".github/workflows/torch-base.yml" - ".github/workflows/torch.yml" - ".github/workflows/build.yml" jobs: + get-config: + uses: ./.github/workflows/read-configuration.yml + with: + path: ./.github/configurations/torch-base.yml build: + needs: get-config strategy: - matrix: - cuda: [12.1.1, 12.0.1, 11.8.0] - include: - - torch: ${{ inputs.torch-version || '2.0.1' }} - vision: ${{ inputs.torchvision-version || '0.15.2' }} - audio: ${{ inputs.torchaudio-version || '2.0.2' }} - triton: ${{ inputs.triton-version }} - + matrix: ${{ fromJSON(needs.get-config.outputs.config) }} uses: ./.github/workflows/torch.yml with: image-name: ${{ inputs.image-name }} @@ -75,5 +38,4 @@ jobs: torch-version: ${{ matrix.torch }} torchvision-version: 
${{ matrix.vision }} torchaudio-version: ${{ matrix.audio }} - triton-version: ${{ matrix.triton }} build-extras: true diff --git a/.github/workflows/torch-nccl.yml b/.github/workflows/torch-nccl.yml index 76d1387..cce1277 100644 --- a/.github/workflows/torch-nccl.yml +++ b/.github/workflows/torch-nccl.yml @@ -3,18 +3,6 @@ name: torch-nccl on: workflow_call: inputs: - torch-version: - required: false - type: string - torchvision-version: - required: false - type: string - torchaudio-version: - required: false - type: string - triton-version: - required: false - type: string image-name: required: false type: string @@ -23,22 +11,6 @@ on: type: string workflow_dispatch: inputs: - torch-version: - required: false - description: "Tagged version number from pytorch/pytorch to build" - type: string - torchvision-version: - required: false - description: "Tagged version number from pytorch/vision to build" - type: string - torchaudio-version: - required: false - description: "Tagged version number from pytorch/audio to build" - type: string - triton-version: - required: false - description: "Tagged version number from openai/triton to build" - type: string image-name: required: false description: "Custom name under which to publish the resulting container" @@ -50,31 +22,21 @@ on: push: paths: - "torch/**" + - ".github/configurations/torch-nccl.yml" - ".github/workflows/torch-nccl.yml" - ".github/workflows/torch.yml" - ".github/workflows/build.yml" jobs: + get-config: + uses: ./.github/workflows/read-configuration.yml + with: + path: ./.github/configurations/torch-nccl.yml build: + needs: get-config strategy: - matrix: - image: - - cuda: 12.1.1 - nccl: 2.18.3-1 - nccl-tests-hash: 471f0db - - cuda: 12.0.1 - nccl: 2.18.3-1 - nccl-tests-hash: 471f0db - - cuda: 11.8.0 - nccl: 2.16.2-1 - nccl-tests-hash: 471f0db - include: - - torch: ${{ inputs.torch-version || '2.0.1' }} - vision: ${{ inputs.torchvision-version || '0.15.2' }} - audio: ${{ inputs.torchaudio-version || '2.0.2' 
}} - triton: ${{ inputs.triton-version }} - + matrix: ${{ fromJSON(needs.get-config.outputs.config) }} uses: ./.github/workflows/torch.yml with: image-name: ${{ inputs.image-name }} @@ -84,5 +46,4 @@ jobs: torch-version: ${{ matrix.torch }} torchvision-version: ${{ matrix.vision }} torchaudio-version: ${{ matrix.audio }} - triton-version: ${{ matrix.triton }} build-extras: true From e1dd86915648cd0cdcb4b6ca84623fc7bf517e1a Mon Sep 17 00:00:00 2001 From: Eta Date: Thu, 24 Aug 2023 17:50:45 -0500 Subject: [PATCH 31/63] ci(torch-nightly): Flatten workflow to avoid hitting call depth limit --- .github/workflows/torch-nightly.yml | 60 +++++++++++++++++++++-------- 1 file changed, 45 insertions(+), 15 deletions(-) diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml index 40a7991..bc6eec9 100644 --- a/.github/workflows/torch-nightly.yml +++ b/.github/workflows/torch-nightly.yml @@ -14,7 +14,7 @@ on: jobs: - get-nightly-commit-hash: + get-nightly-info: runs-on: [ self-hosted, Linux ] outputs: pytorch-commit: ${{ steps.get-hash.outputs.pytorch-commit }} @@ -22,6 +22,7 @@ jobs: torchvision-commit: ${{ steps.get-hash.outputs.torchvision-commit }} torchaudio-commit: ${{ steps.get-hash.outputs.torchaudio-commit }} version-string: ${{ steps.get-hash.outputs.version-string }} + date: ${{ steps.get-date.outputs.date }} steps: - name: Get latest commit hashes id: get-hash @@ -73,23 +74,52 @@ jobs: printf -- 'version-string=torch%s-vision%s-audio%s\n' \ "$PYTORCH_VERSION" "$TORCHVISION_VERSION" "$TORCHAUDIO_VERSION" \ >> "$GITHUB_OUTPUT"; + - name: Get date + id: get-date + run: echo "date=$(date '+%Y.%m.%d')" >> "$GITHUB_OUTPUT"; + + get-base-config: + uses: ./.github/workflows/read-configuration.yml + with: + path: ./.github/configurations/torch-base.yml + filter: del(.include) + get-nccl-config: + uses: ./.github/workflows/read-configuration.yml + with: + path: ./.github/configurations/torch-nccl.yml + filter: del(.include) + build-base: - 
needs: get-nightly-commit-hash - uses: ./.github/workflows/torch-base.yml + needs: + - get-nightly-info + - get-base-config + strategy: + matrix: ${{ fromJSON(needs.get-base-config.outputs.config) }} + uses: ./.github/workflows/torch.yml with: image-name: nightly-torch - image-tag-suffix: ${{ needs.get-nightly-commit-hash.outputs.version-string }} - torch-version: ${{ needs.get-nightly-commit-hash.outputs.pytorch-commit }} - torchvision-version: ${{ needs.get-nightly-commit-hash.outputs.torchvision-commit }} - torchaudio-version: ${{ needs.get-nightly-commit-hash.outputs.torchaudio-commit }} - triton-version: ${{ needs.get-nightly-commit-hash.outputs.triton-commit }} + tag: ${{ format('base-{0}-cuda{1}-{2}', needs.get-nightly-info.outputs.date, matrix.cuda, needs.get-nightly-info.outputs.version-string ) }} + builder-base-image: nvidia/cuda:${{ matrix.cuda }}-devel-ubuntu20.04 + base-image: nvidia/cuda:${{ matrix.cuda }}-base-ubuntu20.04 + torch-version: ${{ needs.get-nightly-info.outputs.pytorch-commit }} + torchvision-version: ${{ needs.get-nightly-info.outputs.torchvision-commit }} + torchaudio-version: ${{ needs.get-nightly-info.outputs.torchaudio-commit }} + triton-version: ${{ needs.get-nightly-info.outputs.triton-commit }} + build-extras: true build-nccl: - needs: get-nightly-commit-hash - uses: ./.github/workflows/torch-nccl.yml + needs: + - get-nightly-info + - get-nccl-config + strategy: + matrix: ${{ fromJSON(needs.get-nccl-config.outputs.config) }} + uses: ./.github/workflows/torch.yml with: image-name: nightly-torch - image-tag-suffix: ${{ needs.get-nightly-commit-hash.outputs.version-string }} - torch-version: ${{ needs.get-nightly-commit-hash.outputs.pytorch-commit }} - torchvision-version: ${{ needs.get-nightly-commit-hash.outputs.torchvision-commit }} - torchaudio-version: ${{ needs.get-nightly-commit-hash.outputs.torchaudio-commit }} - triton-version: ${{ needs.get-nightly-commit-hash.outputs.triton-commit }} + tag: ${{ 
format('nccl-{0}-cuda{1}-nccl{2}-{3}', needs.get-nightly-info.outputs.date, matrix.image.cuda, matrix.image.nccl, needs.get-nightly-info.outputs.version-string ) }} + builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-cudnn8-devel-ubuntu20.04-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }} + base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-cudnn8-devel-ubuntu20.04-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }} + torch-version: ${{ needs.get-nightly-info.outputs.pytorch-commit }} + torchvision-version: ${{ needs.get-nightly-info.outputs.torchvision-commit }} + torchaudio-version: ${{ needs.get-nightly-info.outputs.torchaudio-commit }} + triton-version: ${{ needs.get-nightly-info.outputs.triton-commit }} + build-extras: true \ No newline at end of file From ce02091630881052122cee98ca6afc06fd70fb54 Mon Sep 17 00:00:00 2001 From: Eta Date: Thu, 24 Aug 2023 18:01:10 -0500 Subject: [PATCH 32/63] ci(torch-nightly): Inherit secrets in calls from `torch-nightly.yml` [skip ci] --- .github/workflows/torch-nightly.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml index bc6eec9..b2affc0 100644 --- a/.github/workflows/torch-nightly.yml +++ b/.github/workflows/torch-nightly.yml @@ -96,6 +96,7 @@ jobs: strategy: matrix: ${{ fromJSON(needs.get-base-config.outputs.config) }} uses: ./.github/workflows/torch.yml + secrets: inherit with: image-name: nightly-torch tag: ${{ format('base-{0}-cuda{1}-{2}', needs.get-nightly-info.outputs.date, matrix.cuda, needs.get-nightly-info.outputs.version-string ) }} @@ -113,6 +114,7 @@ jobs: strategy: matrix: ${{ fromJSON(needs.get-nccl-config.outputs.config) }} uses: ./.github/workflows/torch.yml + secrets: inherit with: image-name: nightly-torch tag: ${{ format('nccl-{0}-cuda{1}-nccl{2}-{3}', needs.get-nightly-info.outputs.date, matrix.image.cuda, matrix.image.nccl, 
needs.get-nightly-info.outputs.version-string ) }} From bf288b2b2eb0c9f2ca2fccecd5dcc38a203d3d59 Mon Sep 17 00:00:00 2001 From: Eta Date: Thu, 24 Aug 2023 18:30:58 -0500 Subject: [PATCH 33/63] ci(torch-nightly): Trigger on changes to `base`/`nccl` configurations --- .github/workflows/torch-nightly.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml index b2affc0..28be521 100644 --- a/.github/workflows/torch-nightly.yml +++ b/.github/workflows/torch-nightly.yml @@ -8,6 +8,8 @@ on: push: paths: # - "torch/**" + - ".github/configurations/torch-base.yml" + - ".github/configurations/torch-nccl.yml" - ".github/workflows/torch-nightly.yml" # - ".github/workflows/torch.yml" # - ".github/workflows/build.yml" From 3bdcc001cb77c932338c5093479f8bc2977c568d Mon Sep 17 00:00:00 2001 From: Eta Date: Fri, 25 Aug 2023 15:06:48 -0500 Subject: [PATCH 34/63] build(torch): Build `nccl` images from bases with HPC-X v2.16 --- .github/workflows/torch-nccl.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/torch-nccl.yml b/.github/workflows/torch-nccl.yml index 4f9738c..324185b 100644 --- a/.github/workflows/torch-nccl.yml +++ b/.github/workflows/torch-nccl.yml @@ -15,13 +15,13 @@ jobs: image: - cuda: 12.1.1 nccl: 2.18.3-1 - nccl-tests-hash: 471f0db + nccl-tests-hash: 253a5b1 - cuda: 12.0.1 nccl: 2.18.3-1 - nccl-tests-hash: 471f0db + nccl-tests-hash: 253a5b1 - cuda: 11.8.0 nccl: 2.16.2-1 - nccl-tests-hash: 471f0db + nccl-tests-hash: 253a5b1 include: - torch: 2.0.1 vision: 0.15.2 From 1b0dbcc08278b47117382aa00c5197f2b38e47f5 Mon Sep 17 00:00:00 2001 From: Eta Date: Fri, 25 Aug 2023 16:17:00 -0500 Subject: [PATCH 35/63] ci: Register a workflow output for `read-configuration` --- .github/workflows/read-configuration.yml | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/.github/workflows/read-configuration.yml 
b/.github/workflows/read-configuration.yml index 3104715..3301364 100644 --- a/.github/workflows/read-configuration.yml +++ b/.github/workflows/read-configuration.yml @@ -9,9 +9,13 @@ on: filter: required: false type: string + outputs: + config: + description: "The retrieved configuration, as JSON" + value: ${{ jobs.read-file.outputs.config }} jobs: - get-required-bases: + read-file: runs-on: ["self-hosted", "Linux"] permissions: {} outputs: @@ -31,3 +35,10 @@ jobs: CONTENTS="$(yq e "$FILE_PATH" -oj -I0)"; fi; echo "contents=$CONTENTS" >> "$GITHUB_OUTPUT"; + + { + echo '## Configuration'; + echo '```json'; + echo "$CONTENTS" | jq .; + echo '```'; + } >> "$GITHUB_STEP_SUMMARY"; From a7259cc2e7bcd8f93c5047731b52c1260e5fe8f6 Mon Sep 17 00:00:00 2001 From: Eta Date: Fri, 25 Aug 2023 16:22:22 -0500 Subject: [PATCH 36/63] ci(torch-nightly): Use UTC for date tagging, increase precision to hours Increased precision for date tags reduces the likelihood of a tag conflict. --- .github/workflows/torch-nightly.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml index 28be521..e6e66b9 100644 --- a/.github/workflows/torch-nightly.yml +++ b/.github/workflows/torch-nightly.yml @@ -78,7 +78,7 @@ jobs: >> "$GITHUB_OUTPUT"; - name: Get date id: get-date - run: echo "date=$(date '+%Y.%m.%d')" >> "$GITHUB_OUTPUT"; + run: echo "date=$(date -u '+%Y.%m.%d.%H')" >> "$GITHUB_OUTPUT"; get-base-config: uses: ./.github/workflows/read-configuration.yml From 2f4708dc5cf2ef1c0ead5419910377735e7b4c70 Mon Sep 17 00:00:00 2001 From: Eta Date: Fri, 25 Aug 2023 18:00:47 -0500 Subject: [PATCH 37/63] fix(torch): Update `pip` correctly Previously, updating pip failed without an error because of the way the command was invoked, leaving it unchanged. 
--- torch/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index 45baf14..0aaeef3 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -72,12 +72,12 @@ RUN apt-get -qq update && apt-get -qq install -y \ libncurses5 python3 python3-pip git apt-utils ssh ca-certificates \ libpng-dev libjpeg-dev pkg-config python3-distutils python3-numpy \ build-essential ninja-build gcc-10 g++-10 lld && \ + /usr/bin/python3 -m pip install --no-cache-dir --upgrade pip && \ update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \ update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \ update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 10 && \ update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 10 && \ update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld 1 && \ - pip3 install --no-cache-dir --upgrade pip && \ apt-get clean RUN mkdir /tmp/ccache-install && \ From 29d6561c99cc3924b61a46438e9b9ab5adb2be8f Mon Sep 17 00:00:00 2001 From: Eta Date: Fri, 25 Aug 2023 18:02:46 -0500 Subject: [PATCH 38/63] build(torch): Build `nccl` images from bases with HPC-X v2.16 Port of commit 3bdcc001cb77c932338c5093479f8bc2977c568d. 
[skip ci] --- .github/configurations/torch-nccl.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml index 9ae1ed0..0ac1caf 100644 --- a/.github/configurations/torch-nccl.yml +++ b/.github/configurations/torch-nccl.yml @@ -1,13 +1,13 @@ image: - cuda: 12.1.1 nccl: 2.18.3-1 - nccl-tests-hash: 471f0db + nccl-tests-hash: 253a5b1 - cuda: 12.0.1 nccl: 2.18.3-1 - nccl-tests-hash: 471f0db + nccl-tests-hash: 253a5b1 - cuda: 11.8.0 nccl: 2.16.2-1 - nccl-tests-hash: 471f0db + nccl-tests-hash: 253a5b1 include: - torch: 2.0.1 vision: 0.15.2 From c020ab83404bd8a414a30c69f7f1cabe74211352 Mon Sep 17 00:00:00 2001 From: Eta Date: Fri, 25 Aug 2023 19:14:34 -0500 Subject: [PATCH 39/63] ci(torch-nightly): Format step summaries better --- .github/workflows/torch-nightly.yml | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml index e6e66b9..b4c14e0 100644 --- a/.github/workflows/torch-nightly.yml +++ b/.github/workflows/torch-nightly.yml @@ -32,17 +32,20 @@ jobs: set -e; FORMAT_COMMIT_LINK() { - echo "[$(echo "$2" | cut -c1-7)](https://github.com/$1/tree/$2)"; + printf '[`%.7s`](https://github.com/%s/tree/%s)\n' "$2" "$1" "$2"; }; + LOG() { + printf -- "$@" >> "$GITHUB_STEP_SUMMARY"; + } + CLONE() { git clone --filter=blob:none --no-checkout --depth=1 \ "https://github.com/$1" \ "$2" > /dev/null 2> /dev/null && \ - local COMMIT=$(git -C "$2" rev-parse HEAD) && \ - echo "Latest $1 commit: $( - FORMAT_COMMIT_LINK $1 $COMMIT - )" >> "$GITHUB_STEP_SUMMARY" && \ + local COMMIT="$(git -C "$2" rev-parse HEAD)" && \ + LOG 'Latest `%s` commit: %s\n' \ + "$1" "$(FORMAT_COMMIT_LINK "$1" "$COMMIT")" && \ echo $COMMIT; }; @@ -56,9 +59,8 @@ jobs: TRITON_COMMIT="$(git -C pytorch-git show "HEAD:$TRITON_COMMIT_FILE" 2> /dev/null)"; rm -rf pytorch-git; - echo "Corresponding Triton commit: $( - 
FORMAT_COMMIT_LINK openai/triton $TRITON_COMMIT - )" >> "$GITHUB_STEP_SUMMARY"; + LOG 'Corresponding `openai/triton` commit: %s\n' \ + "$(FORMAT_COMMIT_LINK openai/triton "$TRITON_COMMIT")"; TORCHVISION_COMMIT="$(CLONE pytorch/vision torchvision-git)"; TORCHVISION_VERSION="$(GET_VERSION torchvision-git)"; From 27d63d0e208a0aa2f12028895bbfa19201f0e8c0 Mon Sep 17 00:00:00 2001 From: Eta Date: Fri, 25 Aug 2023 19:23:09 -0500 Subject: [PATCH 40/63] fix(torch): Don't use `pushd`/`popd` They are not available within /bin/sh. [skip ci] --- torch/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index 0aaeef3..05901ec 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -81,11 +81,11 @@ RUN apt-get -qq update && apt-get -qq install -y \ apt-get clean RUN mkdir /tmp/ccache-install && \ - pushd /tmp/ccache-install && \ + cd /tmp/ccache-install && \ CCACHE_URL='https://github.com/ccache/ccache/releases/download/v4.8.2/ccache-4.8.2-linux-x86_64.tar.xz' && \ wget -qO - $CCACHE_URL | tar --strip-components 1 -xJf - && \ make install && \ - popd && \ + cd .. 
&& \ rm -rf /tmp/ccache-install && \ ccache -M "${BUILD_CCACHE_SIZE}" && \ ccache -F 0 From a82b75448ca87be628e15c3cb9fe5d26ed1c389e Mon Sep 17 00:00:00 2001 From: Eta Date: Fri, 25 Aug 2023 19:24:25 -0500 Subject: [PATCH 41/63] ci(torch-nightly): Reactivate all `push` triggers --- .github/workflows/torch-nightly.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml index b4c14e0..96328e2 100644 --- a/.github/workflows/torch-nightly.yml +++ b/.github/workflows/torch-nightly.yml @@ -7,12 +7,12 @@ on: - cron: "0 5 * * *" push: paths: -# - "torch/**" + - "torch/**" - ".github/configurations/torch-base.yml" - ".github/configurations/torch-nccl.yml" - ".github/workflows/torch-nightly.yml" -# - ".github/workflows/torch.yml" -# - ".github/workflows/build.yml" + - ".github/workflows/torch.yml" + - ".github/workflows/build.yml" jobs: From cc05e87aa6cc66eafcc5e2e803572af5467f4fe8 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 28 Aug 2023 13:17:27 -0500 Subject: [PATCH 42/63] build(torch): Build with `gcc-13`, `g++-13`, and `lld-17` [skip ci] --- torch/Dockerfile | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index 05901ec..72c1383 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -67,18 +67,15 @@ ENV DEBIAN_FRONTEND=noninteractive ARG BUILD_CCACHE_SIZE="1Gi" -# ninja-build, ccache, gcc-10, g++-10, and lld are optional but improve the build +# ninja-build, ccache, and lld are optional but improve the build RUN apt-get -qq update && apt-get -qq install -y \ libncurses5 python3 python3-pip git apt-utils ssh ca-certificates \ libpng-dev libjpeg-dev pkg-config python3-distutils python3-numpy \ - build-essential ninja-build gcc-10 g++-10 lld && \ + build-essential ninja-build && \ + apt-get clean && \ /usr/bin/python3 -m pip install --no-cache-dir --upgrade pip && \ update-alternatives --install 
/usr/bin/python python /usr/bin/python3 1 && \ - update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \ - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 10 && \ - update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 10 && \ - update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld 1 && \ - apt-get clean + update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 RUN mkdir /tmp/ccache-install && \ cd /tmp/ccache-install && \ @@ -99,11 +96,23 @@ ENV CCACHE_DIR=/ccache \ # Add Kitware's apt repository to get a newer version of CMake RUN apt-get -qq update && apt-get -qq install -y \ software-properties-common lsb-release && \ - { wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null \ + { wget -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc \ | gpg --dearmor -o /etc/apt/trusted.gpg.d/kitware.gpg; } && \ apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \ apt-get -qq update && apt-get -qq install -y cmake && apt-get clean +# Update compiler (GCC) and linker (LLD) versions +RUN CODENAME="$(lsb_release -cs)" && \ + wget -qO - 'https://apt.llvm.org/llvm-snapshot.gpg.key' > /etc/apt/trusted.gpg.d/apt.llvm.org.asc && \ + apt-add-repository "deb https://apt.llvm.org/$CODENAME/ llvm-toolchain-$CODENAME-17 main" && \ + apt-add-repository -y ppa:ubuntu-toolchain-r/test && \ + apt-get -qq update && apt-get -qq install --no-install-recommends -y \ + gcc-13 g++-13 lld-17 && \ + apt-get clean && \ + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-13 13 && \ + update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-13 13 && \ + update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld-17 1 + RUN mkdir /build /build/dist WORKDIR /build COPY --chmod=755 effective_cpu_count.sh . 
From a8ef593bbcf83994f4c556f7c1688423a06066bd Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 28 Aug 2023 13:18:03 -0500 Subject: [PATCH 43/63] style(torch-nightly): Add trailing newline in `torch-nightly.yml` --- .github/workflows/torch-nightly.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml index 96328e2..68c7d3c 100644 --- a/.github/workflows/torch-nightly.yml +++ b/.github/workflows/torch-nightly.yml @@ -128,4 +128,4 @@ jobs: torchvision-version: ${{ needs.get-nightly-info.outputs.torchvision-commit }} torchaudio-version: ${{ needs.get-nightly-info.outputs.torchaudio-commit }} triton-version: ${{ needs.get-nightly-info.outputs.triton-commit }} - build-extras: true \ No newline at end of file + build-extras: true From 05be91ca89674db78b25454e29443fd0d561b50a Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 28 Aug 2023 13:32:36 -0500 Subject: [PATCH 44/63] fix(torch): Update `pip` correctly in final image --- torch/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index 72c1383..38cbef0 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -286,9 +286,9 @@ RUN apt-get -qq update && apt-get -qq install -y \ libncurses5 python3 python3-pip python3-distutils python3-numpy \ curl git apt-utils ssh ca-certificates tmux nano vim sudo bash rsync \ htop wget unzip tini && \ + /usr/bin/python3 -m pip install --no-cache-dir --upgrade pip && \ update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \ update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \ - pip3 install --no-cache-dir --upgrade pip && \ apt-get clean ARG BUILD_TORCH_VERSION From 5847e4e8ac8af3780c862d22d310ee58a60ccfae Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 28 Aug 2023 13:34:55 -0500 Subject: [PATCH 45/63] fix(torch): Downgrade to GCC 11 --- .github/workflows/torch-nightly.yml | 2 +- torch/Dockerfile | 6 +++--- 2 
files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml index 68c7d3c..485da13 100644 --- a/.github/workflows/torch-nightly.yml +++ b/.github/workflows/torch-nightly.yml @@ -37,7 +37,7 @@ jobs: LOG() { printf -- "$@" >> "$GITHUB_STEP_SUMMARY"; - } + }; CLONE() { git clone --filter=blob:none --no-checkout --depth=1 \ diff --git a/torch/Dockerfile b/torch/Dockerfile index 38cbef0..b46da7c 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -107,10 +107,10 @@ RUN CODENAME="$(lsb_release -cs)" && \ apt-add-repository "deb https://apt.llvm.org/$CODENAME/ llvm-toolchain-$CODENAME-17 main" && \ apt-add-repository -y ppa:ubuntu-toolchain-r/test && \ apt-get -qq update && apt-get -qq install --no-install-recommends -y \ - gcc-13 g++-13 lld-17 && \ + gcc-11 g++-11 lld-17 && \ apt-get clean && \ - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-13 13 && \ - update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-13 13 && \ + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 && \ + update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11 && \ update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld-17 1 RUN mkdir /build /build/dist From 8858a1e1b28d2fbb9cb4ae90507a1f8c4b2f08cd Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 28 Aug 2023 14:06:58 -0500 Subject: [PATCH 46/63] build(torch-extras): Build with `gcc-11`, `g++-11`, and `lld-17` --- torch-extras/Dockerfile | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile index 8d39bd8..cd07e54 100644 --- a/torch-extras/Dockerfile +++ b/torch-extras/Dockerfile @@ -43,16 +43,31 @@ RUN export \ cuda-nvprof-${CUDA_PACKAGE_VERSION} \ cuda-profiler-api-${CUDA_PACKAGE_VERSION} \ libaio-dev \ - ninja-build \ - # gcc-10/g++-10/lld do not need to be installed here, but they improve the build. 
- # gfortran-10 is just for compiler_wrapper.f95. - gcc-10 g++-10 gfortran-10 lld && \ + ninja-build && \ + apt-get clean + +# Add Kitware's apt repository to get a newer version of CMake +RUN apt-get -qq update && apt-get -qq install -y \ + software-properties-common lsb-release && \ + { wget -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc \ + | gpg --dearmor -o /etc/apt/trusted.gpg.d/kitware.gpg; } && \ + apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \ + apt-get -qq update && apt-get -qq install -y cmake && apt-get clean + +# Update compiler (GCC) and linker (LLD) versions +# gfortran-11 is just for compiler_wrapper.f95 +RUN CODENAME="$(lsb_release -cs)" && \ + wget -qO - 'https://apt.llvm.org/llvm-snapshot.gpg.key' > /etc/apt/trusted.gpg.d/apt.llvm.org.asc && \ + apt-add-repository "deb https://apt.llvm.org/$CODENAME/ llvm-toolchain-$CODENAME-17 main" && \ + apt-add-repository -y ppa:ubuntu-toolchain-r/test && \ + apt-get -qq update && apt-get -qq install --no-install-recommends -y \ + gcc-11 g++-11 gfortran-11 lld-17 && \ apt-get clean && \ - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 10 && \ - update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 10 && \ + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 && \ + update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11 && \ update-alternatives --install \ - /usr/bin/gfortran gfortran /usr/bin/gfortran-10 10 && \ - update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld 1 + /usr/bin/gfortran gfortran /usr/bin/gfortran-11 11 && \ + update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld-17 1 RUN mkdir /wheels /build WORKDIR /build From eeeb8a7ba955a55af289ad631ee252aea5e6c424 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 28 Aug 2023 15:49:45 -0500 Subject: [PATCH 47/63] build(torch-extras): Update DeepSpeed to v0.10.1 --- torch-extras/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile index cd07e54..6fe3b04 100644 --- a/torch-extras/Dockerfile +++ b/torch-extras/Dockerfile @@ -1,7 +1,7 @@ # syntax=docker/dockerfile:1.2 ARG BASE_IMAGE -ARG DEEPSPEED_VERSION="0.9.4" +ARG DEEPSPEED_VERSION="0.10.1" ARG FLASH_ATTN_VERSION="2.0.2" ARG APEX_COMMIT="38a12698bc3cc95987bca270bcd6d025bb0be346" From 829a5f0271ed9cc8e72627f69086d612de96f266 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 28 Aug 2023 16:03:08 -0500 Subject: [PATCH 48/63] ci: Add names to `torch` workflow jobs --- .github/workflows/read-configuration.yml | 1 + .github/workflows/torch-base.yml | 2 ++ .github/workflows/torch-extras.yml | 3 +++ .github/workflows/torch-nccl.yml | 2 ++ .github/workflows/torch-nightly.yml | 4 ++++ .github/workflows/torch.yml | 2 ++ 6 files changed, 14 insertions(+) diff --git a/.github/workflows/read-configuration.yml b/.github/workflows/read-configuration.yml index 3301364..12a21b3 100644 --- a/.github/workflows/read-configuration.yml +++ b/.github/workflows/read-configuration.yml @@ -16,6 +16,7 @@ on: jobs: read-file: + name: Read Configuration File runs-on: ["self-hosted", "Linux"] permissions: {} outputs: diff --git a/.github/workflows/torch-base.yml b/.github/workflows/torch-base.yml index 1f9ecb1..2332664 100644 --- a/.github/workflows/torch-base.yml +++ b/.github/workflows/torch-base.yml @@ -22,10 +22,12 @@ on: jobs: get-config: + name: Get torch:base Config uses: ./.github/workflows/read-configuration.yml with: path: ./.github/configurations/torch-base.yml build: + name: Build torch:base needs: get-config strategy: matrix: ${{ fromJSON(needs.get-config.outputs.config) }} diff --git a/.github/workflows/torch-extras.yml b/.github/workflows/torch-extras.yml index 9dd70ec..4bab439 100644 --- a/.github/workflows/torch-extras.yml +++ b/.github/workflows/torch-extras.yml @@ -44,6 +44,7 @@ on: jobs: get-required-bases: + name: Get Latest Required Base Images if: inputs.skip-bases-check != true runs-on: 
["self-hosted", "Linux"] permissions: @@ -129,6 +130,7 @@ jobs: LATEST_NCCL_IMAGES: ${{ steps.get-latest.outputs.LATEST_NCCL_IMAGES }} build-call: + name: Build torch-extras via Workflow Call if: inputs.skip-bases-check strategy: matrix: @@ -144,6 +146,7 @@ jobs: FLASH_ATTN_VERSION=${{ matrix.flash-attn }} build-self: + name: Build torch-extras via Event Trigger needs: get-required-bases if: needs.get-required-bases.outputs.bases-list && needs.get-required-bases.outputs.bases-list != '[]' strategy: diff --git a/.github/workflows/torch-nccl.yml b/.github/workflows/torch-nccl.yml index ed72627..7523db3 100644 --- a/.github/workflows/torch-nccl.yml +++ b/.github/workflows/torch-nccl.yml @@ -30,10 +30,12 @@ on: jobs: get-config: + name: Get torch:nccl Config uses: ./.github/workflows/read-configuration.yml with: path: ./.github/configurations/torch-nccl.yml build: + name: Build torch:nccl needs: get-config strategy: matrix: ${{ fromJSON(needs.get-config.outputs.config) }} diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml index 485da13..9468b08 100644 --- a/.github/workflows/torch-nightly.yml +++ b/.github/workflows/torch-nightly.yml @@ -83,17 +83,20 @@ jobs: run: echo "date=$(date -u '+%Y.%m.%d.%H')" >> "$GITHUB_OUTPUT"; get-base-config: + name: Get torch:base Config uses: ./.github/workflows/read-configuration.yml with: path: ./.github/configurations/torch-base.yml filter: del(.include) get-nccl-config: + name: Get torch:nccl Config uses: ./.github/workflows/read-configuration.yml with: path: ./.github/configurations/torch-nccl.yml filter: del(.include) build-base: + name: Build Nightly torch:base needs: - get-nightly-info - get-base-config @@ -112,6 +115,7 @@ jobs: triton-version: ${{ needs.get-nightly-info.outputs.triton-commit }} build-extras: true build-nccl: + name: Build Nightly torch:nccl needs: - get-nightly-info - get-nccl-config diff --git a/.github/workflows/torch.yml b/.github/workflows/torch.yml index 3099968..c3b3f30 
100644 --- a/.github/workflows/torch.yml +++ b/.github/workflows/torch.yml @@ -81,6 +81,7 @@ on: jobs: build: + name: Build torch uses: ./.github/workflows/build.yml secrets: inherit with: @@ -97,6 +98,7 @@ jobs: ${{ inputs.cuda-arch-support && format('BUILD_TORCH_CUDA_ARCH_LIST={0}', inputs.cuda-arch-support) || '' }} ${{ inputs.triton-version && format('BUILD_TRITON_VERSION={0}', inputs.triton-version) || '' }} build-extras: + name: Build torch-extras if: inputs.build-extras needs: build uses: ./.github/workflows/torch-extras.yml From 9e7d229623218a39078b88d2b6b803ddec42ad48 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 28 Aug 2023 17:29:06 -0500 Subject: [PATCH 49/63] fix(torch-extras): Disable DeepSpeed's AIO extension for torch v2.1.x --- .github/workflows/torch-extras.yml | 2 ++ torch-extras/Dockerfile | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/.github/workflows/torch-extras.yml b/.github/workflows/torch-extras.yml index 4bab439..fe84495 100644 --- a/.github/workflows/torch-extras.yml +++ b/.github/workflows/torch-extras.yml @@ -1,3 +1,5 @@ +name: torch-extras + on: workflow_call: inputs: diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile index 6fe3b04..4b1cd8b 100644 --- a/torch-extras/Dockerfile +++ b/torch-extras/Dockerfile @@ -103,6 +103,12 @@ ARG DEEPSPEED_VERSION SHELL ["/bin/bash", "-c"] RUN python3 -m pip install -U --no-cache-dir \ setuptools wheel pip && \ + if python3 -m pip show torch | grep -F 'Version: 2.1.' > /dev/null; then \ + # DeepSpeed's AIO extension is incompatible with PyTorch 2.1.x's + # requirement for C++17 (as of DeepSpeed 0.10.1). + # See: https://github.com/microsoft/DeepSpeed/pull/3976 + export DS_BUILD_AIO='0'; \ + fi && \ { \ # DeepSpeed doesn't handle blank environment variables # in the same way as unset ones, so clear any blank ones. 
From af80f6cc9374b4e9fc4d9b588c547258fd3e6419 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 28 Aug 2023 17:29:40 -0500 Subject: [PATCH 50/63] ci(torch-nightly): Add a name for the `get-nightly-info` job --- .github/workflows/torch-nightly.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml index 9468b08..4c2539f 100644 --- a/.github/workflows/torch-nightly.yml +++ b/.github/workflows/torch-nightly.yml @@ -17,6 +17,8 @@ on: jobs: get-nightly-info: + name: + Get Nightly Info runs-on: [ self-hosted, Linux ] outputs: pytorch-commit: ${{ steps.get-hash.outputs.pytorch-commit }} From d61bbffe4e5ba788f126eb8394207b69395a7e9e Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 28 Aug 2023 23:01:00 -0500 Subject: [PATCH 51/63] ci(torch-extras): Use customized image names during workflow calls --- .github/workflows/torch-extras.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/torch-extras.yml b/.github/workflows/torch-extras.yml index fe84495..eb85364 100644 --- a/.github/workflows/torch-extras.yml +++ b/.github/workflows/torch-extras.yml @@ -140,7 +140,7 @@ jobs: uses: ./.github/workflows/build.yml secrets: inherit with: - image-name: torch-extras + image-name: ${{ inputs.image-name || 'torch-extras' }} folder: torch-extras tag-suffix: ${{ inputs.tag }}-flash_attn${{ matrix.flash-attn }} build-args: | @@ -156,6 +156,7 @@ jobs: flash-attn: [ 2.0.2, 1.0.9 ] bases: ${{ fromJSON(needs.get-required-bases.outputs.bases-list) }} uses: ./.github/workflows/build.yml + secrets: inherit with: image-name: ${{ inputs.image-name || 'torch-extras' }} folder: torch-extras From 587381f9dc79ee2ebee4484186fdf4f1bb55a0d8 Mon Sep 17 00:00:00 2001 From: Eta Date: Mon, 28 Aug 2023 23:28:27 -0500 Subject: [PATCH 52/63] docs(torch-nightly): Add `torch-nightly` containers to the README index --- README.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git 
a/README.md b/README.md index 10678b9..03244f6 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,12 @@ Repository for building ML images at CoreWeave See the [list of all published images](https://github.com/orgs/coreweave/packages?repo_name=ml-containers). +Special PyTorch Images: + +- [PyTorch Base Images](#pytorch-base-images) +- [PyTorch Extras](#pytorch-extras) +- [PyTorch Nightly](#pytorch-nightly) + ### PyTorch Base Images - [`ghcr.io/coreweave/ml-containers/torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch) @@ -52,6 +58,30 @@ The `base` edition retains a small size, as a multi-stage build is used to avoid CUDA development libraries in it, despite those libraries being required to build the extensions themselves. +### PyTorch Nightly + +- [`ghcr.io/coreweave/ml-containers/nightly-torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Fnightly-torch) +- [`ghcr.io/coreweave/ml-containers/nightly-torch-extras`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Fnightly-torch-extras) + +[`ml-containers/nightly-torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Fnightly-torch) +is an experimental, nightly release channel of the +[PyTorch Base Images](#pytorch-base-images) in the style of PyTorch's +own nightly preview builds, featuring the latest development versions of +`torch`, `torchvision`, and `torchaudio` pulled daily from GitHub +and compiled from source. + +[`ml-containers/nightly-torch-extras`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Fnightly-torch-extras) +is a version of [PyTorch Extras](#pytorch-extras) built on top of the +[`ml-containers/nightly-torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Fnightly-torch) +container images. 
+These are not nightly versions of the extensions themselves, but rather match +the extension versions in the regular [PyTorch Extras](#pytorch-extras) containers. + +> ⚠ The *PyTorch Nightly* containers are based on unstable, experimental preview +> builds of PyTorch, and should be expected to contain bugs and other issues. +> For more stable containers use the [PyTorch Base Images](#pytorch-base-images) +> and [PyTorch Extras](#pytorch-extras) containers. + ## Organization This repository contains multiple container image Dockerfiles, each is expected From 5ddf530c006c1aa343f2827a06bb4f4bfc129287 Mon Sep 17 00:00:00 2001 From: Eta Date: Thu, 31 Aug 2023 12:28:51 -0500 Subject: [PATCH 53/63] fix(torch): Install missing runtime dependencies --- torch/Dockerfile | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/torch/Dockerfile b/torch/Dockerfile index b46da7c..438d91f 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -284,11 +284,19 @@ ENV DEBIAN_FRONTEND=noninteractive # Install core packages RUN apt-get -qq update && apt-get -qq install -y \ libncurses5 python3 python3-pip python3-distutils python3-numpy \ - curl git apt-utils ssh ca-certificates tmux nano vim sudo bash rsync \ - htop wget unzip tini && \ + libpng16-16 libjpeg-turbo8 \ + curl git apt-utils ssh ca-certificates tmux nano vim-tiny sudo bash \ + rsync htop wget unzip tini && \ /usr/bin/python3 -m pip install --no-cache-dir --upgrade pip && \ update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \ update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \ + update-alternatives --install /usr/bin/vim vim /usr/bin/vim.tiny 1 && \ + apt-get clean + +RUN apt-get -qq update && apt-get -qq install -y --no-install-recommends \ + software-properties-common && \ + apt-add-repository -y ppa:ubuntu-toolchain-r/test && \ + apt-get -qq install -y --no-install-recommends libstdc++6 && \ apt-get clean ARG BUILD_TORCH_VERSION @@ -317,13 +325,15 @@
RUN export \ libcusparse-${CUDA_PACKAGE_VERSION} \ libcusolver-${CUDA_PACKAGE_VERSION} \ cuda-cupti-${CUDA_PACKAGE_VERSION} \ + libnvjpeg-${CUDA_PACKAGE_VERSION} \ libnvtoolsext1 && \ { if [ $CUDA_MAJOR_VERSION -ge 12 ]; then \ apt-get -qq install --no-upgrade -y libnvjitlink-${CUDA_PACKAGE_VERSION}; fi; } && \ { if [ ! -d /opt/nccl-tests ]; then \ export NCCL_PACKAGE_VERSION="2.*+cuda${CUDA_MAJOR_VERSION}.${CUDA_MINOR_VERSION}" && \ apt-get -qq install --no-upgrade -y "libnccl2=$NCCL_PACKAGE_VERSION"; fi; } && \ - apt-get clean + apt-get clean && \ + ldconfig WORKDIR /usr/src/app From 09e398d209423817fa7575e9e3336a69dd6e20e4 Mon Sep 17 00:00:00 2001 From: Anthony Mercurio Date: Thu, 14 Sep 2023 12:48:18 -0700 Subject: [PATCH 54/63] feat(sd-inference): remove tensorizer pinning --- .github/workflows/sd-serializer.yml | 22 ---------------------- sd-inference/Dockerfile | 23 ++++++++++------------- sd-serializer/Dockerfile | 23 ----------------------- 3 files changed, 10 insertions(+), 58 deletions(-) delete mode 100644 .github/workflows/sd-serializer.yml delete mode 100644 sd-serializer/Dockerfile diff --git a/.github/workflows/sd-serializer.yml b/.github/workflows/sd-serializer.yml deleted file mode 100644 index fc11bfa..0000000 --- a/.github/workflows/sd-serializer.yml +++ /dev/null @@ -1,22 +0,0 @@ -on: - workflow_dispatch: - inputs: - commit: - description: 'Commit to build' - required: true - default: 'master' - push: - paths: - - "sd-serializer/**" - - ".github/workflows/sd-serializer.yml" - - ".github/workflows/build.yml" - - -jobs: - build: - uses: ./.github/workflows/build.yml - secrets: inherit - with: - image-name: sd-serializer - folder: sd-serializer - build-args: "--build-arg COMMIT=${{ github.event.inputs.commit }}" diff --git a/sd-inference/Dockerfile b/sd-inference/Dockerfile index e191876..ef34b07 100644 --- a/sd-inference/Dockerfile +++ b/sd-inference/Dockerfile @@ -1,6 +1,9 @@ -FROM gooseai/torch-base:1.13.1-cuda-1.18-rc4 +FROM 
ghcr.io/coreweave/ml-containers/torch:afecfe9-base-cuda12.0.1-torch2.0.0-vision0.15.1 +ENV DEBIAN_FRONTEND=noninteractive -ENV tenzorizer_commit=35381e3812ba342991d30b71ce257503622ae828 +RUN apt update && apt upgrade -y && \ + apt update && apt install -y python3 python3-pip git curl && \ + apt clean RUN mkdir /app WORKDIR /app @@ -10,15 +13,9 @@ RUN git clone https://github.com/coreweave/kubernetes-cloud && \ cd kubernetes-cloud && \ git checkout ${COMMIT} && \ cd .. && \ - cp kubernetes-cloud/online-inference/stable-diffusion/service/* . + cp kubernetes-cloud/online-inference/stable-diffusion/service/* . && \ + cp kubernetes-cloud/online-inference/stable-diffusion/serializer/serialize.py . && \ + rm -rf kubernetes-cloud -RUN git clone https://github.com/coreweave/tensorizer && \ - cd tensorizer && \ - git checkout ${tenzorizer_commit} && \ - cd .. && \ - mv tensorizer/tensorizer.py . && \ - rm -rf tensorizer - -RUN pip3 install --no-cache-dir -r requirements.txt - -CMD [ "/usr/bin/python3", "service.py" ] +RUN pip3 install --no-cache-dir --upgrade pip && \ + pip3 install --no-cache-dir -r requirements.txt diff --git a/sd-serializer/Dockerfile b/sd-serializer/Dockerfile deleted file mode 100644 index 81e0595..0000000 --- a/sd-serializer/Dockerfile +++ /dev/null @@ -1,23 +0,0 @@ -FROM python:3.9 - -RUN mkdir /app -WORKDIR /app - -ENV tenzorizer_commit=35381e3812ba342991d30b71ce257503622ae828 - -ARG COMMIT=master -RUN git clone https://github.com/coreweave/kubernetes-cloud && \ - cd kubernetes-cloud && \ - git checkout ${COMMIT} && \ - cd .. && \ - cp kubernetes-cloud/online-inference/stable-diffusion/serializer/* . && \ - pip3 install --no-cache-dir -r requirements.txt - -RUN git clone https://github.com/coreweave/tensorizer && \ - cd tensorizer && \ - git checkout ${tenzorizer_commit} && \ - cd .. && \ - mv tensorizer/tensorizer.py . 
&& \ - rm -rf tensorizer - -CMD ["python3", "/app/serialize.py"] \ No newline at end of file From 7d29c615bb45b310c3d144478396d5cef4e02f4e Mon Sep 17 00:00:00 2001 From: Anthony Mercurio Date: Thu, 14 Sep 2023 13:24:44 -0700 Subject: [PATCH 55/63] fix(sd-inference): fix build args --- .github/workflows/sd-inference.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/sd-inference.yml b/.github/workflows/sd-inference.yml index 22167bb..5c349cd 100644 --- a/.github/workflows/sd-inference.yml +++ b/.github/workflows/sd-inference.yml @@ -19,4 +19,5 @@ jobs: with: image-name: sd-inference folder: sd-inference - build-args: "--build-arg COMMIT=${{ github.event.inputs.commit }}" + build-args: | + COMMIT=${{ github.event.inputs.commit }} From c5431c379918a1d72337b598d562b433dd91e60d Mon Sep 17 00:00:00 2001 From: Eta Date: Sun, 24 Sep 2023 14:41:30 -0500 Subject: [PATCH 56/63] build(torch-extras): Avoid compiling DeepSpeed with AIO on torch v2.2+ --- torch-extras/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile index 4b1cd8b..3e4351f 100644 --- a/torch-extras/Dockerfile +++ b/torch-extras/Dockerfile @@ -103,7 +103,7 @@ ARG DEEPSPEED_VERSION SHELL ["/bin/bash", "-c"] RUN python3 -m pip install -U --no-cache-dir \ setuptools wheel pip && \ - if python3 -m pip show torch | grep -F 'Version: 2.1.' > /dev/null; then \ + if python3 -m pip show torch | grep 'Version: 2\.[1-9]' > /dev/null; then \ # DeepSpeed's AIO extension is incompatible with PyTorch 2.1.x's # requirement for C++17 (as of DeepSpeed 0.10.1). 
# See: https://github.com/microsoft/DeepSpeed/pull/3976 From c8ebbd737e3562d1b5ad7f3318912a8a231cad98 Mon Sep 17 00:00:00 2001 From: Eta Date: Sun, 24 Sep 2023 14:41:51 -0500 Subject: [PATCH 57/63] build(torch-extras): Default to using DeepSpeed v0.10.3 --- torch-extras/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile index 3e4351f..5356c3a 100644 --- a/torch-extras/Dockerfile +++ b/torch-extras/Dockerfile @@ -1,7 +1,7 @@ # syntax=docker/dockerfile:1.2 ARG BASE_IMAGE -ARG DEEPSPEED_VERSION="0.10.1" +ARG DEEPSPEED_VERSION="0.10.3" ARG FLASH_ATTN_VERSION="2.0.2" ARG APEX_COMMIT="38a12698bc3cc95987bca270bcd6d025bb0be346" From f40cbd62c9b8aaa5a67abfd01cbd46cd9cf43bea Mon Sep 17 00:00:00 2001 From: Eta Date: Thu, 19 Oct 2023 12:25:19 -0500 Subject: [PATCH 58/63] fix(torch-extras): Suppress pagination when listing torch image tags [skip ci] --- .github/workflows/torch-extras.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/torch-extras.yml b/.github/workflows/torch-extras.yml index eb85364..d5b6ebc 100644 --- a/.github/workflows/torch-extras.yml +++ b/.github/workflows/torch-extras.yml @@ -87,7 +87,7 @@ jobs: run: | RELEASES="$( \ /bin/curl -f -s --oauth2-bearer "$(echo "$BEARER_TOKEN" | base64 -w 0)" \ - https://ghcr.io/v2/coreweave/ml-containers%2Ftorch/tags/list \ + 'https://ghcr.io/v2/coreweave/ml-containers%2Ftorch/tags/list?n=100000' \ | jq -r '.["tags"][]' \ | grep -E '^[0-9a-f]{7}-(base|nccl)-' \ )" && \ From 7a3726f0aa444e5afb28b85599c091fe07b64dec Mon Sep 17 00:00:00 2001 From: Eta Date: Thu, 19 Oct 2023 14:25:05 -0500 Subject: [PATCH 59/63] build(torch): Build with CUDA 12.2.2 and NCCL v2.18.5 [skip ci] --- .github/configurations/torch-base.yml | 2 +- .github/configurations/torch-nccl.yml | 13 ++++++++----- README.md | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/configurations/torch-base.yml 
b/.github/configurations/torch-base.yml index 328a01b..482a335 100644 --- a/.github/configurations/torch-base.yml +++ b/.github/configurations/torch-base.yml @@ -1,4 +1,4 @@ -cuda: [ 12.1.1, 12.0.1, 11.8.0 ] +cuda: [ 12.2.2, 12.1.1, 12.0.1, 11.8.0 ] include: - torch: 2.0.1 vision: 0.15.2 diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml index 0ac1caf..e38bdf7 100644 --- a/.github/configurations/torch-nccl.yml +++ b/.github/configurations/torch-nccl.yml @@ -1,13 +1,16 @@ image: + - cuda: 12.2.2 + nccl: 2.18.6-1 + nccl-tests-hash: c50cbae - cuda: 12.1.1 - nccl: 2.18.3-1 - nccl-tests-hash: 253a5b1 + nccl: 2.18.6-1 + nccl-tests-hash: c50cbae - cuda: 12.0.1 - nccl: 2.18.3-1 - nccl-tests-hash: 253a5b1 + nccl: 2.18.6-1 + nccl-tests-hash: c50cbae - cuda: 11.8.0 nccl: 2.16.2-1 - nccl-tests-hash: 253a5b1 + nccl-tests-hash: c50cbae include: - torch: 2.0.1 vision: 0.15.2 diff --git a/README.md b/README.md index 03244f6..cabc166 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ CoreWeave provides custom builds of and [`torchaudio`](https://github.com/pytorch/audio) tuned for our platform in a single container image, [`ml-containers/torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch). -Versions compiled against CUDA 11.8.0, 12.0.1, and 12.1.1 are available in this repository, with two variants: +Versions compiled against CUDA 11.8.0, 12.0.1, 12.1.1, and 12.2.2 are available in this repository, with two variants: 1. `base`: Tagged as `ml-containers/torch:a1b2c3d-base-...`. 1. Built from [`nvidia/cuda:...-base-ubuntu20.04`](https://hub.docker.com/r/nvidia/cuda/tags?name=base-ubuntu20.04) as a base. 
From 863b9be3fb4fd21d720c22168115add88f5bd3bb Mon Sep 17 00:00:00 2001 From: Eta Date: Thu, 19 Oct 2023 14:52:49 -0500 Subject: [PATCH 60/63] build(torch): Update `nccl-tests` base image tag [skip ci] --- .github/configurations/torch-nccl.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml index e38bdf7..420390a 100644 --- a/.github/configurations/torch-nccl.yml +++ b/.github/configurations/torch-nccl.yml @@ -1,16 +1,16 @@ image: - cuda: 12.2.2 nccl: 2.18.6-1 - nccl-tests-hash: c50cbae + nccl-tests-hash: a6a61ab - cuda: 12.1.1 nccl: 2.18.6-1 - nccl-tests-hash: c50cbae + nccl-tests-hash: a6a61ab - cuda: 12.0.1 nccl: 2.18.6-1 - nccl-tests-hash: c50cbae + nccl-tests-hash: a6a61ab - cuda: 11.8.0 nccl: 2.16.2-1 - nccl-tests-hash: c50cbae + nccl-tests-hash: a6a61ab include: - torch: 2.0.1 vision: 0.15.2 From ca684b487255292245e8e06f3ef160eecc9d48b2 Mon Sep 17 00:00:00 2001 From: Eta Date: Thu, 19 Oct 2023 14:59:45 -0500 Subject: [PATCH 61/63] build(torch): Revert `nccl-tests` base image change for CUDA 12.1 [skip ci] --- .github/configurations/torch-nccl.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml index 420390a..5f67275 100644 --- a/.github/configurations/torch-nccl.yml +++ b/.github/configurations/torch-nccl.yml @@ -3,8 +3,8 @@ image: nccl: 2.18.6-1 nccl-tests-hash: a6a61ab - cuda: 12.1.1 - nccl: 2.18.6-1 - nccl-tests-hash: a6a61ab + nccl: 2.18.3-1 + nccl-tests-hash: 253a5b1 - cuda: 12.0.1 nccl: 2.18.6-1 nccl-tests-hash: a6a61ab From b654c045c6c79bf46bcc31f68c7d56d83dcf765d Mon Sep 17 00:00:00 2001 From: Eta Date: Thu, 19 Oct 2023 15:01:47 -0500 Subject: [PATCH 62/63] build(torch): Fix typo in NCCL versions [skip ci] --- .github/configurations/torch-nccl.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml index 5f67275..3e9aa4f 100644 --- a/.github/configurations/torch-nccl.yml +++ b/.github/configurations/torch-nccl.yml @@ -1,12 +1,12 @@ image: - cuda: 12.2.2 - nccl: 2.18.6-1 + nccl: 2.18.5-1 nccl-tests-hash: a6a61ab - cuda: 12.1.1 nccl: 2.18.3-1 nccl-tests-hash: 253a5b1 - cuda: 12.0.1 - nccl: 2.18.6-1 + nccl: 2.18.5-1 nccl-tests-hash: a6a61ab - cuda: 11.8.0 nccl: 2.16.2-1 From 039d1283b23a22e4aad1e3e40649057c09ceb53e Mon Sep 17 00:00:00 2001 From: Rahul Talari <104786892+rtalaricw@users.noreply.github.com> Date: Tue, 24 Oct 2023 17:04:48 -0400 Subject: [PATCH 63/63] feat(docs): Use tech-docs (backstage) --- catalog.yaml | 20 ++++++++++++++++++++ README.md => docs/README.md | 0 mkdocs.yml | 10 ++++++++++ 3 files changed, 30 insertions(+) create mode 100644 catalog.yaml rename README.md => docs/README.md (100%) create mode 100644 mkdocs.yml diff --git a/catalog.yaml b/catalog.yaml new file mode 100644 index 0000000..f6433d8 --- /dev/null +++ b/catalog.yaml @@ -0,0 +1,20 @@ +--- +apiVersion: backstage.io/v1alpha1 +kind: Component +metadata: + name: ml-containers + annotations: + backstage.io/techdocs-ref: dir:. 
+ description: Optimized images for training/inference on CoreWeave infrastructure + tags: + - ml + # links: + # - title: Deployment Manifests + # url: https://github.com/coreweave/awesome-turtles/tree/main/deploy + # icon: github + customer_impact: true + stateless: false +spec: + type: service + lifecycle: production + owner: group:cw/team_ml \ No newline at end of file diff --git a/README.md b/docs/README.md similarity index 100% rename from README.md rename to docs/README.md diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..e1564cf --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,10 @@ +site_name: ml-containers +plugins: + - techdocs-core +markdown_extensions: + pymdownx.extra: + pymdownx.superfences: + custom_fences: + - name: mermaid + class: mermaid + format: !!python/name:pymdownx.superfences.fence_code_format \ No newline at end of file