diff --git a/.github/configurations/torch-base.yml b/.github/configurations/torch-base.yml new file mode 100644 index 0000000..482a335 --- /dev/null +++ b/.github/configurations/torch-base.yml @@ -0,0 +1,5 @@ +cuda: [ 12.2.2, 12.1.1, 12.0.1, 11.8.0 ] +include: + - torch: 2.0.1 + vision: 0.15.2 + audio: 2.0.2 diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml new file mode 100644 index 0000000..3e9aa4f --- /dev/null +++ b/.github/configurations/torch-nccl.yml @@ -0,0 +1,17 @@ +image: + - cuda: 12.2.2 + nccl: 2.18.5-1 + nccl-tests-hash: a6a61ab + - cuda: 12.1.1 + nccl: 2.18.3-1 + nccl-tests-hash: 253a5b1 + - cuda: 12.0.1 + nccl: 2.18.5-1 + nccl-tests-hash: a6a61ab + - cuda: 11.8.0 + nccl: 2.16.2-1 + nccl-tests-hash: a6a61ab +include: + - torch: 2.0.1 + vision: 0.15.2 + audio: 2.0.2 diff --git a/.github/workflows/bloom.yml b/.github/workflows/bloom.yml index 1aeaa41..8e169ff 100644 --- a/.github/workflows/bloom.yml +++ b/.github/workflows/bloom.yml @@ -10,7 +10,8 @@ on: jobs: build: uses: ./.github/workflows/build.yml - with: + secrets: inherit + with: image-name: bloom folder: bloom - build-args: "" \ No newline at end of file + build-args: "" diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 7eb4616..b2f8d5b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -29,7 +29,7 @@ on: jobs: build: name: Build Images - runs-on: [self-hosted, Linux] + runs-on: [ self-hosted, Linux ] outputs: outcome: ${{ steps.docker-build.outcome }} tags: ${{ steps.meta.outputs.tags }} @@ -38,12 +38,17 @@ jobs: - uses: actions/checkout@v3 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v2.2.1 - - name: Login to container registry - uses: docker/login-action@v2.1.0 + - name: Login to GitHub container registry + uses: docker/login-action@v2.2.0 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} + - name: Login to DockerHub container registry + 
uses: docker/login-action@v2.2.0 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} - name: Get base registry run: | echo "REGISTRY=ghcr.io/${GITHUB_REPOSITORY,,}" >> $GITHUB_ENV @@ -67,7 +72,8 @@ jobs: uses: docker/build-push-action@v3.2.0 with: context: ${{ inputs.folder }} - build-args: ${{ inputs.build-args }} + build-args: |- + ${{ inputs.build-args }} push: true tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} @@ -80,10 +86,10 @@ jobs: - name: Comment if: steps.PR.outputs.number uses: peter-evans/create-or-update-comment@v2.1.0 - with: + with: issue-number: ${{ steps.PR.outputs.number }} body: > - @${{ github.triggering_actor }} Build complete, ${{ steps.docker-build.outcome }}: + @${{ github.triggering_actor }} Build complete, ${{ steps.docker-build.outcome }}: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} - Image: `${{ fromJSON(steps.docker-build.outputs.metadata)['image.name'] }}` \ No newline at end of file + Image: `${{ fromJSON(steps.docker-build.outputs.metadata)['image.name'] }}` diff --git a/.github/workflows/cuda-ssh.yml b/.github/workflows/cuda-ssh.yml index 892e6c3..6ba26ea 100644 --- a/.github/workflows/cuda-ssh.yml +++ b/.github/workflows/cuda-ssh.yml @@ -16,6 +16,7 @@ jobs: - ceeb8c2-nccl-cuda11.8.0-nccl2.16.2-1-torch2.0.1-vision0.15.2-audio2.0.2 uses: ./.github/workflows/build.yml + secrets: inherit with: image-name: cuda-ssh folder: cuda-ssh diff --git a/.github/workflows/gpt-neox-determined.yml b/.github/workflows/gpt-neox-determined.yml index 2ae03f7..4e0a6f4 100644 --- a/.github/workflows/gpt-neox-determined.yml +++ b/.github/workflows/gpt-neox-determined.yml @@ -10,7 +10,8 @@ on: jobs: build: uses: ./.github/workflows/build.yml - with: + secrets: inherit + with: image-name: gpt-neox-determined folder: gpt-neox-determined - build-args: "" \ No newline at end of file + build-args: "" diff --git 
a/.github/workflows/gpt-neox-mpi.yml b/.github/workflows/gpt-neox-mpi.yml index f2c6c0c..aec2a5f 100644 --- a/.github/workflows/gpt-neox-mpi.yml +++ b/.github/workflows/gpt-neox-mpi.yml @@ -10,6 +10,7 @@ on: jobs: build: uses: ./.github/workflows/build.yml + secrets: inherit with: image-name: gpt-neox-mpi folder: gpt-neox-mpi diff --git a/.github/workflows/read-configuration.yml b/.github/workflows/read-configuration.yml new file mode 100644 index 0000000..12a21b3 --- /dev/null +++ b/.github/workflows/read-configuration.yml @@ -0,0 +1,45 @@ +name: read-configuration + +on: + workflow_call: + inputs: + path: + required: true + type: string + filter: + required: false + type: string + outputs: + config: + description: "The retrieved configuration, as JSON" + value: ${{ jobs.read-file.outputs.config }} + +jobs: + read-file: + name: Read Configuration File + runs-on: ["self-hosted", "Linux"] + permissions: {} + outputs: + config: ${{ steps.read.outputs.contents }} + steps: + - uses: actions/checkout@v3 + - name: Read configuration + id: read + env: + FILE_PATH: ${{ inputs.path }} + FILTER: ${{ inputs.filter }} + run: | + set -x; + if [ -n "$FILTER" ]; then + CONTENTS="$(yq e "$FILE_PATH" --expression "$FILTER" -oj -I0)"; + else + CONTENTS="$(yq e "$FILE_PATH" -oj -I0)"; + fi; + echo "contents=$CONTENTS" >> "$GITHUB_OUTPUT"; + + { + echo '## Configuration'; + echo '```json'; + echo "$CONTENTS" | jq .; + echo '```'; + } >> "$GITHUB_STEP_SUMMARY"; diff --git a/.github/workflows/sd-finetuner.yml b/.github/workflows/sd-finetuner.yml index 165a2a5..6e6203c 100644 --- a/.github/workflows/sd-finetuner.yml +++ b/.github/workflows/sd-finetuner.yml @@ -15,7 +15,10 @@ on: jobs: build: uses: ./.github/workflows/build.yml - with: + secrets: inherit + with: image-name: sd-finetuner folder: sd-finetuner - build-args: "--build-arg COMMIT=${{ github.event.inputs.commit }}" \ No newline at end of file + # build.yml forwards this to docker/build-push-action, which takes one KEY=VALUE per line rather than CLI flags + build-args: | + COMMIT=${{ github.event.inputs.commit }} diff --git 
a/.github/workflows/sd-inference.yml b/.github/workflows/sd-inference.yml index 06d9052..5c349cd 100644 --- a/.github/workflows/sd-inference.yml +++ b/.github/workflows/sd-inference.yml @@ -15,7 +15,9 @@ on: jobs: build: uses: ./.github/workflows/build.yml - with: + secrets: inherit + with: image-name: sd-inference folder: sd-inference - build-args: "--build-arg COMMIT=${{ github.event.inputs.commit }}" \ No newline at end of file + build-args: | + COMMIT=${{ github.event.inputs.commit }} diff --git a/.github/workflows/sd-serializer.yml b/.github/workflows/sd-serializer.yml deleted file mode 100644 index e964807..0000000 --- a/.github/workflows/sd-serializer.yml +++ /dev/null @@ -1,21 +0,0 @@ -on: - workflow_dispatch: - inputs: - commit: - description: 'Commit to build' - required: true - default: 'master' - push: - paths: - - "sd-serializer/**" - - ".github/workflows/sd-serializer.yml" - - ".github/workflows/build.yml" - - -jobs: - build: - uses: ./.github/workflows/build.yml - with: - image-name: sd-serializer - folder: sd-serializer - build-args: "--build-arg COMMIT=${{ github.event.inputs.commit }}" \ No newline at end of file diff --git a/.github/workflows/slurm.yml b/.github/workflows/slurm.yml index fef7d0a..1c63a49 100644 --- a/.github/workflows/slurm.yml +++ b/.github/workflows/slurm.yml @@ -21,7 +21,8 @@ jobs: BASE_IMAGE=registry.gitlab.com/coreweave/sunk/slurmd-cw-cu117-extras:bc5a133d uses: ./.github/workflows/build.yml - with: + secrets: inherit + with: image-name: ${{ matrix.image.name }} folder: ${{ matrix.image.folder }} build-args: ${{ matrix.image.build-args }} diff --git a/.github/workflows/tensorizer.yml b/.github/workflows/tensorizer.yml index 5778b9b..a9a870f 100644 --- a/.github/workflows/tensorizer.yml +++ b/.github/workflows/tensorizer.yml @@ -15,7 +15,10 @@ on: jobs: build: uses: ./.github/workflows/build.yml - with: + secrets: inherit + with: image-name: tensorizer folder: tensorizer - build-args: "--build-arg COMMIT=${{ 
github.event.inputs.commit }}" \ No newline at end of file + # build.yml forwards this to docker/build-push-action, which takes one KEY=VALUE per line rather than CLI flags + build-args: | + COMMIT=${{ github.event.inputs.commit }} diff --git a/.github/workflows/torch-base.yml b/.github/workflows/torch-base.yml index 64b3af9..2332664 100644 --- a/.github/workflows/torch-base.yml +++ b/.github/workflows/torch-base.yml @@ -1,26 +1,41 @@ +name: torch-base + on: workflow_dispatch: + inputs: + image-name: + required: false + description: "Custom name under which to publish the resulting container" + type: string + image-tag-suffix: + required: false + description: "Custom tag suffix listing library versions under which to publish the resulting container" + type: string push: paths: - "torch/**" + - ".github/configurations/torch-base.yml" - ".github/workflows/torch-base.yml" - ".github/workflows/torch.yml" - ".github/workflows/build.yml" jobs: + get-config: + name: Get torch:base Config + uses: ./.github/workflows/read-configuration.yml + with: + path: ./.github/configurations/torch-base.yml build: + name: Build torch:base + needs: get-config strategy: - matrix: - cuda: [12.1.1, 12.0.1, 11.8.0] - include: - - torch: 2.0.1 - vision: 0.15.2 - audio: 2.0.2 - + matrix: ${{ fromJSON(needs.get-config.outputs.config) }} uses: ./.github/workflows/torch.yml + secrets: inherit with: - tag: ${{ format('base-cuda{0}-torch{1}-vision{2}-audio{3}', matrix.cuda, matrix.torch, matrix.vision, matrix.audio) }} + image-name: ${{ inputs.image-name }} + tag: ${{ format('{0}-{1}', format('base-cuda{0}', matrix.cuda), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}', matrix.torch, matrix.vision, matrix.audio)) }} builder-base-image: nvidia/cuda:${{ matrix.cuda }}-devel-ubuntu20.04 base-image: nvidia/cuda:${{ matrix.cuda }}-base-ubuntu20.04 torch-version: ${{ matrix.torch }} diff --git a/.github/workflows/torch-extras.yml b/.github/workflows/torch-extras.yml index bd1decf..d5b6ebc 100644 --- a/.github/workflows/torch-extras.yml +++ b/.github/workflows/torch-extras.yml @@ -1,3 +1,5 
@@ +name: torch-extras + on: workflow_call: inputs: @@ -7,29 +9,159 @@ on: base-image: required: true type: string + image-name: + required: false + type: string + skip-bases-check: + required: false + type: boolean + default: true workflow_dispatch: inputs: tag: - required: true + required: false description: "Tag suffix to identify the build" type: string base-image: - required: true + required: false description: "Base image for the build" type: string + image-name: + required: false + description: "Custom name under which to publish the resulting container" + type: string + skip-bases-check: + required: false + description: "Build from one specific image rather than the most recent releases from the main branch" + type: boolean + default: true + + push: + paths: + - "torch-extras/**" + - ".github/workflows/torch-extras.yml" + - ".github/workflows/build.yml" jobs: - build: + get-required-bases: + name: Get Latest Required Base Images + if: inputs.skip-bases-check != true + runs-on: ["self-hosted", "Linux"] + permissions: + packages: read + outputs: + bases-list: ${{ steps.choose-bases.outputs.list }} + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + # *_PROVIDED=true means this push also changed a path that triggers the matching torch-base/torch-nccl workflow (sources, workflow files, or matrix configuration); pre-existing images are only looked up for variants reported as false + - name: Check if torch-extras needs to be rebuilt from previous bases + id: check-changed + run: | + if [ "$EVENT_NAME" = 'push' ]; then \ + CHANGED_FILES="$(git diff --name-only "$BEFORE_HASH" "$AFTER_HASH")" && \ + { \ + echo "$CHANGED_FILES" \ + | grep -P '^(\./)?(torch/|\.github/configurations/torch-base\.yml|\.github/workflows/torch(-base)?\.yml|\.github/workflows/build\.yml)' > /dev/null \ + && echo "BASE_PROVIDED=true" >> "$GITHUB_OUTPUT" \ + || echo "BASE_PROVIDED=false" >> "$GITHUB_OUTPUT"; \ + } && { \ + echo "$CHANGED_FILES" \ + | grep -P '^(\./)?(torch/|\.github/configurations/torch-nccl\.yml|\.github/workflows/torch(-nccl)?\.yml|\.github/workflows/build\.yml)' > /dev/null \ + && echo "NCCL_PROVIDED=true" >> "$GITHUB_OUTPUT" \ + || echo "NCCL_PROVIDED=false" >> "$GITHUB_OUTPUT"; \ + }; \ + else \ + echo "BASE_PROVIDED=false" >> "$GITHUB_OUTPUT" && \ + echo 
"NCCL_PROVIDED=false" >> "$GITHUB_OUTPUT"; + fi + env: + EVENT_NAME: ${{ github.event_name }} + BEFORE_HASH: ${{ github.event.before }} + AFTER_HASH: ${{ github.event.after }} + - name: Get latest torch container releases + if: steps.check-changed.outputs.BASE_PROVIDED != 'true' || steps.check-changed.outputs.NCCL_PROVIDED != 'true' + id: get-latest + run: | + RELEASES="$( \ + /bin/curl -f -s --oauth2-bearer "$(echo "$BEARER_TOKEN" | base64 -w 0)" \ + 'https://ghcr.io/v2/coreweave/ml-containers%2Ftorch/tags/list?n=100000' \ + | jq -r '.["tags"][]' \ + | grep -E '^[0-9a-f]{7}-(base|nccl)-' \ + )" && \ + BASE_RELEASES="$(echo "$RELEASES" | grep -E '^[0-9a-f]{7}-base-')" && \ + NCCL_RELEASES="$(echo "$RELEASES" | grep -E '^[0-9a-f]{7}-nccl-')" && \ + LATEST_BASE_COMMIT="$(echo "$BASE_RELEASES" | tail -1 | cut -c1-7)" && \ + LATEST_NCCL_COMMIT="$(echo "$NCCL_RELEASES" | tail -1 | cut -c1-7)" && \ + LATEST_BASE_IMAGES="$(echo "$BASE_RELEASES" | grep -F "${LATEST_BASE_COMMIT}-")" && \ + LATEST_NCCL_IMAGES="$(echo "$NCCL_RELEASES" | grep -F "${LATEST_NCCL_COMMIT}-")" && \ + echo "LATEST_BASE_IMAGES=$(echo $LATEST_BASE_IMAGES)" >> "$GITHUB_OUTPUT" && \ + echo "LATEST_NCCL_IMAGES=$(echo $LATEST_NCCL_IMAGES)" >> "$GITHUB_OUTPUT" + env: + BEARER_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Choose which torch containers to use as a build base + if: steps.check-changed.outputs.BASE_PROVIDED != 'true' || steps.check-changed.outputs.NCCL_PROVIDED != 'true' + id: choose-bases + run: | + TAG_TO_JSON() { + TAG_PATTERN='^[0-9a-f]{7}-(.*)'; + JSON_REPLACE='{"tag":"\1","image":"ghcr.io/coreweave/ml-containers/torch:\0"}'; + sed -E -e "s@${TAG_PATTERN}@${JSON_REPLACE}@g"; + } && \ + SPLIT_TO_LINES() { xargs -n 1; } && \ + JOIN_LINES() { tr '[:space:]' ',' | sed -e 's/,$//'; } && \ + echo '## Pre-existing `ghcr.io/coreweave/ml-containers/torch` images to build from' >> "$GITHUB_STEP_SUMMARY" && \ + echo "list=[$( \ + ( \ + if [ "$BASE_PROVIDED" = 'false' ]; then \ + echo 
"$LATEST_BASE_IMAGES" | xargs -n 1 echo '-' >> "$GITHUB_STEP_SUMMARY" && \ + echo "$LATEST_BASE_IMAGES"; \ + fi && \ + if [ "$NCCL_PROVIDED" = 'false' ]; then \ + echo "$LATEST_NCCL_IMAGES" | xargs -n 1 echo '-' >> "$GITHUB_STEP_SUMMARY" && \ + echo "$LATEST_NCCL_IMAGES"; \ + fi; \ + ) | SPLIT_TO_LINES | TAG_TO_JSON | JOIN_LINES \ + )]" >> "$GITHUB_OUTPUT"; + env: + BASE_PROVIDED: ${{ steps.check-changed.outputs.BASE_PROVIDED }} + NCCL_PROVIDED: ${{ steps.check-changed.outputs.NCCL_PROVIDED }} + LATEST_BASE_IMAGES: ${{ steps.get-latest.outputs.LATEST_BASE_IMAGES }} + LATEST_NCCL_IMAGES: ${{ steps.get-latest.outputs.LATEST_NCCL_IMAGES }} + + build-call: + name: Build torch-extras via Workflow Call + if: inputs.skip-bases-check strategy: matrix: flash-attn: [ 2.0.2, 1.0.9 ] uses: ./.github/workflows/build.yml + secrets: inherit with: - image-name: torch-extras + image-name: ${{ inputs.image-name || 'torch-extras' }} folder: torch-extras tag-suffix: ${{ inputs.tag }}-flash_attn${{ matrix.flash-attn }} build-args: | BASE_IMAGE=${{ inputs.base-image }} FLASH_ATTN_VERSION=${{ matrix.flash-attn }} + + build-self: + name: Build torch-extras via Event Trigger + needs: get-required-bases + if: needs.get-required-bases.outputs.bases-list && needs.get-required-bases.outputs.bases-list != '[]' + strategy: + matrix: + flash-attn: [ 2.0.2, 1.0.9 ] + bases: ${{ fromJSON(needs.get-required-bases.outputs.bases-list) }} + uses: ./.github/workflows/build.yml + secrets: inherit + with: + image-name: ${{ inputs.image-name || 'torch-extras' }} + folder: torch-extras + tag-suffix: ${{ matrix.bases.tag }}-flash_attn${{ matrix.flash-attn }} + build-args: | + BASE_IMAGE=${{ matrix.bases.image }} + FLASH_ATTN_VERSION=${{ matrix.flash-attn }} diff --git a/.github/workflows/torch-nccl.yml b/.github/workflows/torch-nccl.yml index 7c68903..7523db3 100644 --- a/.github/workflows/torch-nccl.yml +++ b/.github/workflows/torch-nccl.yml @@ -1,35 +1,49 @@ +name: torch-nccl + on: + workflow_call: + 
inputs: + image-name: + required: false + type: string + image-tag-suffix: + required: false + type: string workflow_dispatch: + inputs: + image-name: + required: false + description: "Custom name under which to publish the resulting container" + type: string + image-tag-suffix: + required: false + description: "Custom tag suffix listing library versions under which to publish the resulting container" + type: string push: paths: - "torch/**" + - ".github/configurations/torch-nccl.yml" - ".github/workflows/torch-nccl.yml" - ".github/workflows/torch.yml" - ".github/workflows/build.yml" jobs: + get-config: + name: Get torch:nccl Config + uses: ./.github/workflows/read-configuration.yml + with: + path: ./.github/configurations/torch-nccl.yml build: + name: Build torch:nccl + needs: get-config strategy: - matrix: - image: - - cuda: 12.1.1 - nccl: 2.18.3-1 - nccl-tests-hash: 471f0db - - cuda: 12.0.1 - nccl: 2.18.3-1 - nccl-tests-hash: 471f0db - - cuda: 11.8.0 - nccl: 2.16.2-1 - nccl-tests-hash: 471f0db - include: - - torch: 2.0.1 - vision: 0.15.2 - audio: 2.0.2 - + matrix: ${{ fromJSON(needs.get-config.outputs.config) }} uses: ./.github/workflows/torch.yml + secrets: inherit with: - tag: ${{ format('nccl-cuda{0}-nccl{1}-torch{2}-vision{3}-audio{4}', matrix.image.cuda, matrix.image.nccl, matrix.torch, matrix.vision, matrix.audio) }} + image-name: ${{ inputs.image-name }} + tag: ${{ format('{0}-{1}', format('nccl-cuda{0}-nccl{1}', matrix.image.cuda, matrix.image.nccl), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}', matrix.torch, matrix.vision, matrix.audio)) }} builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-cudnn8-devel-ubuntu20.04-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }} base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-cudnn8-devel-ubuntu20.04-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }} torch-version: ${{ matrix.torch }} diff --git 
a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml new file mode 100644 index 0000000..4c2539f --- /dev/null +++ b/.github/workflows/torch-nightly.yml @@ -0,0 +1,137 @@ +name: torch-nightly + +on: + workflow_dispatch: + schedule: + # At 05:00 UTC (midnight EST) + - cron: "0 5 * * *" + push: + paths: + - "torch/**" + - ".github/configurations/torch-base.yml" + - ".github/configurations/torch-nccl.yml" + - ".github/workflows/torch-nightly.yml" + - ".github/workflows/torch.yml" + - ".github/workflows/build.yml" + + +jobs: + get-nightly-info: + name: + Get Nightly Info + runs-on: [ self-hosted, Linux ] + outputs: + pytorch-commit: ${{ steps.get-hash.outputs.pytorch-commit }} + triton-commit: ${{ steps.get-hash.outputs.triton-commit }} + torchvision-commit: ${{ steps.get-hash.outputs.torchvision-commit }} + torchaudio-commit: ${{ steps.get-hash.outputs.torchaudio-commit }} + version-string: ${{ steps.get-hash.outputs.version-string }} + date: ${{ steps.get-date.outputs.date }} + steps: + - name: Get latest commit hashes + id: get-hash + run: | + set -e; + + FORMAT_COMMIT_LINK() { + printf '[`%.7s`](https://github.com/%s/tree/%s)\n' "$2" "$1" "$2"; + }; + + LOG() { + printf -- "$@" >> "$GITHUB_STEP_SUMMARY"; + }; + + CLONE() { + git clone --filter=blob:none --no-checkout --depth=1 \ + "https://github.com/$1" \ + "$2" > /dev/null 2> /dev/null && \ + local COMMIT="$(git -C "$2" rev-parse HEAD)" && \ + LOG 'Latest `%s` commit: %s\n' \ + "$1" "$(FORMAT_COMMIT_LINK "$1" "$COMMIT")" && \ + echo $COMMIT; + }; + + GET_VERSION() { + git -C "$1" show HEAD:version.txt 2> /dev/null; + }; + + PYTORCH_COMMIT="$(CLONE pytorch/pytorch pytorch-git)"; + PYTORCH_VERSION="$(GET_VERSION pytorch-git)"; + TRITON_COMMIT_FILE=".ci/docker/ci_commit_pins/triton.txt"; + TRITON_COMMIT="$(git -C pytorch-git show "HEAD:$TRITON_COMMIT_FILE" 2> /dev/null)"; + rm -rf pytorch-git; + + LOG 'Corresponding `openai/triton` commit: %s\n' \ + "$(FORMAT_COMMIT_LINK openai/triton 
"$TRITON_COMMIT")"; + + TORCHVISION_COMMIT="$(CLONE pytorch/vision torchvision-git)"; + TORCHVISION_VERSION="$(GET_VERSION torchvision-git)"; + rm -rf torchvision-git; + + TORCHAUDIO_COMMIT="$(CLONE pytorch/audio torchaudio-git)"; + TORCHAUDIO_VERSION="$(GET_VERSION torchaudio-git)"; + rm -rf torchaudio-git; + + echo "pytorch-commit=$PYTORCH_COMMIT" >> "$GITHUB_OUTPUT"; + echo "triton-commit=$TRITON_COMMIT" >> "$GITHUB_OUTPUT"; + echo "torchvision-commit=$TORCHVISION_COMMIT" >> "$GITHUB_OUTPUT"; + echo "torchaudio-commit=$TORCHAUDIO_COMMIT" >> "$GITHUB_OUTPUT"; + + printf -- 'version-string=torch%s-vision%s-audio%s\n' \ + "$PYTORCH_VERSION" "$TORCHVISION_VERSION" "$TORCHAUDIO_VERSION" \ + >> "$GITHUB_OUTPUT"; + - name: Get date + id: get-date + run: echo "date=$(date -u '+%Y.%m.%d.%H')" >> "$GITHUB_OUTPUT"; + + get-base-config: + name: Get torch:base Config + uses: ./.github/workflows/read-configuration.yml + with: + path: ./.github/configurations/torch-base.yml + filter: del(.include) + get-nccl-config: + name: Get torch:nccl Config + uses: ./.github/workflows/read-configuration.yml + with: + path: ./.github/configurations/torch-nccl.yml + filter: del(.include) + + build-base: + name: Build Nightly torch:base + needs: + - get-nightly-info + - get-base-config + strategy: + matrix: ${{ fromJSON(needs.get-base-config.outputs.config) }} + uses: ./.github/workflows/torch.yml + secrets: inherit + with: + image-name: nightly-torch + tag: ${{ format('base-{0}-cuda{1}-{2}', needs.get-nightly-info.outputs.date, matrix.cuda, needs.get-nightly-info.outputs.version-string ) }} + builder-base-image: nvidia/cuda:${{ matrix.cuda }}-devel-ubuntu20.04 + base-image: nvidia/cuda:${{ matrix.cuda }}-base-ubuntu20.04 + torch-version: ${{ needs.get-nightly-info.outputs.pytorch-commit }} + torchvision-version: ${{ needs.get-nightly-info.outputs.torchvision-commit }} + torchaudio-version: ${{ needs.get-nightly-info.outputs.torchaudio-commit }} + triton-version: ${{ 
needs.get-nightly-info.outputs.triton-commit }} + build-extras: true + build-nccl: + name: Build Nightly torch:nccl + needs: + - get-nightly-info + - get-nccl-config + strategy: + matrix: ${{ fromJSON(needs.get-nccl-config.outputs.config) }} + uses: ./.github/workflows/torch.yml + secrets: inherit + with: + image-name: nightly-torch + tag: ${{ format('nccl-{0}-cuda{1}-nccl{2}-{3}', needs.get-nightly-info.outputs.date, matrix.image.cuda, matrix.image.nccl, needs.get-nightly-info.outputs.version-string ) }} + builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-cudnn8-devel-ubuntu20.04-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }} + base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-cudnn8-devel-ubuntu20.04-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }} + torch-version: ${{ needs.get-nightly-info.outputs.pytorch-commit }} + torchvision-version: ${{ needs.get-nightly-info.outputs.torchvision-commit }} + torchaudio-version: ${{ needs.get-nightly-info.outputs.torchaudio-commit }} + triton-version: ${{ needs.get-nightly-info.outputs.triton-commit }} + build-extras: true diff --git a/.github/workflows/torch.yml b/.github/workflows/torch.yml index d61fd03..c3b3f30 100644 --- a/.github/workflows/torch.yml +++ b/.github/workflows/torch.yml @@ -19,10 +19,16 @@ on: torchaudio-version: required: true type: string + triton-version: + required: false + type: string cuda-arch-support: required: false type: string default: "7.0 7.5 8.0 8.6 8.9 9.0+PTX" + image-name: + required: false + type: string build-extras: required: false type: boolean @@ -54,11 +60,19 @@ on: required: true description: "Tagged version number from pytorch/audio to build" type: string + triton-version: + required: false + description: "Tagged version number from openai/triton to build" + type: string cuda-arch-support: required: false description: "Space-separated list of CUDA architectures to support" type: string default: "7.0 7.5 8.0 
8.6 8.9 9.0+PTX" + image-name: + required: false + description: "Custom name under which to publish the resulting container" + type: string build-extras: required: false description: "Whether to build and push a torch-extras container as well" @@ -67,23 +81,29 @@ on: jobs: build: + name: Build torch uses: ./.github/workflows/build.yml + secrets: inherit with: - image-name: torch + image-name: ${{ inputs.image-name || 'torch' }} folder: torch tag-suffix: ${{ inputs.tag }} build-args: | - BUILD_CCACHE_SIZE=1Gi + BUILD_CCACHE_SIZE=20Gi BUILDER_BASE_IMAGE=${{ inputs.builder-base-image }} FINAL_BASE_IMAGE=${{ inputs.base-image }} BUILD_TORCH_VERSION=${{ inputs.torch-version }} BUILD_TORCH_VISION_VERSION=${{ inputs.torchvision-version }} BUILD_TORCH_AUDIO_VERSION=${{ inputs.torchaudio-version }} ${{ inputs.cuda-arch-support && format('BUILD_TORCH_CUDA_ARCH_LIST={0}', inputs.cuda-arch-support) || '' }} + ${{ inputs.triton-version && format('BUILD_TRITON_VERSION={0}', inputs.triton-version) || '' }} build-extras: + name: Build torch-extras if: inputs.build-extras needs: build uses: ./.github/workflows/torch-extras.yml + secrets: inherit with: tag: ${{ inputs.tag }} base-image: ${{ needs.build.outputs.tags }} + image-name: ${{ inputs.image-name && format('{0}-extras', inputs.image-name) || '' }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9d90afc --- /dev/null +++ b/.gitignore @@ -0,0 +1,164 @@ +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE 
directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +# Swap +[._]*.s[a-v][a-z] +!*.svg # comment out if you don't need vector files +[._]*.sw[a-p] +[._]s[a-rt-v][a-z] +[._]ss[a-gi-z] +[._]sw[a-p] + +# Session +Session.vim +Sessionx.vim + +# Temporary +.netrwhist +*~ +# Auto-generated tag files +tags +# Persistent undo +[._]*.un~ + +# -*- mode: gitignore; -*- +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +*.elc +auto-save-list +tramp +.\#* + +# Org-mode +.org-id-locations +*_archive + +# flymake-mode +*_flymake.* + +# eshell files +/eshell/history +/eshell/lastdir + +# elpa packages +/elpa/ + +# reftex files +*.rel + +# AUCTeX auto folder +/auto/ + +# cask packages +.cask/ +dist/ + +# Flycheck +flycheck_*.el + +# server auth directory +/server/ + +# projectiles files +.projectile + +# directory configuration +.dir-locals.el + +# network security +/network-security.data + +# -*- mode: gitignore; -*- +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +*.elc +auto-save-list +tramp +.\#* + +# Org-mode +.org-id-locations +*_archive + +# flymake-mode +*_flymake.* + +# eshell files +/eshell/history +/eshell/lastdir + +# elpa packages +/elpa/ + +# reftex files +*.rel + +# AUCTeX auto folder +/auto/ + +# cask packages +.cask/ +dist/ + +# Flycheck +flycheck_*.el + +# server auth directory +/server/ + +# projectiles files +.projectile + +# directory configuration +.dir-locals.el + +# network security +/network-security.data + +# local environment files +.env +.env* +.environment +.environment* diff --git a/README.md b/README.md deleted file mode 100644 index 8780cfb..0000000 --- a/README.md +++ /dev/null @@ -1,22 +0,0 @@ -# ml-containers - -Repository for building ML images at CoreWeave - -## Organization -This repository contains multiple container image Dockerfiles, each is expected -to be within its own folder along 
with any other needed files for the build. - -## CI Builds (Actions) -The current CI builds are setup to run when changes to files in the respective -folders are detected so that only the changed container images are built. The -actions are setup with an action per image utilizing a reusable base action -[build.yml](.github/workflows/build.yml). The reusable action accepts several inputs: - -- `folder` - the folder containing the dockerfile for the image -- `image-name` - the name to use for the image -- `build-args` - arguments to pass to the docker build - -Images built using the same source can utilize one action as the main reason for -the multiple actions is to handle only building the changed images. A build -matrix can be helpful for these cases -https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs. diff --git a/catalog.yaml b/catalog.yaml new file mode 100644 index 0000000..f6433d8 --- /dev/null +++ b/catalog.yaml @@ -0,0 +1,20 @@ +--- +apiVersion: backstage.io/v1alpha1 +kind: Component +metadata: + name: ml-containers + annotations: + backstage.io/techdocs-ref: dir:. + description: Optimized images for training/inference on CoreWeave infrastructure + tags: + - ml + # links: + # - title: Deployment Manifests + # url: https://github.com/coreweave/awesome-turtles/tree/main/deploy + # icon: github + customer_impact: true + stateless: false +spec: + type: service + lifecycle: production + owner: group:cw/team_ml \ No newline at end of file diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..cabc166 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,104 @@ +# ml-containers + +Repository for building ML images at CoreWeave + + +## Index + +See the [list of all published images](https://github.com/orgs/coreweave/packages?repo_name=ml-containers). 
+ +Special PyTorch Images: + +- [PyTorch Base Images](#pytorch-base-images) +- [PyTorch Extras](#pytorch-extras) +- [PyTorch Nightly](#pytorch-nightly) + +### PyTorch Base Images + +- [`ghcr.io/coreweave/ml-containers/torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch) + +CoreWeave provides custom builds of +[PyTorch](https://github.com/pytorch/pytorch), +[`torchvision`](https://github.com/pytorch/vision) +and [`torchaudio`](https://github.com/pytorch/audio) +tuned for our platform in a single container image, [`ml-containers/torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch). + +Versions compiled against CUDA 11.8.0, 12.0.1, 12.1.1, and 12.2.2 are available in this repository, with two variants: + +1. `base`: Tagged as `ml-containers/torch:a1b2c3d-base-...`. + 1. Built from [`nvidia/cuda:...-base-ubuntu20.04`](https://hub.docker.com/r/nvidia/cuda/tags?name=base-ubuntu20.04) as a base. + 2. Only includes essentials (CUDA, `torch`, `torchvision`, `torchaudio`), + so it has a small image size, making it fast to launch. +2. `nccl`: Tagged as `ml-containers/torch:a1b2c3d-nccl-...`. + 1. Built from [`ghcr.io/coreweave/nccl-tests`](https://github.com/coreweave/nccl-tests/pkgs/container/nccl-tests) as a base. + 2. Ultimately inherits from [`nvidia/cuda:...-cudnn8-devel-ubuntu20.04`](https://hub.docker.com/r/nvidia/cuda/tags?name=cudnn8-devel-ubuntu20.04). + 3. Larger, but includes development libraries and build tools such as `nvcc` necessary for compiling other PyTorch extensions. + 4. These PyTorch builds are built on component libraries optimized for the CoreWeave cloud—see + [`coreweave/nccl-tests`](https://github.com/coreweave/nccl-tests/blob/master/README.md). 
+ +### PyTorch Extras + +- [`ghcr.io/coreweave/ml-containers/torch-extras`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch-extras) + +[`ml-containers/torch-extras`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch-extras) +extends the [`ml-containers/torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch) +images with a set of common PyTorch extensions: + +1. [DeepSpeed](https://github.com/microsoft/DeepSpeed) +2. [FlashAttention](https://github.com/Dao-AILab/flash-attention) +3. [NVIDIA Apex](https://github.com/NVIDIA/apex) + +Each one is compiled specially against the custom PyTorch builds in [`ml-containers/torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch). + +Both `base` and `nccl` editions are available for +[`ml-containers/torch-extras`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch-extras) +matching those for +[`ml-containers/torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch). +The `base` edition retains a small size, as a multi-stage build is used to avoid including +CUDA development libraries in it, despite those libraries being required to build +the extensions themselves. 
+ +### PyTorch Nightly + +- [`ghcr.io/coreweave/ml-containers/nightly-torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Fnightly-torch) +- [`ghcr.io/coreweave/ml-containers/nightly-torch-extras`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Fnightly-torch-extras) + +[`ml-containers/nightly-torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Fnightly-torch) +is an experimental, nightly release channel of the +[PyTorch Base Images](#pytorch-base-images) in the style of PyTorch's +own nightly preview builds, featuring the latest development versions of +`torch`, `torchvision`, and `torchaudio` pulled daily from GitHub +and compiled from source. + +[`ml-containers/nightly-torch-extras`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Fnightly-torch-extras) +is a version of [PyTorch Extras](#pytorch-extras) built on top of the +[`ml-containers/nightly-torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Fnightly-torch) +container images. +These are not nightly versions of the extensions themselves, but rather match +the extension versions in the regular [PyTorch Extras](#pytorch-extras) containers. + +> ⚠ The *PyTorch Nightly* containers are based on unstable, experimental preview +> builds of PyTorch, and should be expected to contain bugs and other issues. +> For more stable containers, use the [PyTorch Base Images](#pytorch-base-images) +> and [PyTorch Extras](#pytorch-extras) containers. + + +## Organization +This repository contains multiple container image Dockerfiles; each is expected +to be within its own folder along with any other needed files for the build. + + +## CI Builds (Actions) +The current CI builds are set up to run when changes to files in the respective +folders are detected so that only the changed container images are built.
The +actions are set up with an action per image utilizing a reusable base action +[build.yml](.github/workflows/build.yml). The reusable action accepts several inputs: + +- `folder` - the folder containing the dockerfile for the image +- `image-name` - the name to use for the image +- `build-args` - arguments to pass to the docker build + +Images built using the same source can utilize one action as the main reason for +the multiple actions is to handle only building the changed images. A build +matrix can be helpful for these cases +https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs. diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..e1564cf --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,10 @@ +site_name: ml-containers +plugins: + - techdocs-core +markdown_extensions: + pymdownx.extra: + pymdownx.superfences: + custom_fences: + - name: mermaid + class: mermaid + format: !!python/name:pymdownx.superfences.fence_code_format \ No newline at end of file diff --git a/sd-inference/Dockerfile b/sd-inference/Dockerfile index e191876..ef34b07 100644 --- a/sd-inference/Dockerfile +++ b/sd-inference/Dockerfile @@ -1,6 +1,9 @@ -FROM gooseai/torch-base:1.13.1-cuda-1.18-rc4 +FROM ghcr.io/coreweave/ml-containers/torch:afecfe9-base-cuda12.0.1-torch2.0.0-vision0.15.1 +ENV DEBIAN_FRONTEND=noninteractive -ENV tenzorizer_commit=35381e3812ba342991d30b71ce257503622ae828 +RUN apt update && apt upgrade -y && \ + apt update && apt install -y python3 python3-pip git curl && \ + apt clean RUN mkdir /app WORKDIR /app @@ -10,15 +13,9 @@ RUN git clone https://github.com/coreweave/kubernetes-cloud && \ cd kubernetes-cloud && \ git checkout ${COMMIT} && \ cd .. && \ - cp kubernetes-cloud/online-inference/stable-diffusion/service/* . + cp kubernetes-cloud/online-inference/stable-diffusion/service/* . && \ + cp kubernetes-cloud/online-inference/stable-diffusion/serializer/serialize.py . 
&& \ + rm -rf kubernetes-cloud -RUN git clone https://github.com/coreweave/tensorizer && \ - cd tensorizer && \ - git checkout ${tenzorizer_commit} && \ - cd .. && \ - mv tensorizer/tensorizer.py . && \ - rm -rf tensorizer - -RUN pip3 install --no-cache-dir -r requirements.txt - -CMD [ "/usr/bin/python3", "service.py" ] +RUN pip3 install --no-cache-dir --upgrade pip && \ + pip3 install --no-cache-dir -r requirements.txt diff --git a/sd-serializer/Dockerfile b/sd-serializer/Dockerfile deleted file mode 100644 index 81e0595..0000000 --- a/sd-serializer/Dockerfile +++ /dev/null @@ -1,23 +0,0 @@ -FROM python:3.9 - -RUN mkdir /app -WORKDIR /app - -ENV tenzorizer_commit=35381e3812ba342991d30b71ce257503622ae828 - -ARG COMMIT=master -RUN git clone https://github.com/coreweave/kubernetes-cloud && \ - cd kubernetes-cloud && \ - git checkout ${COMMIT} && \ - cd .. && \ - cp kubernetes-cloud/online-inference/stable-diffusion/serializer/* . && \ - pip3 install --no-cache-dir -r requirements.txt - -RUN git clone https://github.com/coreweave/tensorizer && \ - cd tensorizer && \ - git checkout ${tenzorizer_commit} && \ - cd .. && \ - mv tensorizer/tensorizer.py . 
&& \ - rm -rf tensorizer - -CMD ["python3", "/app/serialize.py"] \ No newline at end of file diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile index 06adf5b..f2ee465 100644 --- a/torch-extras/Dockerfile +++ b/torch-extras/Dockerfile @@ -1,10 +1,10 @@ # syntax=docker/dockerfile:1.2 ARG BASE_IMAGE -ARG DEEPSPEED_VERSION="0.9.4" +ARG DEEPSPEED_VERSION="0.10.3" ARG FLASH_ATTN_VERSION="2.0.2" -ARG APEX_COMMIT="7b2e71b0d4013f8e2f9f1c8dd21980ff1d76f1b6" -ARG XFORMERS_VERSION="0.0.20" +ARG APEX_COMMIT="38a12698bc3cc95987bca270bcd6d025bb0be346" +ARG XFORMERS_VERSION="0.0.22" FROM alpine/git:2.36.3 as flash-attn-downloader WORKDIR /git @@ -44,16 +44,31 @@ RUN export \ cuda-nvprof-${CUDA_PACKAGE_VERSION} \ cuda-profiler-api-${CUDA_PACKAGE_VERSION} \ libaio-dev \ - ninja-build \ - # gcc-10/g++-10/lld do not need to be installed here, but they improve the build. - # gfortran-10 is just for compiler_wrapper.f95. - gcc-10 g++-10 gfortran-10 lld && \ + ninja-build && \ + apt-get clean + +# Add Kitware's apt repository to get a newer version of CMake +RUN apt-get -qq update && apt-get -qq install -y \ + software-properties-common lsb-release && \ + { wget -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc \ + | gpg --dearmor -o /etc/apt/trusted.gpg.d/kitware.gpg; } && \ + apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \ + apt-get -qq update && apt-get -qq install -y cmake && apt-get clean + +# Update compiler (GCC) and linker (LLD) versions +# gfortran-11 is just for compiler_wrapper.f95 +RUN CODENAME="$(lsb_release -cs)" && \ + wget -qO - 'https://apt.llvm.org/llvm-snapshot.gpg.key' > /etc/apt/trusted.gpg.d/apt.llvm.org.asc && \ + apt-add-repository "deb https://apt.llvm.org/$CODENAME/ llvm-toolchain-$CODENAME-17 main" && \ + apt-add-repository -y ppa:ubuntu-toolchain-r/test && \ + apt-get -qq update && apt-get -qq install --no-install-recommends -y \ + gcc-11 g++-11 gfortran-11 lld-17 && \ apt-get clean && \ - 
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 10 && \ - update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 10 && \ + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 && \ + update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11 && \ update-alternatives --install \ - /usr/bin/gfortran gfortran /usr/bin/gfortran-10 10 && \ - update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld 1 + /usr/bin/gfortran gfortran /usr/bin/gfortran-11 11 && \ + update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld-17 1 RUN mkdir /wheels /build WORKDIR /build @@ -89,6 +104,12 @@ ARG DEEPSPEED_VERSION SHELL ["/bin/bash", "-c"] RUN python3 -m pip install -U --no-cache-dir \ setuptools wheel pip && \ + if python3 -m pip show torch | grep 'Version: 2\.[1-9]' > /dev/null; then \ + # DeepSpeed's AIO extension is incompatible with PyTorch 2.1.x's + # requirement for C++17 (as of DeepSpeed 0.10.1). + # See: https://github.com/microsoft/DeepSpeed/pull/3976 + export DS_BUILD_AIO='0'; \ + fi && \ { \ # DeepSpeed doesn't handle blank environment variables # in the same way as unset ones, so clear any blank ones. 
@@ -116,20 +137,6 @@ SHELL ["/bin/sh", "-c"] WORKDIR /wheels -FROM builder-base as xformers-builder - -ARG XFORMERS_VERSION - -RUN python3 -m pip install -U --no-cache-dir \ - setuptools wheel pip && \ - CC=$(realpath -e ./compiler) \ - MAX_JOBS=$(($(./effective_cpu_count.sh) / 2 + 1)) \ - python3 -m pip wheel -w /wheels -v \ - --no-cache-dir --no-build-isolation --no-deps \ - --no-binary=xformers \ - xformers==${XFORMERS_VERSION} - - FROM builder-base as flash-attn-builder RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,target=flash-attention/,rw \ @@ -167,18 +174,24 @@ RUN LIBNCCL2_VERSION=$(dpkg-query --showformat='${Version}' --show libnccl2) && libnccl-dev=$LIBNCCL2_VERSION && \ apt-get clean +# --distributed_adam, --distributed_lamb, and --group_norm aren't documented +# in the Apex README, but are defined in its setup.py config. RUN --mount=type=bind,from=apex-downloader,source=/git/apex,target=apex/,rw \ python3 -m pip install -U --no-cache-dir \ packaging setuptools wheel pip && \ export CC=$(realpath -e ./compiler) && \ export MAX_JOBS=$(($(./effective_cpu_count.sh) + 2)) && \ - EXTENSIONS=$(printf -- '--config-settings "--build-option=%s" ' $( \ + export NVCC_APPEND_FLAGS='-diag-suppress 186,177' && \ + printf -- '--config-settings="--build-option=%s" ' $( \ echo \ --cpp_ext \ --cuda_ext \ + --distributed_adam \ + --distributed_lamb \ --permutation_search \ --xentropy \ --focal_loss \ + --group_norm \ --index_mul_2d \ --deprecated_fused_adam \ --deprecated_fused_lamb \ @@ -195,15 +208,28 @@ RUN --mount=type=bind,from=apex-downloader,source=/git/apex,target=apex/,rw \ --cudnn_gbn \ --fused_conv_bias_relu; \ fi; \ - )) && \ + ) > ./apex-extensions.conf && \ + echo "Extensions: $(cat ./apex-extensions.conf)" && \ cd apex && \ - python3 -m pip wheel -w /wheels -v \ - --no-cache-dir --no-build-isolation --no-deps \ - $EXTENSIONS ./ + xargs -a ../apex-extensions.conf python3 -m pip wheel -w /wheels -v --no-cache-dir 
--no-build-isolation --no-deps ./ WORKDIR /wheels +FROM builder-base as xformers-builder + +ARG XFORMERS_VERSION + +RUN python3 -m pip install -U --no-cache-dir \ + setuptools wheel pip && \ + CC=$(realpath -e ./compiler) \ + MAX_JOBS=$(($(./effective_cpu_count.sh) / 2 + 1)) \ + python3 -m pip wheel -w /wheels -v \ + --no-cache-dir --no-build-isolation --no-deps \ + --no-binary=xformers \ + xformers==${XFORMERS_VERSION} + + FROM ${BASE_IMAGE} RUN apt-get -qq update && \ diff --git a/torch/Dockerfile b/torch/Dockerfile index 9f2f2d2..438d91f 100755 --- a/torch/Dockerfile +++ b/torch/Dockerfile @@ -1,57 +1,91 @@ -# syntax=docker/dockerfile:1.2 +# syntax=docker/dockerfile:1.4 ARG BUILDER_BASE_IMAGE="nvidia/cuda:12.0.1-devel-ubuntu20.04" ARG FINAL_BASE_IMAGE="nvidia/cuda:12.0.1-base-ubuntu20.04" ARG BUILD_TORCH_VERSION="2.0.1" ARG BUILD_TORCH_VISION_VERSION="0.15.2" ARG BUILD_TORCH_AUDIO_VERSION="2.0.2" +ARG BUILD_TRITON_VERSION="" ARG BUILD_TORCH_CUDA_ARCH_LIST="6.0 6.1 6.2 7.0 7.2 7.5 8.0 8.6 8.9 9.0+PTX" # 8.7 is supported in the PyTorch main branch, but not 2.0.0 # Clone PyTorch repositories independently from all other build steps # for cache-friendliness and parallelization -FROM alpine/git:2.36.3 as pytorch-downloader +FROM alpine/git:2.40.1 as downloader-base WORKDIR /git +RUN git config --global advice.detachedHead false + +COPY <<-"EOT" /git/clone.sh + #!/bin/sh + REPO="https://github.com/$1"; + DEST="$2"; + REF="$3"; + + CLONE() { git clone -j8 --depth=1 --filter=blob:none "$@"; }; + + # Try cloning REF as a tag prefixed with "v", otherwise fall back + # to git checkout for commit hashes + CLONE --recurse-submodules --shallow-submodules --also-filter-submodules \ + "$REPO" -b "v$REF" "$DEST" || { \ + CLONE --no-single-branch --no-checkout "$REPO" "$DEST" && \ + git -C "$DEST" checkout "$REF" && \ + git -C "$DEST" submodule update --init --filter=blob:none --depth=1 --recursive --jobs 8; \ + }; +EOT + +RUN chmod 755 /git/clone.sh + + +FROM downloader-base as 
pytorch-downloader ARG BUILD_TORCH_VERSION -RUN git clone --recurse-submodules --shallow-submodules -j8 --depth 1 \ - https://github.com/pytorch/pytorch -b v${BUILD_TORCH_VERSION} && \ +RUN ./clone.sh pytorch/pytorch pytorch "${BUILD_TORCH_VERSION}" && \ rm -rf pytorch/.git -FROM alpine/git:2.36.3 as torchvision-downloader -WORKDIR /git +FROM downloader-base as torchvision-downloader ARG BUILD_TORCH_VISION_VERSION -RUN git clone --recurse-submodules --shallow-submodules -j8 --depth 1 \ - https://github.com/pytorch/vision -b v${BUILD_TORCH_VISION_VERSION} && \ +RUN ./clone.sh pytorch/vision vision "${BUILD_TORCH_VISION_VERSION}" && \ rm -rf vision/.git -FROM alpine/git:2.36.3 as torchaudio-downloader -WORKDIR /git +FROM downloader-base as torchaudio-downloader ARG BUILD_TORCH_AUDIO_VERSION -RUN git clone --recurse-submodules --shallow-submodules -j8 --depth 1 \ - https://github.com/pytorch/audio -b v${BUILD_TORCH_AUDIO_VERSION} +RUN ./clone.sh pytorch/audio audio "${BUILD_TORCH_AUDIO_VERSION}" # The torchaudio build requires that this directory remain a full git repository, # so no rm -rf audio/.git is done for this one. +FROM downloader-base as triton-downloader +ARG BUILD_TRITON_VERSION +RUN if [ -n "${BUILD_TRITON_VERSION}" ]; then \ + ./clone.sh openai/triton triton "${BUILD_TRITON_VERSION}"; \ + else \ + mkdir triton; \ + fi; + + ## Build PyTorch on a builder image. 
FROM ${BUILDER_BASE_IMAGE} as builder ENV DEBIAN_FRONTEND=noninteractive ARG BUILD_CCACHE_SIZE="1Gi" -# ninja-build, ccache, gcc-10, g++-10, and lld are optional but improve the build +# ninja-build, ccache, and lld are optional but improve the build RUN apt-get -qq update && apt-get -qq install -y \ libncurses5 python3 python3-pip git apt-utils ssh ca-certificates \ libpng-dev libjpeg-dev pkg-config python3-distutils python3-numpy \ - build-essential ninja-build ccache gcc-10 g++-10 lld && \ + build-essential ninja-build && \ + apt-get clean && \ + /usr/bin/python3 -m pip install --no-cache-dir --upgrade pip && \ update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \ - update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \ - update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 10 && \ - update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 10 && \ - update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld 1 && \ + update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 + +RUN mkdir /tmp/ccache-install && \ + cd /tmp/ccache-install && \ + CCACHE_URL='https://github.com/ccache/ccache/releases/download/v4.8.2/ccache-4.8.2-linux-x86_64.tar.xz' && \ + wget -qO - $CCACHE_URL | tar --strip-components 1 -xJf - && \ + make install && \ + cd .. 
&& \ + rm -rf /tmp/ccache-install && \ ccache -M "${BUILD_CCACHE_SIZE}" && \ - ccache -F 0 && \ - pip3 install --no-cache-dir --upgrade pip && \ - apt-get clean + ccache -F 0 # Build-time environment variables ENV CCACHE_DIR=/ccache \ @@ -62,23 +96,50 @@ ENV CCACHE_DIR=/ccache \ # Add Kitware's apt repository to get a newer version of CMake RUN apt-get -qq update && apt-get -qq install -y \ software-properties-common lsb-release && \ - { wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null \ + { wget -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc \ | gpg --dearmor -o /etc/apt/trusted.gpg.d/kitware.gpg; } && \ apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \ apt-get -qq update && apt-get -qq install -y cmake && apt-get clean +# Update compiler (GCC) and linker (LLD) versions +RUN CODENAME="$(lsb_release -cs)" && \ + wget -qO - 'https://apt.llvm.org/llvm-snapshot.gpg.key' > /etc/apt/trusted.gpg.d/apt.llvm.org.asc && \ + apt-add-repository "deb https://apt.llvm.org/$CODENAME/ llvm-toolchain-$CODENAME-17 main" && \ + apt-add-repository -y ppa:ubuntu-toolchain-r/test && \ + apt-get -qq update && apt-get -qq install --no-install-recommends -y \ + gcc-11 g++-11 lld-17 && \ + apt-get clean && \ + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 && \ + update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11 && \ + update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld-17 1 + RUN mkdir /build /build/dist WORKDIR /build +COPY --chmod=755 effective_cpu_count.sh . 
+ +COPY <<-"EOT" /build/version-string.sh + #!/bin/sh + set -x; + VERSION="$1"; + + IS_HASH() { + echo "$1" | grep -qxiEe '[0-9a-f]{40}'; + }; + + if IS_HASH "$VERSION"; then + REAL_VERSION="$(cat ./version.txt)"; + SHORT_HASH="$(echo "$VERSION" | cut -c1-7)"; + echo "$REAL_VERSION+$SHORT_HASH"; + else + echo "$VERSION"; + fi; +EOT +RUN chmod 755 /build/version-string.sh ## Build torch RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch/ \ cd pytorch && pip3 install --no-cache-dir -r requirements.txt -ARG BUILD_TORCH_VERSION -ARG BUILD_TORCH_CUDA_ARCH_LIST -ENV TORCH_VERSION=$BUILD_TORCH_VERSION -ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST - # Build tool & library paths, shared for all libraries to be built ENV CMAKE_PREFIX_PATH=/usr/bin/ \ LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64:/usr/local/lib \ @@ -86,6 +147,21 @@ ENV CMAKE_PREFIX_PATH=/usr/bin/ \ CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda/ \ CUDNN_LIB_DIR=/usr/local/cuda/lib64 +ARG BUILD_TRITON_VERSION +RUN --mount=type=bind,from=triton-downloader,source=/git/triton,target=triton/,rw \ + --mount=type=cache,target=/ccache \ + if [ -n "$BUILD_TRITON_VERSION" ]; then \ + export MAX_JOBS="$(./effective_cpu_count.sh)" && \ + cd triton/python && \ + python -m pip wheel -w wheels/ --no-build-isolation --no-deps -vv . && \ + pip install wheels/*.whl; \ + fi + +ARG BUILD_TORCH_VERSION +ARG BUILD_TORCH_CUDA_ARCH_LIST +ENV TORCH_VERSION=$BUILD_TORCH_VERSION +ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST + # If the directory /opt/nccl-tests exists, # the base image is assumed to be nccl-tests, # so it uses the system's special NCCL and UCC installations for the build. @@ -101,6 +177,7 @@ ENV CMAKE_PREFIX_PATH=/usr/bin/ \ # remain the same. 
RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch/,rw \ --mount=type=cache,target=/ccache \ + export MAX_JOBS="$(./effective_cpu_count.sh)" && \ cd pytorch && \ mkdir build && \ ln -s /usr/bin/cc build/cc && \ @@ -123,7 +200,7 @@ RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch CXX=c++ \ USE_EIGEN_FOR_BLAS=ON \ USE_MKL=OFF \ - PYTORCH_BUILD_VERSION="${TORCH_VERSION}" \ + PYTORCH_BUILD_VERSION="$(../version-string.sh "$TORCH_VERSION")" \ PYTORCH_BUILD_NUMBER=0 \ TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ python3 setup.py bdist_wheel --dist-dir ../dist @@ -136,6 +213,7 @@ RUN pip3 install --no-cache-dir --upgrade \ matplotlib numpy typing_extensions requests pillow RUN --mount=type=bind,from=torchvision-downloader,source=/git/vision,target=vision/,rw \ + export MAX_JOBS="$(./effective_cpu_count.sh)" && \ cd vision && \ mkdir build && \ ln -s /usr/bin/cc build/cc && \ @@ -159,7 +237,7 @@ RUN --mount=type=bind,from=torchvision-downloader,source=/git/vision,target=visi CXX=c++ \ USE_EIGEN_FOR_BLAS=ON \ USE_MKL=OFF \ - BUILD_VERSION="${TORCH_VISION_VERSION}" \ + BUILD_VERSION="$(../version-string.sh "$TORCH_VISION_VERSION")" \ TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ python3 setup.py bdist_wheel --dist-dir ../dist @@ -170,6 +248,7 @@ RUN pip3 install --no-cache-dir --upgrade \ matplotlib numpy typing_extensions requests pillow RUN --mount=type=bind,from=torchaudio-downloader,source=/git/audio,target=audio/,rw \ + export MAX_JOBS="$(./effective_cpu_count.sh)" && \ cd audio && \ mkdir build && \ ln -s /usr/bin/cc build/cc && \ @@ -193,7 +272,7 @@ RUN --mount=type=bind,from=torchaudio-downloader,source=/git/audio,target=audio/ CXX=c++ \ USE_EIGEN_FOR_BLAS=ON \ USE_MKL=OFF \ - BUILD_VERSION="${TORCH_AUDIO_VERSION}" \ + BUILD_VERSION="$(../version-string.sh "$TORCH_AUDIO_VERSION")" \ TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ python3 setup.py bdist_wheel --dist-dir ../dist @@ -205,11 +284,19 @@ ENV 
DEBIAN_FRONTEND=noninteractive # Install core packages RUN apt-get -qq update && apt-get -qq install -y \ libncurses5 python3 python3-pip python3-distutils python3-numpy \ - curl git apt-utils ssh ca-certificates tmux nano vim sudo bash rsync \ - htop wget unzip tini && \ + libpng16-16 libjpeg-turbo8 \ + curl git apt-utils ssh ca-certificates tmux nano vim-tiny sudo bash \ + rsync htop wget unzip tini && \ + /usr/bin/python3 -m pip install --no-cache-dir --upgrade pip && \ update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \ update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \ - pip3 install --no-cache-dir --upgrade pip && \ + update-alternatives --install /usr/bin/vim vim /usr/bin/vim.tiny 1 && \ + apt-get clean + +RUN apt-get -qq update && apt-get -qq install -y --no-install-recommends \ + software-properties-common && \ + apt-add-repository -y ppa:ubuntu-toolchain-r/test && \ + apt-get -qq install -y --no-install-recommends libstdc++6 && \ apt-get clean ARG BUILD_TORCH_VERSION @@ -238,13 +325,15 @@ RUN export \ libcusparse-${CUDA_PACKAGE_VERSION} \ libcusolver-${CUDA_PACKAGE_VERSION} \ cuda-cupti-${CUDA_PACKAGE_VERSION} \ + libnvjpeg-${CUDA_PACKAGE_VERSION} \ libnvtoolsext1 && \ { if [ $CUDA_MAJOR_VERSION -ge 12 ]; then \ apt-get -qq install --no-upgrade -y libnvjitlink-${CUDA_PACKAGE_VERSION}; fi; } && \ { if [ ! 
-d /opt/nccl-tests ]; then \ export NCCL_PACKAGE_VERSION="2.*+cuda${CUDA_MAJOR_VERSION}.${CUDA_MINOR_VERSION}" && \ apt-get -qq install --no-upgrade -y "libnccl2=$NCCL_PACKAGE_VERSION"; fi; } && \ - apt-get clean + apt-get clean && \ + ldconfig WORKDIR /usr/src/app diff --git a/torch/effective_cpu_count.sh b/torch/effective_cpu_count.sh new file mode 100755 index 0000000..029ecbc --- /dev/null +++ b/torch/effective_cpu_count.sh @@ -0,0 +1,32 @@ +#!/bin/sh + +CPU_QUOTA() ( + CGROUP='/sys/fs/cgroup'; + CGROUP_V1="$CGROUP/cpu,cpuacct"; + CGROUP_V1_QUOTA="$CGROUP_V1/cpu.cfs_quota_us"; + CGROUP_V1_PERIOD="$CGROUP_V1/cpu.cfs_period_us"; + CGROUP_V2="$CGROUP/user.slice/cpu.max"; + if [ ! -d "$CGROUP" ]; then + return 1; + elif [ -f "$CGROUP_V1_QUOTA" ] && [ -f "$CGROUP_V1_PERIOD" ]; then + IFS='' read -r QUOTA 2> /dev/null < "$CGROUP_V1_QUOTA" || return 1; + IFS='' read -r PERIOD 2> /dev/null < "$CGROUP_V1_PERIOD" || return 1; + elif [ -f "$CGROUP_V2" ]; then + IFS=' ' read -r QUOTA PERIOD 2> /dev/null < "$CGROUP_V2" || return 1; + else + return 1; + fi; + + if [ "$QUOTA" -gt 0 ] 2> /dev/null && [ "$PERIOD" -gt 0 ] 2> /dev/null; then + echo $((QUOTA / PERIOD)); + return 0; + else + return 1; + fi; +) + +EFFECTIVE_CPU_COUNT() { + CPU_QUOTA || getconf _NPROCESSORS_ONLN; +} + +EFFECTIVE_CPU_COUNT;