diff --git a/.github/configurations/torch-base.yml b/.github/configurations/torch-base.yml
new file mode 100644
index 0000000..482a335
--- /dev/null
+++ b/.github/configurations/torch-base.yml
@@ -0,0 +1,12 @@
+# Build matrix for the torch:base images.
+# Versions are quoted so they can never be re-typed as numbers.
+cuda:
+  - "12.2.2"
+  - "12.1.1"
+  - "12.0.1"
+  - "11.8.0"
+# Library versions paired with the CUDA versions above.
+include:
+  - torch: "2.0.1"
+    vision: "0.15.2"
+    audio: "2.0.2"
diff --git a/.github/configurations/torch-nccl.yml b/.github/configurations/torch-nccl.yml
new file mode 100644
index 0000000..3e9aa4f
--- /dev/null
+++ b/.github/configurations/torch-nccl.yml
@@ -0,0 +1,20 @@
+# Build matrix for the torch:nccl images.
+# Versions and hashes are quoted so they are always read as strings.
+image:
+  - cuda: "12.2.2"
+    nccl: "2.18.5-1"
+    nccl-tests-hash: "a6a61ab"
+  - cuda: "12.1.1"
+    nccl: "2.18.3-1"
+    nccl-tests-hash: "253a5b1"
+  - cuda: "12.0.1"
+    nccl: "2.18.5-1"
+    nccl-tests-hash: "a6a61ab"
+  - cuda: "11.8.0"
+    nccl: "2.16.2-1"
+    nccl-tests-hash: "a6a61ab"
+# Library versions paired with the image entries above.
+include:
+  - torch: "2.0.1"
+    vision: "0.15.2"
+    audio: "2.0.2"
diff --git a/.github/workflows/bloom.yml b/.github/workflows/bloom.yml
index 1aeaa41..8e169ff 100644
--- a/.github/workflows/bloom.yml
+++ b/.github/workflows/bloom.yml
@@ -10,7 +10,8 @@ on:
 jobs:
   build:
     uses: ./.github/workflows/build.yml
-    with: 
+    secrets: inherit
+    with:
       image-name: bloom
       folder: bloom
-      build-args: ""
\ No newline at end of file
+      build-args: ""
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 7eb4616..b2f8d5b 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -29,7 +29,7 @@ on:
 jobs:
   build:
     name: Build Images
-    runs-on: [self-hosted, Linux]
+    runs-on: [ self-hosted, Linux ]
     outputs:
       outcome: ${{ steps.docker-build.outcome }}
       tags: ${{ steps.meta.outputs.tags }}
@@ -38,12 +38,17 @@ jobs:
       - uses: actions/checkout@v3
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v2.2.1
-      - name: Login to container registry
-        uses: docker/login-action@v2.1.0
+      - name: Login to GitHub container registry
+        uses: docker/login-action@v2.2.0
         with:
           registry: ghcr.io
           username: ${{ github.actor }}
           password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Login to DockerHub container registry
+        uses: docker/login-action@v2.2.0
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
       - name: Get base registry
         run: |
           echo "REGISTRY=ghcr.io/${GITHUB_REPOSITORY,,}" >> $GITHUB_ENV
@@ -67,7 +72,8 @@ jobs:
         uses: docker/build-push-action@v3.2.0
         with:
           context: ${{ inputs.folder }}
-          build-args: ${{ inputs.build-args }}
+          build-args: |-
+            ${{ inputs.build-args }}
           push: true
           tags: ${{ steps.meta.outputs.tags }}
           labels: ${{ steps.meta.outputs.labels }}
@@ -80,10 +86,10 @@ jobs:
       - name: Comment
         if: steps.PR.outputs.number
         uses: peter-evans/create-or-update-comment@v2.1.0
-        with: 
+        with:
           issue-number: ${{ steps.PR.outputs.number }}
           body: >
-            @${{ github.triggering_actor }} Build complete, ${{ steps.docker-build.outcome }}: 
+            @${{ github.triggering_actor }} Build complete, ${{ steps.docker-build.outcome }}:
             ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
 
-            Image: `${{ fromJSON(steps.docker-build.outputs.metadata)['image.name'] }}`
\ No newline at end of file
+            Image: `${{ fromJSON(steps.docker-build.outputs.metadata)['image.name'] }}`
diff --git a/.github/workflows/cuda-ssh.yml b/.github/workflows/cuda-ssh.yml
index 892e6c3..6ba26ea 100644
--- a/.github/workflows/cuda-ssh.yml
+++ b/.github/workflows/cuda-ssh.yml
@@ -16,6 +16,7 @@ jobs:
           - ceeb8c2-nccl-cuda11.8.0-nccl2.16.2-1-torch2.0.1-vision0.15.2-audio2.0.2
 
     uses: ./.github/workflows/build.yml
+    secrets: inherit
     with:
       image-name: cuda-ssh
       folder: cuda-ssh
diff --git a/.github/workflows/gpt-neox-determined.yml b/.github/workflows/gpt-neox-determined.yml
index 2ae03f7..4e0a6f4 100644
--- a/.github/workflows/gpt-neox-determined.yml
+++ b/.github/workflows/gpt-neox-determined.yml
@@ -10,7 +10,8 @@ on:
 jobs:
   build:
     uses: ./.github/workflows/build.yml
-    with: 
+    secrets: inherit
+    with:
       image-name: gpt-neox-determined
       folder: gpt-neox-determined
-      build-args: ""
\ No newline at end of file
+      build-args: ""
diff --git a/.github/workflows/gpt-neox-mpi.yml b/.github/workflows/gpt-neox-mpi.yml
index f2c6c0c..aec2a5f 100644
--- a/.github/workflows/gpt-neox-mpi.yml
+++ b/.github/workflows/gpt-neox-mpi.yml
@@ -10,6 +10,7 @@ on:
 jobs:
   build:
     uses: ./.github/workflows/build.yml
+    secrets: inherit
     with:
       image-name: gpt-neox-mpi
       folder: gpt-neox-mpi
diff --git a/.github/workflows/read-configuration.yml b/.github/workflows/read-configuration.yml
new file mode 100644
index 0000000..12a21b3
--- /dev/null
+++ b/.github/workflows/read-configuration.yml
@@ -0,0 +1,49 @@
+name: read-configuration
+
+# Reusable workflow: converts a YAML file from the repository into
+# JSON with yq and exposes it as the `config` output.
+on:
+  workflow_call:
+    inputs:
+      path:
+        required: true
+        type: string
+      filter:
+        required: false
+        type: string
+    outputs:
+      config:
+        description: "The retrieved configuration, as JSON"
+        value: ${{ jobs.read-file.outputs.config }}
+
+jobs:
+  read-file:
+    name: Read Configuration File
+    runs-on: [ self-hosted, Linux ]
+    permissions: {}
+    outputs:
+      config: ${{ steps.read.outputs.contents }}
+    steps:
+      - uses: actions/checkout@v3
+      - name: Read configuration
+        id: read
+        env:
+          FILE_PATH: ${{ inputs.path }}
+          FILTER: ${{ inputs.filter }}
+        run: |
+          set -x
+          # The yq expression is optional; without it the file is converted as-is.
+          if [ -z "$FILTER" ]; then
+            CONTENTS="$(yq e "$FILE_PATH" -oj -I0)"
+          else
+            CONTENTS="$(yq e "$FILE_PATH" --expression "$FILTER" -oj -I0)"
+          fi
+          echo "contents=$CONTENTS" >> "$GITHUB_OUTPUT"
+
+          # Pretty-print the same JSON into the job summary.
+          {
+            echo '## Configuration'
+            echo '```json'
+            echo "$CONTENTS" | jq .
+            echo '```'
+          } >> "$GITHUB_STEP_SUMMARY"
diff --git a/.github/workflows/sd-finetuner.yml b/.github/workflows/sd-finetuner.yml
index 165a2a5..6e6203c 100644
--- a/.github/workflows/sd-finetuner.yml
+++ b/.github/workflows/sd-finetuner.yml
@@ -15,7 +15,8 @@ on:
 jobs:
   build:
     uses: ./.github/workflows/build.yml
-    with: 
+    secrets: inherit
+    with:
       image-name: sd-finetuner
       folder: sd-finetuner
-      build-args: "--build-arg COMMIT=${{ github.event.inputs.commit }}"
\ No newline at end of file
+      build-args: "COMMIT=${{ github.event.inputs.commit }}"
diff --git a/.github/workflows/sd-inference.yml b/.github/workflows/sd-inference.yml
index 06d9052..5c349cd 100644
--- a/.github/workflows/sd-inference.yml
+++ b/.github/workflows/sd-inference.yml
@@ -15,7 +15,9 @@ on:
 jobs:
   build:
     uses: ./.github/workflows/build.yml
-    with: 
+    secrets: inherit
+    with:
       image-name: sd-inference
       folder: sd-inference
-      build-args: "--build-arg COMMIT=${{ github.event.inputs.commit }}"
\ No newline at end of file
+      build-args: |
+        COMMIT=${{ github.event.inputs.commit }}
diff --git a/.github/workflows/sd-serializer.yml b/.github/workflows/sd-serializer.yml
deleted file mode 100644
index e964807..0000000
--- a/.github/workflows/sd-serializer.yml
+++ /dev/null
@@ -1,21 +0,0 @@
-on:
-  workflow_dispatch:
-    inputs:
-      commit:
-        description: 'Commit to build'
-        required: true
-        default: 'master'
-  push:
-    paths:
-      - "sd-serializer/**"
-      - ".github/workflows/sd-serializer.yml"
-      - ".github/workflows/build.yml"
-
-
-jobs:
-  build:
-    uses: ./.github/workflows/build.yml
-    with: 
-      image-name: sd-serializer
-      folder: sd-serializer
-      build-args: "--build-arg COMMIT=${{ github.event.inputs.commit }}"
\ No newline at end of file
diff --git a/.github/workflows/slurm.yml b/.github/workflows/slurm.yml
index fef7d0a..1c63a49 100644
--- a/.github/workflows/slurm.yml
+++ b/.github/workflows/slurm.yml
@@ -21,7 +21,8 @@ jobs:
             BASE_IMAGE=registry.gitlab.com/coreweave/sunk/slurmd-cw-cu117-extras:bc5a133d
 
     uses: ./.github/workflows/build.yml
-    with: 
+    secrets: inherit
+    with:
       image-name: ${{ matrix.image.name }}
       folder: ${{ matrix.image.folder }}
       build-args: ${{ matrix.image.build-args }}
diff --git a/.github/workflows/tensorizer.yml b/.github/workflows/tensorizer.yml
index 5778b9b..a9a870f 100644
--- a/.github/workflows/tensorizer.yml
+++ b/.github/workflows/tensorizer.yml
@@ -15,7 +15,8 @@ on:
 jobs:
   build:
     uses: ./.github/workflows/build.yml
-    with: 
+    secrets: inherit
+    with:
       image-name: tensorizer
       folder: tensorizer
-      build-args: "--build-arg COMMIT=${{ github.event.inputs.commit }}"
\ No newline at end of file
+      build-args: "COMMIT=${{ github.event.inputs.commit }}"
diff --git a/.github/workflows/torch-base.yml b/.github/workflows/torch-base.yml
index 64b3af9..2332664 100644
--- a/.github/workflows/torch-base.yml
+++ b/.github/workflows/torch-base.yml
@@ -1,26 +1,41 @@
+name: torch-base
+
 on:
   workflow_dispatch:
+    inputs:
+      image-name:
+        required: false
+        description: "Custom name under which to publish the resulting container"
+        type: string
+      image-tag-suffix:
+        required: false
+        description: "Custom tag suffix listing library versions under which to publish the resulting container"
+        type: string
   push:
     paths:
       - "torch/**"
+      - ".github/configurations/torch-base.yml"
       - ".github/workflows/torch-base.yml"
       - ".github/workflows/torch.yml"
       - ".github/workflows/build.yml"
 
 
 jobs:
+  get-config:
+    name: Get torch:base Config
+    uses: ./.github/workflows/read-configuration.yml
+    with:
+      path: ./.github/configurations/torch-base.yml
   build:
+    name: Build torch:base
+    needs: get-config
     strategy:
-      matrix:
-        cuda: [12.1.1, 12.0.1, 11.8.0]
-        include:
-          - torch: 2.0.1
-            vision: 0.15.2
-            audio: 2.0.2
-
+      matrix: ${{ fromJSON(needs.get-config.outputs.config) }}
     uses: ./.github/workflows/torch.yml
+    secrets: inherit
     with:
-      tag: ${{ format('base-cuda{0}-torch{1}-vision{2}-audio{3}', matrix.cuda, matrix.torch, matrix.vision, matrix.audio) }}
+      image-name: ${{ inputs.image-name }}
+      tag: ${{ format('{0}-{1}', format('base-cuda{0}', matrix.cuda), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}', matrix.torch, matrix.vision, matrix.audio)) }}
       builder-base-image: nvidia/cuda:${{ matrix.cuda }}-devel-ubuntu20.04
       base-image: nvidia/cuda:${{ matrix.cuda }}-base-ubuntu20.04
       torch-version: ${{ matrix.torch }}
diff --git a/.github/workflows/torch-extras.yml b/.github/workflows/torch-extras.yml
index bd1decf..d5b6ebc 100644
--- a/.github/workflows/torch-extras.yml
+++ b/.github/workflows/torch-extras.yml
@@ -1,3 +1,5 @@
+name: torch-extras
+
 on:
   workflow_call:
     inputs:
@@ -7,29 +9,158 @@ on:
       base-image:
         required: true
         type: string
+      image-name:
+        required: false
+        type: string
+      skip-bases-check:
+        required: false
+        type: boolean
+        default: true
 
   workflow_dispatch:
     inputs:
       tag:
-        required: true
+        required: false
         description: "Tag suffix to identify the build"
         type: string
       base-image:
-        required: true
+        required: false
         description: "Base image for the build"
         type: string
+      image-name:
+        required: false
+        description: "Custom name under which to publish the resulting container"
+        type: string
+      skip-bases-check:
+        required: false
+        description: "Build from one specific image rather than the most recent releases from the main branch"
+        type: boolean
+        default: true
+
+  push:
+    paths:
+      - "torch-extras/**"
+      - ".github/workflows/torch-extras.yml"
+      - ".github/workflows/build.yml"
 
 
 jobs:
-  build:
+  get-required-bases:
+    name: Get Latest Required Base Images
+    if: inputs.skip-bases-check != true
+    runs-on: ["self-hosted", "Linux"]
+    permissions:
+      packages: read
+    outputs:
+      bases-list: ${{ steps.choose-bases.outputs.list }}
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+      - name: Check if torch-extras needs to be rebuilt from previous bases
+        id: check-changed
+        run: |
+          # A base counts as provided when this push also touches a path from
+          # the push triggers of torch-base.yml / torch-nccl.yml, including
+          # their .github/configurations/*.yml matrix files.
+          BASE_RE='^(\./)?(torch/|\.github/(configurations/torch-base|workflows/(torch(-base)?|build))\.yml$)'
+          NCCL_RE='^(\./)?(torch/|\.github/(configurations/torch-nccl|workflows/(torch(-nccl)?|build))\.yml$)'
+          BASE_PROVIDED=false
+          NCCL_PROVIDED=false
+          # Only push events carry before/after hashes to compare.
+          if [ "$EVENT_NAME" = 'push' ]; then
+            # Abort the step if the changed-file list cannot be computed.
+            CHANGED_FILES="$(git diff --name-only "$BEFORE_HASH" "$AFTER_HASH")" || exit 1
+            if echo "$CHANGED_FILES" | grep -P "$BASE_RE" > /dev/null; then BASE_PROVIDED=true; fi
+            if echo "$CHANGED_FILES" | grep -P "$NCCL_RE" > /dev/null; then NCCL_PROVIDED=true; fi
+          fi
+          # Publish both flags as step outputs.
+          echo "BASE_PROVIDED=$BASE_PROVIDED" >> "$GITHUB_OUTPUT"
+          echo "NCCL_PROVIDED=$NCCL_PROVIDED" >> "$GITHUB_OUTPUT"
+        env:
+          EVENT_NAME: ${{ github.event_name }}
+          BEFORE_HASH: ${{ github.event.before }}
+          AFTER_HASH: ${{ github.event.after }}
+      - name: Get latest torch container releases
+        if: steps.check-changed.outputs.BASE_PROVIDED != 'true' || steps.check-changed.outputs.NCCL_PROVIDED != 'true'
+        id: get-latest
+        run: |
+          RELEASES="$( \
+            /bin/curl -f -s --oauth2-bearer "$(echo "$BEARER_TOKEN" | base64 -w 0)" \
+              'https://ghcr.io/v2/coreweave/ml-containers%2Ftorch/tags/list?n=100000' \
+            | jq -r '.["tags"][]' \
+            | grep -E '^[0-9a-f]{7}-(base|nccl)-' \
+          )" && \
+          BASE_RELEASES="$(echo "$RELEASES" | grep -E '^[0-9a-f]{7}-base-')" && \
+          NCCL_RELEASES="$(echo "$RELEASES" | grep -E '^[0-9a-f]{7}-nccl-')" && \
+          LATEST_BASE_COMMIT="$(echo "$BASE_RELEASES" | tail -1 | cut -c1-7)" && \
+          LATEST_NCCL_COMMIT="$(echo "$NCCL_RELEASES" | tail -1 | cut -c1-7)" && \
+          LATEST_BASE_IMAGES="$(echo "$BASE_RELEASES" | grep -F "${LATEST_BASE_COMMIT}-")" && \
+          LATEST_NCCL_IMAGES="$(echo "$NCCL_RELEASES" | grep -F "${LATEST_NCCL_COMMIT}-")" && \
+          echo "LATEST_BASE_IMAGES=$(echo $LATEST_BASE_IMAGES)" >> "$GITHUB_OUTPUT" && \
+          echo "LATEST_NCCL_IMAGES=$(echo $LATEST_NCCL_IMAGES)" >> "$GITHUB_OUTPUT"
+        env:
+          BEARER_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Choose which torch containers to use as a build base
+        if: steps.check-changed.outputs.BASE_PROVIDED != 'true' || steps.check-changed.outputs.NCCL_PROVIDED != 'true'
+        id: choose-bases
+        run: |
+          TAG_TO_JSON() {
+            TAG_PATTERN='^[0-9a-f]{7}-(.*)';
+            JSON_REPLACE='{"tag":"\1","image":"ghcr.io/coreweave/ml-containers/torch:\0"}';
+            sed -E -e "s@${TAG_PATTERN}@${JSON_REPLACE}@g";
+          } && \
+          SPLIT_TO_LINES() { xargs -n 1; } && \
+          JOIN_LINES() { tr '[:space:]' ',' | sed -e 's/,$//'; } && \
+          echo '## Pre-existing `ghcr.io/coreweave/ml-containers/torch` images to build from' >> "$GITHUB_STEP_SUMMARY" && \
+          echo "list=[$( \
+            ( \
+              if [ "$BASE_PROVIDED" = 'false' ]; then \
+                echo "$LATEST_BASE_IMAGES" | xargs -n 1 echo '-' >> "$GITHUB_STEP_SUMMARY" && \
+                echo "$LATEST_BASE_IMAGES"; \
+              fi && \
+              if [ "$NCCL_PROVIDED" = 'false' ]; then \
+                echo "$LATEST_NCCL_IMAGES" | xargs -n 1 echo '-' >> "$GITHUB_STEP_SUMMARY" && \
+                echo "$LATEST_NCCL_IMAGES"; \
+              fi; \
+            ) | SPLIT_TO_LINES | TAG_TO_JSON | JOIN_LINES \
+          )]" >> "$GITHUB_OUTPUT";
+        env:
+          BASE_PROVIDED: ${{ steps.check-changed.outputs.BASE_PROVIDED }}
+          NCCL_PROVIDED: ${{ steps.check-changed.outputs.NCCL_PROVIDED }}
+          LATEST_BASE_IMAGES: ${{ steps.get-latest.outputs.LATEST_BASE_IMAGES }}
+          LATEST_NCCL_IMAGES: ${{ steps.get-latest.outputs.LATEST_NCCL_IMAGES }}
+
+  build-call:
+    name: Build torch-extras via Workflow Call
+    if: inputs.skip-bases-check
     strategy:
       matrix:
         flash-attn: [ 2.0.2, 1.0.9 ]
     uses: ./.github/workflows/build.yml
+    secrets: inherit
     with:
-      image-name: torch-extras
+      image-name: ${{ inputs.image-name || 'torch-extras' }}
       folder: torch-extras
       tag-suffix: ${{ inputs.tag }}-flash_attn${{ matrix.flash-attn }}
       build-args: |
         BASE_IMAGE=${{ inputs.base-image }}
         FLASH_ATTN_VERSION=${{ matrix.flash-attn }}
+
+  build-self:
+    name: Build torch-extras via Event Trigger
+    needs: get-required-bases
+    if: needs.get-required-bases.outputs.bases-list && needs.get-required-bases.outputs.bases-list != '[]'
+    strategy:
+      matrix:
+        flash-attn: [ 2.0.2, 1.0.9 ]
+        bases: ${{ fromJSON(needs.get-required-bases.outputs.bases-list) }}
+    uses: ./.github/workflows/build.yml
+    secrets: inherit
+    with:
+      image-name: ${{ inputs.image-name || 'torch-extras' }}
+      folder: torch-extras
+      tag-suffix: ${{ matrix.bases.tag }}-flash_attn${{ matrix.flash-attn }}
+      build-args: |
+        BASE_IMAGE=${{ matrix.bases.image }}
+        FLASH_ATTN_VERSION=${{ matrix.flash-attn }}
diff --git a/.github/workflows/torch-nccl.yml b/.github/workflows/torch-nccl.yml
index 7c68903..7523db3 100644
--- a/.github/workflows/torch-nccl.yml
+++ b/.github/workflows/torch-nccl.yml
@@ -1,35 +1,49 @@
+name: torch-nccl
+
 on:
+  workflow_call:
+    inputs:
+      image-name:
+        required: false
+        type: string
+      image-tag-suffix:
+        required: false
+        type: string
   workflow_dispatch:
+    inputs:
+      image-name:
+        required: false
+        description: "Custom name under which to publish the resulting container"
+        type: string
+      image-tag-suffix:
+        required: false
+        description: "Custom tag suffix listing library versions under which to publish the resulting container"
+        type: string
   push:
     paths:
       - "torch/**"
+      - ".github/configurations/torch-nccl.yml"
       - ".github/workflows/torch-nccl.yml"
       - ".github/workflows/torch.yml"
       - ".github/workflows/build.yml"
 
 
 jobs:
+  get-config:
+    name: Get torch:nccl Config
+    uses: ./.github/workflows/read-configuration.yml
+    with:
+      path: ./.github/configurations/torch-nccl.yml
   build:
+    name: Build torch:nccl
+    needs: get-config
     strategy:
-      matrix:
-        image:
-          - cuda: 12.1.1
-            nccl: 2.18.3-1
-            nccl-tests-hash: 471f0db
-          - cuda: 12.0.1
-            nccl: 2.18.3-1
-            nccl-tests-hash: 471f0db
-          - cuda: 11.8.0
-            nccl: 2.16.2-1
-            nccl-tests-hash: 471f0db
-        include:
-          - torch: 2.0.1
-            vision: 0.15.2
-            audio: 2.0.2
-
+      matrix: ${{ fromJSON(needs.get-config.outputs.config) }}
     uses: ./.github/workflows/torch.yml
+    secrets: inherit
     with:
-      tag: ${{ format('nccl-cuda{0}-nccl{1}-torch{2}-vision{3}-audio{4}', matrix.image.cuda, matrix.image.nccl, matrix.torch, matrix.vision, matrix.audio) }}
+      image-name: ${{ inputs.image-name }}
+      tag: ${{ format('{0}-{1}', format('nccl-cuda{0}-nccl{1}', matrix.image.cuda, matrix.image.nccl), inputs.image-tag-suffix || format('torch{0}-vision{1}-audio{2}', matrix.torch, matrix.vision, matrix.audio)) }}
       builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-cudnn8-devel-ubuntu20.04-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
       base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-cudnn8-devel-ubuntu20.04-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
       torch-version: ${{ matrix.torch }}
diff --git a/.github/workflows/torch-nightly.yml b/.github/workflows/torch-nightly.yml
new file mode 100644
index 0000000..4c2539f
--- /dev/null
+++ b/.github/workflows/torch-nightly.yml
@@ -0,0 +1,137 @@
+name: torch-nightly
+
+on:
+  workflow_dispatch:
+  schedule:
+    # At 05:00 UTC (midnight EST)
+    - cron: "0 5 * * *"
+  push:
+    paths:
+      - "torch/**"
+      - ".github/configurations/torch-base.yml"
+      - ".github/configurations/torch-nccl.yml"
+      - ".github/workflows/torch-nightly.yml"
+      - ".github/workflows/torch.yml"
+      - ".github/workflows/build.yml"
+
+
+jobs:
+  get-nightly-info:
+    name: Get Nightly Info
+    runs-on: [ self-hosted, Linux ]
+    outputs:
+      pytorch-commit: ${{ steps.get-hash.outputs.pytorch-commit }}
+      triton-commit: ${{ steps.get-hash.outputs.triton-commit }}
+      torchvision-commit: ${{ steps.get-hash.outputs.torchvision-commit }}
+      torchaudio-commit: ${{ steps.get-hash.outputs.torchaudio-commit }}
+      version-string: ${{ steps.get-hash.outputs.version-string }}
+      date: ${{ steps.get-date.outputs.date }}
+    steps:
+      - name: Get latest commit hashes
+        id: get-hash
+        run: |
+          set -e;
+          # Render commit $2 of repo $1 as a Markdown link labelled with its first 7 characters.
+          FORMAT_COMMIT_LINK() {
+            printf '[`%.7s`](https://github.com/%s/tree/%s)\n' "$2" "$1" "$2";
+          };
+          # Append printf-formatted text to the job summary.
+          LOG() {
+            printf -- "$@" >> "$GITHUB_STEP_SUMMARY";
+          };
+          # Clone repo $1 into $2 (removing any stale copy first) and print its HEAD commit hash.
+          CLONE() {
+            rm -rf "$2" && \
+            git clone --filter=blob:none --no-checkout --depth=1 \
+              "https://github.com/$1" \
+              "$2" > /dev/null 2> /dev/null && \
+            local COMMIT && COMMIT="$(git -C "$2" rev-parse HEAD)" && \
+            LOG 'Latest `%s` commit: %s\n' \
+              "$1" "$(FORMAT_COMMIT_LINK "$1" "$COMMIT")" && \
+            echo "$COMMIT";
+          };
+          # Print version.txt from the HEAD commit of the clone in directory $1.
+          GET_VERSION() {
+            git -C "$1" show HEAD:version.txt 2> /dev/null;
+          };
+
+          PYTORCH_COMMIT="$(CLONE pytorch/pytorch pytorch-git)";
+          PYTORCH_VERSION="$(GET_VERSION pytorch-git)";
+          TRITON_COMMIT_FILE=".ci/docker/ci_commit_pins/triton.txt";
+          TRITON_COMMIT="$(git -C pytorch-git show "HEAD:$TRITON_COMMIT_FILE" 2> /dev/null)";
+          rm -rf pytorch-git;
+
+          LOG 'Corresponding `openai/triton` commit: %s\n' \
+            "$(FORMAT_COMMIT_LINK openai/triton "$TRITON_COMMIT")";
+
+          TORCHVISION_COMMIT="$(CLONE pytorch/vision torchvision-git)";
+          TORCHVISION_VERSION="$(GET_VERSION torchvision-git)";
+          rm -rf torchvision-git;
+
+          TORCHAUDIO_COMMIT="$(CLONE pytorch/audio torchaudio-git)";
+          TORCHAUDIO_VERSION="$(GET_VERSION torchaudio-git)";
+          rm -rf torchaudio-git;
+
+          echo "pytorch-commit=$PYTORCH_COMMIT" >> "$GITHUB_OUTPUT";
+          echo "triton-commit=$TRITON_COMMIT" >> "$GITHUB_OUTPUT";
+          echo "torchvision-commit=$TORCHVISION_COMMIT" >> "$GITHUB_OUTPUT";
+          echo "torchaudio-commit=$TORCHAUDIO_COMMIT" >> "$GITHUB_OUTPUT";
+
+          printf -- 'version-string=torch%s-vision%s-audio%s\n' \
+            "$PYTORCH_VERSION" "$TORCHVISION_VERSION" "$TORCHAUDIO_VERSION" \
+            >> "$GITHUB_OUTPUT";
+      - name: Get date
+        id: get-date
+        run: echo "date=$(date -u '+%Y.%m.%d.%H')" >> "$GITHUB_OUTPUT";
+
+  get-base-config:
+    name: Get torch:base Config
+    uses: ./.github/workflows/read-configuration.yml
+    with:
+      path: ./.github/configurations/torch-base.yml
+      filter: del(.include)
+  get-nccl-config:
+    name: Get torch:nccl Config
+    uses: ./.github/workflows/read-configuration.yml
+    with:
+      path: ./.github/configurations/torch-nccl.yml
+      filter: del(.include)
+
+  build-base:
+    name: Build Nightly torch:base
+    needs:
+      - get-nightly-info
+      - get-base-config
+    strategy:
+      matrix: ${{ fromJSON(needs.get-base-config.outputs.config) }}
+    uses: ./.github/workflows/torch.yml
+    secrets: inherit
+    with:
+      image-name: nightly-torch
+      tag: ${{ format('base-{0}-cuda{1}-{2}', needs.get-nightly-info.outputs.date, matrix.cuda, needs.get-nightly-info.outputs.version-string ) }}
+      builder-base-image: nvidia/cuda:${{ matrix.cuda }}-devel-ubuntu20.04
+      base-image: nvidia/cuda:${{ matrix.cuda }}-base-ubuntu20.04
+      torch-version: ${{ needs.get-nightly-info.outputs.pytorch-commit }}
+      torchvision-version: ${{ needs.get-nightly-info.outputs.torchvision-commit }}
+      torchaudio-version: ${{ needs.get-nightly-info.outputs.torchaudio-commit }}
+      triton-version: ${{ needs.get-nightly-info.outputs.triton-commit }}
+      build-extras: true
+  build-nccl:
+    name: Build Nightly torch:nccl
+    needs:
+      - get-nightly-info
+      - get-nccl-config
+    strategy:
+      matrix: ${{ fromJSON(needs.get-nccl-config.outputs.config) }}
+    uses: ./.github/workflows/torch.yml
+    secrets: inherit
+    with:
+      image-name: nightly-torch
+      tag: ${{ format('nccl-{0}-cuda{1}-nccl{2}-{3}', needs.get-nightly-info.outputs.date, matrix.image.cuda, matrix.image.nccl, needs.get-nightly-info.outputs.version-string ) }}
+      builder-base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-cudnn8-devel-ubuntu20.04-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
+      base-image: ghcr.io/coreweave/nccl-tests:${{ matrix.image.cuda }}-cudnn8-devel-ubuntu20.04-nccl${{ matrix.image.nccl }}-${{ matrix.image.nccl-tests-hash }}
+      torch-version: ${{ needs.get-nightly-info.outputs.pytorch-commit }}
+      torchvision-version: ${{ needs.get-nightly-info.outputs.torchvision-commit }}
+      torchaudio-version: ${{ needs.get-nightly-info.outputs.torchaudio-commit }}
+      triton-version: ${{ needs.get-nightly-info.outputs.triton-commit }}
+      build-extras: true
diff --git a/.github/workflows/torch.yml b/.github/workflows/torch.yml
index d61fd03..c3b3f30 100644
--- a/.github/workflows/torch.yml
+++ b/.github/workflows/torch.yml
@@ -19,10 +19,16 @@ on:
       torchaudio-version:
         required: true
         type: string
+      triton-version:
+        required: false
+        type: string
       cuda-arch-support:
         required: false
         type: string
         default: "7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+      image-name:
+        required: false
+        type: string
       build-extras:
         required: false
         type: boolean
@@ -54,11 +60,19 @@ on:
         required: true
         description: "Tagged version number from pytorch/audio to build"
         type: string
+      triton-version:
+        required: false
+        description: "Tagged version number from openai/triton to build"
+        type: string
       cuda-arch-support:
         required: false
         description: "Space-separated list of CUDA architectures to support"
         type: string
         default: "7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+      image-name:
+        required: false
+        description: "Custom name under which to publish the resulting container"
+        type: string
       build-extras:
         required: false
         description: "Whether to build and push a torch-extras container as well"
@@ -67,23 +81,29 @@ on:
 
 jobs:
   build:
+    name: Build torch
     uses: ./.github/workflows/build.yml
+    secrets: inherit
     with:
-      image-name: torch
+      image-name: ${{ inputs.image-name || 'torch' }}
       folder: torch
       tag-suffix: ${{ inputs.tag }}
       build-args: |
-        BUILD_CCACHE_SIZE=1Gi
+        BUILD_CCACHE_SIZE=20Gi
         BUILDER_BASE_IMAGE=${{ inputs.builder-base-image }}
         FINAL_BASE_IMAGE=${{ inputs.base-image }}
         BUILD_TORCH_VERSION=${{ inputs.torch-version }}
         BUILD_TORCH_VISION_VERSION=${{ inputs.torchvision-version }}
         BUILD_TORCH_AUDIO_VERSION=${{ inputs.torchaudio-version }}
         ${{ inputs.cuda-arch-support && format('BUILD_TORCH_CUDA_ARCH_LIST={0}', inputs.cuda-arch-support) || '' }}
+        ${{ inputs.triton-version && format('BUILD_TRITON_VERSION={0}', inputs.triton-version) || '' }}
   build-extras:
+    name: Build torch-extras
     if: inputs.build-extras
     needs: build
     uses: ./.github/workflows/torch-extras.yml
+    secrets: inherit
     with:
       tag: ${{ inputs.tag }}
       base-image: ${{ needs.build.outputs.tags }}
+      image-name: ${{ inputs.image-name && format('{0}-extras', inputs.image-name) || '' }}
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9d90afc
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,164 @@
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+*~
+
+# temporary files which can be created if a process still has a handle open of a deleted file
+.fuse_hidden*
+
+# KDE directory preferences
+.directory
+
+# Linux trash folder which might appear on any partition or disk
+.Trash-*
+
+# .nfs files are created when an open file is removed but is still being accessed
+.nfs*
+
+# Swap (comment out the `!*.svg` line below if you don't need vector files)
+[._]*.s[a-v][a-z]
+!*.svg
+[._]*.sw[a-p]
+[._]s[a-rt-v][a-z]
+[._]ss[a-gi-z]
+[._]sw[a-p]
+
+# Session
+Session.vim
+Sessionx.vim
+
+# Temporary
+.netrwhist
+*~
+# Auto-generated tag files
+tags
+# Persistent undo
+[._]*.un~
+
+# -*- mode: gitignore; -*-
+*~
+\#*\#
+/.emacs.desktop
+/.emacs.desktop.lock
+*.elc
+auto-save-list
+tramp
+.\#*
+
+# Org-mode
+.org-id-locations
+*_archive
+
+# flymake-mode
+*_flymake.*
+
+# eshell files
+/eshell/history
+/eshell/lastdir
+
+# elpa packages
+/elpa/
+
+# reftex files
+*.rel
+
+# AUCTeX auto folder
+/auto/
+
+# cask packages
+.cask/
+dist/
+
+# Flycheck
+flycheck_*.el
+
+# server auth directory
+/server/
+
+# projectiles files
+.projectile
+
+# directory configuration
+.dir-locals.el
+
+# network security
+/network-security.data
+
+# -*- mode: gitignore; -*-
+*~
+\#*\#
+/.emacs.desktop
+/.emacs.desktop.lock
+*.elc
+auto-save-list
+tramp
+.\#*
+
+# Org-mode
+.org-id-locations
+*_archive
+
+# flymake-mode
+*_flymake.*
+
+# eshell files
+/eshell/history
+/eshell/lastdir
+
+# elpa packages
+/elpa/
+
+# reftex files
+*.rel
+
+# AUCTeX auto folder
+/auto/
+
+# cask packages
+.cask/
+dist/
+
+# Flycheck
+flycheck_*.el
+
+# server auth directory
+/server/
+
+# projectiles files
+.projectile
+
+# directory configuration
+.dir-locals.el
+
+# network security
+/network-security.data
+
+# local environment files
+.env
+.env*
+.environment
+.environment*
diff --git a/README.md b/README.md
deleted file mode 100644
index 8780cfb..0000000
--- a/README.md
+++ /dev/null
@@ -1,22 +0,0 @@
-# ml-containers
-
-Repository for building ML images at CoreWeave
-
-## Organization
-This repository contains multiple container image Dockerfiles, each is expected
-to be within its own folder along with any other needed files for the build.
-
-## CI Builds (Actions)
-The current CI builds are setup to run when changes to files in the respective
-folders are detected so that only the changed container images are built.  The
-actions are setup with an action per image utilizing a reusable base action
-[build.yml](.github/workflows/build.yml).  The reusable action accepts several inputs:
-
-- `folder` - the folder containing the dockerfile for the image
-- `image-name` - the name to use for the image
-- `build-args` - arguments to pass to the docker build
-
-Images built using the same source can utilize one action as the main reason for
-the multiple actions is to handle only building the changed images.  A build
-matrix can be helpful for these cases
-https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs.
diff --git a/catalog.yaml b/catalog.yaml
new file mode 100644
index 0000000..f6433d8
--- /dev/null
+++ b/catalog.yaml
@@ -0,0 +1,20 @@
+---
+apiVersion: backstage.io/v1alpha1
+kind: Component
+metadata:
+  name: ml-containers
+  annotations:
+    backstage.io/techdocs-ref: dir:.
+  description: Optimized images for training/inference on CoreWeave infrastructure
+  tags:
+    - ml
+  # links:
+  #   - title: Deployment Manifests
+  #     url: https://github.com/coreweave/awesome-turtles/tree/main/deploy
+  #     icon: github
+  customer_impact: true
+  stateless: false
+spec:
+  type: service
+  lifecycle: production
+  owner: group:cw/team_ml
\ No newline at end of file
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 0000000..cabc166
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,104 @@
+# ml-containers
+
+Repository for building ML images at CoreWeave
+
+
+## Index
+
+See the [list of all published images](https://github.com/orgs/coreweave/packages?repo_name=ml-containers).
+
+Special PyTorch Images:
+
+- [PyTorch Base Images](#pytorch-base-images)
+- [PyTorch Extras](#pytorch-extras)
+- [PyTorch Nightly](#pytorch-nightly)
+
+### PyTorch Base Images
+
+- [`ghcr.io/coreweave/ml-containers/torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch)
+
+CoreWeave provides custom builds of
+[PyTorch](https://github.com/pytorch/pytorch),
+[`torchvision`](https://github.com/pytorch/vision)
+and [`torchaudio`](https://github.com/pytorch/audio)
+tuned for our platform in a single container image, [`ml-containers/torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch).
+
+Versions compiled against CUDA 11.8.0, 12.0.1, 12.1.1, and 12.2.2 are available in this repository, with two variants:
+
+1. `base`: Tagged as `ml-containers/torch:a1b2c3d-base-...`.
+   1. Built from [`nvidia/cuda:...-base-ubuntu20.04`](https://hub.docker.com/r/nvidia/cuda/tags?name=base-ubuntu20.04) as a base.
+   2. Only includes essentials (CUDA, `torch`, `torchvision`, `torchaudio`),
+      so it has a small image size, making it fast to launch.
+2. `nccl`: Tagged as `ml-containers/torch:a1b2c3d-nccl-...`.
+   1. Built from [`ghcr.io/coreweave/nccl-tests`](https://github.com/coreweave/nccl-tests/pkgs/container/nccl-tests) as a base.
+   2. Ultimately inherits from [`nvidia/cuda:...-cudnn8-devel-ubuntu20.04`](https://hub.docker.com/r/nvidia/cuda/tags?name=cudnn8-devel-ubuntu20.04).
+   3. Larger, but includes development libraries and build tools such as `nvcc` necessary for compiling other PyTorch extensions.
+   4. These PyTorch builds are built on component libraries optimized for the CoreWeave cloud&mdash;see
+      [`coreweave/nccl-tests`](https://github.com/coreweave/nccl-tests/blob/master/README.md).
+
+### PyTorch Extras
+
+- [`ghcr.io/coreweave/ml-containers/torch-extras`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch-extras)
+
+[`ml-containers/torch-extras`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch-extras)
+extends the [`ml-containers/torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch)
+images with a set of common PyTorch extensions:
+
+1. [DeepSpeed](https://github.com/microsoft/DeepSpeed)
+2. [FlashAttention](https://github.com/Dao-AILab/flash-attention)
+3. [NVIDIA Apex](https://github.com/NVIDIA/apex)
+
+Each one is compiled specially against the custom PyTorch builds in [`ml-containers/torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch).
+
+Both `base` and `nccl` editions are available for
+[`ml-containers/torch-extras`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch-extras)
+matching those for
+[`ml-containers/torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Ftorch).
+The `base` edition retains a small size, as a multi-stage build is used to avoid including
+CUDA development libraries in it, despite those libraries being required to build
+the extensions themselves.
+
+### PyTorch Nightly
+
+- [`ghcr.io/coreweave/ml-containers/nightly-torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Fnightly-torch)
+- [`ghcr.io/coreweave/ml-containers/nightly-torch-extras`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Fnightly-torch-extras)
+
+[`ml-containers/nightly-torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Fnightly-torch)
+is an experimental, nightly release channel of the
+[PyTorch Base Images](#pytorch-base-images) in the style of PyTorch's
+own nightly preview builds, featuring the latest development versions of
+`torch`, `torchvision`, and `torchaudio` pulled daily from GitHub
+and compiled from source.
+
+[`ml-containers/nightly-torch-extras`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Fnightly-torch-extras)
+is a version of [PyTorch Extras](#pytorch-extras) built on top of the
+[`ml-containers/nightly-torch`](https://github.com/coreweave/ml-containers/pkgs/container/ml-containers%2Fnightly-torch)
+container images.
+These are not nightly versions of the extensions themselves, but rather match
+the extension versions in the regular [PyTorch Extras](#pytorch-extras) containers.
+
+> ⚠ The *PyTorch Nightly* containers are based on unstable, experimental preview
+> builds of PyTorch, and should be expected to contain bugs and other issues.
+> For more stable containers, use the [PyTorch Base Images](#pytorch-base-images)
+> and [PyTorch Extras](#pytorch-extras) containers.
+
+
+## Organization
+This repository contains multiple container image Dockerfiles; each is expected
+to be within its own folder along with any other files needed for the build.
+
+
+## CI Builds (Actions)
+The current CI builds are set up to run when changes to files in the respective
+folders are detected so that only the changed container images are built. The
+actions are set up with an action per image utilizing a reusable base action
+[build.yml](../.github/workflows/build.yml). The reusable action accepts several inputs:
+
+- `folder` - the folder containing the Dockerfile for the image
+- `image-name` - the name to use for the image
+- `build-args` - arguments to pass to the docker build
+
+Images built from the same source can share one action, since the main reason
+for having multiple actions is to build only the images that changed. A build
+matrix can be helpful for these cases; see
+<https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs>.
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 0000000..e1564cf
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,10 @@
+site_name: ml-containers
+plugins:
+  - techdocs-core
+markdown_extensions:
+  pymdownx.extra:
+    pymdownx.superfences:
+      custom_fences:
+        - name: mermaid
+          class: mermaid
+          format: !!python/name:pymdownx.superfences.fence_code_format
\ No newline at end of file
diff --git a/sd-inference/Dockerfile b/sd-inference/Dockerfile
index e191876..ef34b07 100644
--- a/sd-inference/Dockerfile
+++ b/sd-inference/Dockerfile
@@ -1,6 +1,9 @@
-FROM gooseai/torch-base:1.13.1-cuda-1.18-rc4
+FROM ghcr.io/coreweave/ml-containers/torch:afecfe9-base-cuda12.0.1-torch2.0.0-vision0.15.1
+ENV DEBIAN_FRONTEND=noninteractive
 
-ENV tenzorizer_commit=35381e3812ba342991d30b71ce257503622ae828
+RUN apt update && apt upgrade -y && \
+    apt update && apt install -y python3 python3-pip git curl && \
+    apt clean
 
 RUN mkdir /app
 WORKDIR /app
@@ -10,15 +13,9 @@ RUN git clone https://github.com/coreweave/kubernetes-cloud && \
     cd kubernetes-cloud && \
     git checkout ${COMMIT} && \
     cd .. && \
-    cp kubernetes-cloud/online-inference/stable-diffusion/service/* .
+    cp kubernetes-cloud/online-inference/stable-diffusion/service/* .  && \
+    cp kubernetes-cloud/online-inference/stable-diffusion/serializer/serialize.py . && \
+    rm -rf kubernetes-cloud
 
-RUN git clone https://github.com/coreweave/tensorizer && \
-    cd tensorizer && \
-    git checkout ${tenzorizer_commit} && \
-    cd .. && \
-    mv tensorizer/tensorizer.py . && \
-    rm -rf tensorizer
-
-RUN pip3 install --no-cache-dir -r requirements.txt
-
-CMD [ "/usr/bin/python3", "service.py" ]
+RUN pip3 install --no-cache-dir --upgrade pip && \
+    pip3 install --no-cache-dir -r requirements.txt
diff --git a/sd-serializer/Dockerfile b/sd-serializer/Dockerfile
deleted file mode 100644
index 81e0595..0000000
--- a/sd-serializer/Dockerfile
+++ /dev/null
@@ -1,23 +0,0 @@
-FROM python:3.9
-
-RUN mkdir /app
-WORKDIR /app
-
-ENV tenzorizer_commit=35381e3812ba342991d30b71ce257503622ae828
-
-ARG COMMIT=master
-RUN git clone https://github.com/coreweave/kubernetes-cloud && \
-    cd kubernetes-cloud && \
-    git checkout ${COMMIT} && \
-    cd .. && \
-    cp kubernetes-cloud/online-inference/stable-diffusion/serializer/* . && \
-    pip3 install --no-cache-dir -r requirements.txt
-
-RUN git clone https://github.com/coreweave/tensorizer && \
-    cd tensorizer && \
-    git checkout ${tenzorizer_commit} && \
-    cd .. && \
-    mv tensorizer/tensorizer.py . && \
-    rm -rf tensorizer
-
-CMD ["python3", "/app/serialize.py"]
\ No newline at end of file
diff --git a/torch-extras/Dockerfile b/torch-extras/Dockerfile
index 06adf5b..f2ee465 100644
--- a/torch-extras/Dockerfile
+++ b/torch-extras/Dockerfile
@@ -1,10 +1,10 @@
 # syntax=docker/dockerfile:1.2
 
 ARG BASE_IMAGE
-ARG DEEPSPEED_VERSION="0.9.4"
+ARG DEEPSPEED_VERSION="0.10.3"
 ARG FLASH_ATTN_VERSION="2.0.2"
-ARG APEX_COMMIT="7b2e71b0d4013f8e2f9f1c8dd21980ff1d76f1b6"
-ARG XFORMERS_VERSION="0.0.20"
+ARG APEX_COMMIT="38a12698bc3cc95987bca270bcd6d025bb0be346"
+ARG XFORMERS_VERSION="0.0.22"
 
 FROM alpine/git:2.36.3 as flash-attn-downloader
 WORKDIR /git
@@ -44,16 +44,31 @@ RUN export \
       cuda-nvprof-${CUDA_PACKAGE_VERSION} \
       cuda-profiler-api-${CUDA_PACKAGE_VERSION} \
       libaio-dev \
-      ninja-build \
-      # gcc-10/g++-10/lld do not need to be installed here, but they improve the build.
-      # gfortran-10 is just for compiler_wrapper.f95.
-      gcc-10 g++-10 gfortran-10 lld && \
+      ninja-build && \
+    apt-get clean
+
+# Add Kitware's apt repository to get a newer version of CMake
+RUN apt-get -qq update && apt-get -qq install -y \
+      software-properties-common lsb-release && \
+    { wget -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc \
+    | gpg --dearmor -o /etc/apt/trusted.gpg.d/kitware.gpg; } && \
+    apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \
+    apt-get -qq update && apt-get -qq install -y cmake && apt-get clean
+
+# Update compiler (GCC) and linker (LLD) versions
+# gfortran-11 is just for compiler_wrapper.f95
+RUN CODENAME="$(lsb_release -cs)" && \
+    wget -qO - 'https://apt.llvm.org/llvm-snapshot.gpg.key' > /etc/apt/trusted.gpg.d/apt.llvm.org.asc && \
+    apt-add-repository "deb https://apt.llvm.org/$CODENAME/ llvm-toolchain-$CODENAME-17 main" && \
+    apt-add-repository -y ppa:ubuntu-toolchain-r/test && \
+    apt-get -qq update && apt-get -qq install --no-install-recommends -y \
+      gcc-11 g++-11 gfortran-11 lld-17 && \
     apt-get clean && \
-    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 10 && \
-    update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 10 && \
+    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 && \
+    update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11 && \
     update-alternatives --install \
-      /usr/bin/gfortran gfortran /usr/bin/gfortran-10 10 && \
-    update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld 1
+      /usr/bin/gfortran gfortran /usr/bin/gfortran-11 11 && \
+    update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld-17 1
 
 RUN mkdir /wheels /build
 WORKDIR /build
@@ -89,6 +104,12 @@ ARG DEEPSPEED_VERSION
 SHELL ["/bin/bash", "-c"]
 RUN python3 -m pip install -U --no-cache-dir \
       setuptools wheel pip && \
+    if python3 -m pip show torch | grep 'Version: 2\.[1-9]' > /dev/null; then \
+      # DeepSpeed's AIO extension is incompatible with PyTorch 2.1.x's
+      # requirement for C++17 (as of DeepSpeed 0.10.1).
+      # See: https://github.com/microsoft/DeepSpeed/pull/3976
+      export DS_BUILD_AIO='0'; \
+    fi && \
     { \
       # DeepSpeed doesn't handle blank environment variables
       # in the same way as unset ones, so clear any blank ones.
@@ -116,20 +137,6 @@ SHELL ["/bin/sh", "-c"]
 WORKDIR /wheels
 
 
-FROM builder-base as xformers-builder
-
-ARG XFORMERS_VERSION
-
-RUN python3 -m pip install -U --no-cache-dir \
-      setuptools wheel pip && \
-    CC=$(realpath -e ./compiler) \
-      MAX_JOBS=$(($(./effective_cpu_count.sh) / 2 + 1)) \
-      python3 -m pip wheel -w /wheels -v \
-      --no-cache-dir --no-build-isolation --no-deps \
-      --no-binary=xformers \
-      xformers==${XFORMERS_VERSION}
-
-
 FROM builder-base as flash-attn-builder
 
 RUN --mount=type=bind,from=flash-attn-downloader,source=/git/flash-attention,target=flash-attention/,rw \
@@ -167,18 +174,24 @@ RUN LIBNCCL2_VERSION=$(dpkg-query --showformat='${Version}' --show libnccl2) &&
       libnccl-dev=$LIBNCCL2_VERSION && \
     apt-get clean
 
+# --distributed_adam, --distributed_lamb, and --group_norm aren't documented
+# in the Apex README, but are defined in its setup.py config.
 RUN --mount=type=bind,from=apex-downloader,source=/git/apex,target=apex/,rw \
     python3 -m pip install -U --no-cache-dir \
       packaging setuptools wheel pip && \
     export CC=$(realpath -e ./compiler) && \
     export MAX_JOBS=$(($(./effective_cpu_count.sh) + 2)) && \
-    EXTENSIONS=$(printf -- '--config-settings "--build-option=%s" ' $( \
+    export NVCC_APPEND_FLAGS='-diag-suppress 186,177' && \
+    printf -- '--config-settings="--build-option=%s" ' $( \
       echo \
         --cpp_ext \
         --cuda_ext \
+        --distributed_adam \
+        --distributed_lamb \
         --permutation_search \
         --xentropy \
         --focal_loss \
+        --group_norm \
         --index_mul_2d \
         --deprecated_fused_adam \
         --deprecated_fused_lamb \
@@ -195,15 +208,28 @@ RUN --mount=type=bind,from=apex-downloader,source=/git/apex,target=apex/,rw \
           --cudnn_gbn \
           --fused_conv_bias_relu; \
       fi; \
-    )) && \
+    ) > ./apex-extensions.conf && \
+    echo "Extensions: $(cat ./apex-extensions.conf)" && \
     cd apex && \
-    python3 -m pip wheel -w /wheels -v \
-      --no-cache-dir --no-build-isolation --no-deps \
-      $EXTENSIONS ./
+    xargs -a ../apex-extensions.conf python3 -m pip wheel -w /wheels -v --no-cache-dir --no-build-isolation --no-deps ./
 
 WORKDIR /wheels
 
 
+FROM builder-base as xformers-builder
+
+ARG XFORMERS_VERSION
+
+RUN python3 -m pip install -U --no-cache-dir \
+      setuptools wheel pip && \
+    CC=$(realpath -e ./compiler) \
+      MAX_JOBS=$(($(./effective_cpu_count.sh) / 2 + 1)) \
+      python3 -m pip wheel -w /wheels -v \
+      --no-cache-dir --no-build-isolation --no-deps \
+      --no-binary=xformers \
+      xformers==${XFORMERS_VERSION}
+
+
 FROM ${BASE_IMAGE}
 
 RUN apt-get -qq update && \
diff --git a/torch/Dockerfile b/torch/Dockerfile
index 9f2f2d2..438d91f 100755
--- a/torch/Dockerfile
+++ b/torch/Dockerfile
@@ -1,57 +1,91 @@
-# syntax=docker/dockerfile:1.2
+# syntax=docker/dockerfile:1.4
 ARG BUILDER_BASE_IMAGE="nvidia/cuda:12.0.1-devel-ubuntu20.04"
 ARG FINAL_BASE_IMAGE="nvidia/cuda:12.0.1-base-ubuntu20.04"
 
 ARG BUILD_TORCH_VERSION="2.0.1"
 ARG BUILD_TORCH_VISION_VERSION="0.15.2"
 ARG BUILD_TORCH_AUDIO_VERSION="2.0.2"
+ARG BUILD_TRITON_VERSION=""
 ARG BUILD_TORCH_CUDA_ARCH_LIST="6.0 6.1 6.2 7.0 7.2 7.5 8.0 8.6 8.9 9.0+PTX"
 # 8.7 is supported in the PyTorch main branch, but not 2.0.0
 
 # Clone PyTorch repositories independently from all other build steps
 # for cache-friendliness and parallelization
-FROM alpine/git:2.36.3 as pytorch-downloader
+FROM alpine/git:2.40.1 as downloader-base
 WORKDIR /git
+RUN git config --global advice.detachedHead false
+
+COPY <<-"EOT" /git/clone.sh
+    #!/bin/sh
+    REPO="https://github.com/$1";
+    DEST="$2";
+    REF="$3";
+
+    CLONE() { git clone -j8 --depth=1 --filter=blob:none "$@"; };
+
+    # Try cloning REF as a tag prefixed with "v", otherwise fall back
+    # to git checkout for commit hashes
+    CLONE --recurse-submodules --shallow-submodules --also-filter-submodules \
+      "$REPO" -b "v$REF" "$DEST" || { \
+        CLONE --no-single-branch --no-checkout "$REPO" "$DEST" && \
+        git -C "$DEST" checkout "$REF" && \
+        git -C "$DEST" submodule update --init --filter=blob:none --depth=1 --recursive --jobs 8; \
+    };
+EOT
+
+RUN chmod 755 /git/clone.sh
+
+
+FROM downloader-base as pytorch-downloader
 ARG BUILD_TORCH_VERSION
-RUN git clone --recurse-submodules --shallow-submodules -j8 --depth 1 \
-      https://github.com/pytorch/pytorch -b v${BUILD_TORCH_VERSION} && \
+RUN ./clone.sh pytorch/pytorch pytorch "${BUILD_TORCH_VERSION}" && \
     rm -rf pytorch/.git
 
-FROM alpine/git:2.36.3 as torchvision-downloader
-WORKDIR /git
+FROM downloader-base as torchvision-downloader
 ARG BUILD_TORCH_VISION_VERSION
-RUN git clone --recurse-submodules --shallow-submodules -j8 --depth 1 \
-      https://github.com/pytorch/vision -b v${BUILD_TORCH_VISION_VERSION} && \
+RUN ./clone.sh pytorch/vision vision "${BUILD_TORCH_VISION_VERSION}" && \
     rm -rf vision/.git
 
-FROM alpine/git:2.36.3 as torchaudio-downloader
-WORKDIR /git
+FROM downloader-base as torchaudio-downloader
 ARG BUILD_TORCH_AUDIO_VERSION
-RUN git clone --recurse-submodules --shallow-submodules -j8 --depth 1 \
-      https://github.com/pytorch/audio -b v${BUILD_TORCH_AUDIO_VERSION}
+RUN ./clone.sh pytorch/audio audio "${BUILD_TORCH_AUDIO_VERSION}"
 # The torchaudio build requires that this directory remain a full git repository,
 # so no rm -rf audio/.git is done for this one.
 
+FROM downloader-base as triton-downloader
+ARG BUILD_TRITON_VERSION
+RUN if [ -n "${BUILD_TRITON_VERSION}" ]; then \
+      ./clone.sh openai/triton triton "${BUILD_TRITON_VERSION}"; \
+    else \
+      mkdir triton; \
+    fi;
+
+
 ## Build PyTorch on a builder image.
 FROM ${BUILDER_BASE_IMAGE} as builder
 ENV DEBIAN_FRONTEND=noninteractive
 
 ARG BUILD_CCACHE_SIZE="1Gi"
 
-# ninja-build, ccache, gcc-10, g++-10, and lld are optional but improve the build
+# ninja-build, ccache, and lld are optional but improve the build
 RUN apt-get -qq update && apt-get -qq install -y \
       libncurses5 python3 python3-pip git apt-utils ssh ca-certificates \
       libpng-dev libjpeg-dev pkg-config python3-distutils python3-numpy \
-      build-essential ninja-build ccache gcc-10 g++-10 lld && \
+      build-essential ninja-build && \
+    apt-get clean && \
+    /usr/bin/python3 -m pip install --no-cache-dir --upgrade pip && \
     update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \
-    update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \
-    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 10 && \
-    update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 10 && \
-    update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld 1 && \
+    update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1
+
+RUN mkdir /tmp/ccache-install && \
+    cd /tmp/ccache-install && \
+    CCACHE_URL='https://github.com/ccache/ccache/releases/download/v4.8.2/ccache-4.8.2-linux-x86_64.tar.xz' && \
+    wget -qO - $CCACHE_URL | tar --strip-components 1 -xJf - && \
+    make install && \
+    cd .. && \
+    rm -rf /tmp/ccache-install && \
     ccache -M "${BUILD_CCACHE_SIZE}" && \
-    ccache -F 0 && \
-    pip3 install --no-cache-dir --upgrade pip && \
-    apt-get clean
+    ccache -F 0
 
 # Build-time environment variables
 ENV CCACHE_DIR=/ccache \
@@ -62,23 +96,50 @@ ENV CCACHE_DIR=/ccache \
 # Add Kitware's apt repository to get a newer version of CMake
 RUN apt-get -qq update && apt-get -qq install -y \
       software-properties-common lsb-release && \
-    { wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null \
+    { wget -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc \
     | gpg --dearmor -o /etc/apt/trusted.gpg.d/kitware.gpg; } && \
     apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" && \
     apt-get -qq update && apt-get -qq install -y cmake && apt-get clean
 
+# Update compiler (GCC) and linker (LLD) versions
+RUN CODENAME="$(lsb_release -cs)" && \
+    wget -qO - 'https://apt.llvm.org/llvm-snapshot.gpg.key' > /etc/apt/trusted.gpg.d/apt.llvm.org.asc && \
+    apt-add-repository "deb https://apt.llvm.org/$CODENAME/ llvm-toolchain-$CODENAME-17 main" && \
+    apt-add-repository -y ppa:ubuntu-toolchain-r/test && \
+    apt-get -qq update && apt-get -qq install --no-install-recommends -y \
+      gcc-11 g++-11 lld-17 && \
+    apt-get clean && \
+    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 && \
+    update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11 && \
+    update-alternatives --install /usr/bin/ld ld /usr/bin/ld.lld-17 1
+
 RUN mkdir /build /build/dist
 WORKDIR /build
+COPY --chmod=755 effective_cpu_count.sh .
+
+COPY <<-"EOT" /build/version-string.sh
+    #!/bin/sh
+    set -x;
+    VERSION="$1";
+
+    IS_HASH() {
+      echo "$1" | grep -qxiEe '[0-9a-f]{40}';
+    };
+
+    if IS_HASH "$VERSION"; then
+      REAL_VERSION="$(cat ./version.txt)";
+      SHORT_HASH="$(echo "$VERSION" | cut -c1-7)";
+      echo "$REAL_VERSION+$SHORT_HASH";
+    else
+      echo "$VERSION";
+    fi;
+EOT
+RUN chmod 755 /build/version-string.sh
 
 ## Build torch
 RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch/ \
     cd pytorch && pip3 install --no-cache-dir -r requirements.txt
 
-ARG BUILD_TORCH_VERSION
-ARG BUILD_TORCH_CUDA_ARCH_LIST
-ENV TORCH_VERSION=$BUILD_TORCH_VERSION
-ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST
-
 # Build tool & library paths, shared for all libraries to be built
 ENV CMAKE_PREFIX_PATH=/usr/bin/ \
     LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64:/usr/local/lib \
@@ -86,6 +147,21 @@ ENV CMAKE_PREFIX_PATH=/usr/bin/ \
     CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda/ \
     CUDNN_LIB_DIR=/usr/local/cuda/lib64
 
+ARG BUILD_TRITON_VERSION
+RUN --mount=type=bind,from=triton-downloader,source=/git/triton,target=triton/,rw \
+    --mount=type=cache,target=/ccache \
+    if [ -n "$BUILD_TRITON_VERSION" ]; then \
+      export MAX_JOBS="$(./effective_cpu_count.sh)" && \
+      cd triton/python && \
+      python -m pip wheel -w wheels/ --no-build-isolation --no-deps -vv . && \
+      pip install wheels/*.whl; \
+    fi
+
+ARG BUILD_TORCH_VERSION
+ARG BUILD_TORCH_CUDA_ARCH_LIST
+ENV TORCH_VERSION=$BUILD_TORCH_VERSION
+ENV TORCH_CUDA_ARCH_LIST=$BUILD_TORCH_CUDA_ARCH_LIST
+
 # If the directory /opt/nccl-tests exists,
 # the base image is assumed to be nccl-tests,
 # so it uses the system's special NCCL and UCC installations for the build.
@@ -101,6 +177,7 @@ ENV CMAKE_PREFIX_PATH=/usr/bin/ \
 # remain the same.
 RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch/,rw \
     --mount=type=cache,target=/ccache \
+    export MAX_JOBS="$(./effective_cpu_count.sh)" && \
     cd pytorch && \
     mkdir build && \
     ln -s /usr/bin/cc build/cc && \
@@ -123,7 +200,7 @@ RUN --mount=type=bind,from=pytorch-downloader,source=/git/pytorch,target=pytorch
     CXX=c++ \
     USE_EIGEN_FOR_BLAS=ON \
     USE_MKL=OFF \
-    PYTORCH_BUILD_VERSION="${TORCH_VERSION}" \
+    PYTORCH_BUILD_VERSION="$(../version-string.sh "$TORCH_VERSION")" \
     PYTORCH_BUILD_NUMBER=0 \
     TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
     python3 setup.py bdist_wheel --dist-dir ../dist
@@ -136,6 +213,7 @@ RUN pip3 install --no-cache-dir --upgrade \
     matplotlib numpy typing_extensions requests pillow
 
 RUN --mount=type=bind,from=torchvision-downloader,source=/git/vision,target=vision/,rw \
+    export MAX_JOBS="$(./effective_cpu_count.sh)" && \
     cd vision && \
     mkdir build && \
     ln -s /usr/bin/cc build/cc && \
@@ -159,7 +237,7 @@ RUN --mount=type=bind,from=torchvision-downloader,source=/git/vision,target=visi
     CXX=c++ \
     USE_EIGEN_FOR_BLAS=ON \
     USE_MKL=OFF \
-    BUILD_VERSION="${TORCH_VISION_VERSION}" \
+    BUILD_VERSION="$(../version-string.sh "$TORCH_VISION_VERSION")" \
     TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
     python3 setup.py bdist_wheel --dist-dir ../dist
 
@@ -170,6 +248,7 @@ RUN pip3 install --no-cache-dir --upgrade \
     matplotlib numpy typing_extensions requests pillow
 
 RUN --mount=type=bind,from=torchaudio-downloader,source=/git/audio,target=audio/,rw \
+    export MAX_JOBS="$(./effective_cpu_count.sh)" && \
     cd audio && \
     mkdir build && \
     ln -s /usr/bin/cc build/cc && \
@@ -193,7 +272,7 @@ RUN --mount=type=bind,from=torchaudio-downloader,source=/git/audio,target=audio/
     CXX=c++ \
     USE_EIGEN_FOR_BLAS=ON \
     USE_MKL=OFF \
-    BUILD_VERSION="${TORCH_AUDIO_VERSION}" \
+    BUILD_VERSION="$(../version-string.sh "$TORCH_AUDIO_VERSION")" \
     TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \
     python3 setup.py bdist_wheel --dist-dir ../dist
 
@@ -205,11 +284,19 @@ ENV DEBIAN_FRONTEND=noninteractive
 # Install core packages
 RUN apt-get -qq update && apt-get -qq install -y \
       libncurses5 python3 python3-pip python3-distutils python3-numpy \
-      curl git apt-utils ssh ca-certificates tmux nano vim sudo bash rsync \
-      htop wget unzip tini && \
+      libpng16-16 libjpeg-turbo8 \
+      curl git apt-utils ssh ca-certificates tmux nano vim-tiny sudo bash \
+      rsync htop wget unzip tini && \
+    /usr/bin/python3 -m pip install --no-cache-dir --upgrade pip && \
     update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \
     update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \
-    pip3 install --no-cache-dir --upgrade pip && \
+    update-alternatives --install /usr/bin/vim vim /usr/bin/vim.tiny 1 && \
+    apt-get clean
+
+RUN apt-get -qq update && apt-get -qq install -y --no-install-recommends \
+        software-properties-common && \
+    apt-add-repository -y ppa:ubuntu-toolchain-r/test && \
+    apt-get -qq install -y --no-install-recommends libstdc++6 && \
     apt-get clean
 
 ARG BUILD_TORCH_VERSION
@@ -238,13 +325,15 @@ RUN export \
       libcusparse-${CUDA_PACKAGE_VERSION} \
       libcusolver-${CUDA_PACKAGE_VERSION} \
       cuda-cupti-${CUDA_PACKAGE_VERSION} \
+      libnvjpeg-${CUDA_PACKAGE_VERSION} \
       libnvtoolsext1 && \
     { if [ $CUDA_MAJOR_VERSION -ge 12 ]; then \
       apt-get -qq install --no-upgrade -y libnvjitlink-${CUDA_PACKAGE_VERSION}; fi; } && \
     { if [ ! -d /opt/nccl-tests ]; then \
       export NCCL_PACKAGE_VERSION="2.*+cuda${CUDA_MAJOR_VERSION}.${CUDA_MINOR_VERSION}" && \
       apt-get -qq install --no-upgrade -y "libnccl2=$NCCL_PACKAGE_VERSION"; fi; } && \
-    apt-get clean
+    apt-get clean && \
+    ldconfig
 
 WORKDIR /usr/src/app
 
diff --git a/torch/effective_cpu_count.sh b/torch/effective_cpu_count.sh
new file mode 100755
index 0000000..029ecbc
--- /dev/null
+++ b/torch/effective_cpu_count.sh
@@ -0,0 +1,32 @@
+#!/bin/sh
+# Print the usable CPU count: the cgroup CPU quota if one is set, else the online CPU count.
+CPU_QUOTA() (  # prints the quota in whole CPUs (at least 1); returns 1 if no quota is readable
+    CGROUP='/sys/fs/cgroup';
+    CGROUP_V1="$CGROUP/cpu,cpuacct";
+    CGROUP_V1_QUOTA="$CGROUP_V1/cpu.cfs_quota_us";
+    CGROUP_V1_PERIOD="$CGROUP_V1/cpu.cfs_period_us";
+    CGROUP_V2="$CGROUP/user.slice/cpu.max";
+    if [ ! -d "$CGROUP" ]; then
+        return 1;
+    elif [ -f "$CGROUP_V1_QUOTA" ] && [ -f "$CGROUP_V1_PERIOD" ]; then
+        IFS='' read -r QUOTA 2> /dev/null < "$CGROUP_V1_QUOTA" || return 1;
+        IFS='' read -r PERIOD 2> /dev/null < "$CGROUP_V1_PERIOD" || return 1;
+    elif [ -f "$CGROUP_V2" ]; then
+        IFS=' ' read -r QUOTA PERIOD 2> /dev/null < "$CGROUP_V2" || return 1;
+    else
+        return 1;
+    fi;
+    # Non-numeric (e.g. "max") or non-positive values fail the tests below: treated as no quota.
+    if [ "$QUOTA" -gt 0 ] 2> /dev/null && [ "$PERIOD" -gt 0 ] 2> /dev/null; then
+        echo $((QUOTA < PERIOD ? 1 : QUOTA / PERIOD));  # round down, but never report 0 CPUs
+        return 0;
+    else
+        return 1;
+    fi;
+)
+# Fall back to the number of online processors when no CPU quota could be determined.
+EFFECTIVE_CPU_COUNT() {
+    CPU_QUOTA || getconf _NPROCESSORS_ONLN;
+}
+
+EFFECTIVE_CPU_COUNT;