[do not merge] sccache test #170

Closed
wants to merge 77 commits into from
Changes from all commits (77 commits)
dbf639e
chore: add fork OWNERS
z103cb Apr 30, 2024
a24f03d
add ubi Dockerfile
dtrifiro May 21, 2024
753f948
Dockerfile.ubi: remove references to grpc/protos
dtrifiro May 21, 2024
84eb826
Dockerfile.ubi: use vllm-tgis-adapter
dtrifiro May 28, 2024
577cb43
gha: add sync workflow
dtrifiro Jun 3, 2024
e3be40e
Dockerfile.ubi: use distributed-executor-backend=mp as default
dtrifiro Jun 10, 2024
6844cb9
Dockerfile.ubi: remove vllm-nccl workaround
dtrifiro Jun 13, 2024
cbffbf1
Dockerfile.ubi: add missing requirements-*.txt bind mounts
dtrifiro Jun 18, 2024
ae4ac8d
add triton CustomCacheManger
tdoublep May 29, 2024
bd5ea86
gha: sync-with-upstream workflow create PRs as draft
dtrifiro Jun 19, 2024
57b2cbf
add smoke/unit tests scripts
dtrifiro Jun 19, 2024
5b1ada5
extras: exit unit tests on err
dtrifiro Jun 20, 2024
3a3dae7
Dockerfile.ubi: misc improvements
dtrifiro May 28, 2024
f8e70cd
update OWNERS
dtrifiro Jun 21, 2024
57fdb00
Dockerfile.ubi: use tensorizer (#64)
prashantgupta24 Jun 25, 2024
586c30c
Dockerfile.ubi: pin vllm-tgis-adapter to 0.1.2
dtrifiro Jun 26, 2024
f0dc391
gha: fix fetch step in upstream sync workflow
dtrifiro Jul 2, 2024
cfded1b
gha: always update sync workflow PR body/title
dtrifiro Jul 2, 2024
5493671
Dockerfile.ubi: bump vllm-tgis-adapter to 0.1.3
dtrifiro Jul 3, 2024
eb82bd4
Dockerfile.ubi: get rid of --distributed-executor-backend=mp
dtrifiro Jul 10, 2024
a4bb48b
Dockerfile.ubi: add flashinfer
dtrifiro Jul 9, 2024
8dcca5c
pin adapter to 2.0.0
prashantgupta24 Jul 12, 2024
48aa285
deps: bump flashinfer to 0.0.9
dtrifiro Jul 15, 2024
8955a3b
Update OWNERS with IBM folks
heyselbi Jun 27, 2024
c66756a
Dockerfile.ubi: bind mount .git dir to allow inclusion of git commit …
dtrifiro Jul 17, 2024
9d29a26
gha: remove reminder_comment
dtrifiro Jul 17, 2024
24a9763
Dockerfile: bump vllm-tgis-adapter to 0.2.1
dtrifiro Jul 18, 2024
2e83b96
fix: update setup.py to differentiate between fork and upstream
nathan-weinberg Jul 18, 2024
6cb0961
Dockerfile.ubi: properly mount .git dir
dtrifiro Jul 19, 2024
a6ee52d
Revert "[CI/Build] fix: update setup.py to differentiate between fork…
dtrifiro Jul 19, 2024
6ada55d
Dockerfile.ubi: bump vllm-tgis-adapter to 0.2.2
dtrifiro Jul 19, 2024
155d16f
gha: remove unused upstream workflows
dtrifiro Jul 23, 2024
0835e4d
deps: bump vllm-tgis-adapter to 0.2.3
dtrifiro Jul 24, 2024
d40e335
Dockerfile.ubi: get rid of custom cache manager
dtrifiro Jul 24, 2024
2f92fe8
Dockerfile.ubi: add missing dependency
dtrifiro Aug 6, 2024
e061bb7
deps: bump vllm-tgis-adapter to 0.3.0
dtrifiro Jul 24, 2024
1f786f1
Dockerfile.ubi: force using python-installed cuda runtime libraries
dtrifiro Aug 12, 2024
7b71e0c
Dockerfile: use uv pip everywhere (it's faster)
dtrifiro Aug 12, 2024
ecb1c9d
Dockerfile.ubi: bump flashinfer to 0.1.2
dtrifiro Aug 5, 2024
a5c68a0
feat: allow long max seq length
tjohnson31415 Aug 8, 2024
367809a
smoke test: kill server on timeout
dtrifiro Aug 13, 2024
fcd3419
Dockerfile.ubi: set vllm_tgis_adapter unicorn log level to warning
dtrifiro Aug 13, 2024
35a9167
fix: enable logprobs during spec decoding by default
tjohnson31415 Aug 20, 2024
0df042d
deps: bump vllm-tgis-adapter to 0.4.0 (#132)
vaibhavjainwiz Aug 21, 2024
5a14601
Disable usage tracking
stevegrubb Aug 29, 2024
91520a7
Start by updating the image
stevegrubb Sep 4, 2024
7c76447
Update ROCm build for UBI
Xaenalt Sep 3, 2024
05dd581
Add sample chat template into vLLM container
vaibhavjainwiz Sep 10, 2024
c540965
Harden build of libsodium
stevegrubb Aug 27, 2024
d02a789
Update Dockerfile.ubi
RH-steve-grubb Sep 4, 2024
1491b9e
Update OWNERS file
vaibhavjainwiz Sep 16, 2024
fed9b25
Merge pull request #160 from vaibhavjainwiz/update_OWNER
vaibhavjainwiz Sep 16, 2024
e30866f
[CI/Build] Enable InternVL2 PP test only on single node (#8437)
Isotr0py Sep 13, 2024
616f5ad
[doc] recommend pip instead of conda (#8446)
youkaichao Sep 13, 2024
8b1f881
[Misc] Skip loading extra bias for Qwen2-VL GPTQ-Int8 (#8442)
jeejeelee Sep 13, 2024
07afc6d
[misc][ci] fix quant test (#8449)
youkaichao Sep 13, 2024
2c657ec
[Installation] Gate FastAPI version for Python 3.8 (#8456)
DarkLight1337 Sep 13, 2024
f8d2bf0
[plugin][torch.compile] allow to add custom compile backend (#8445)
youkaichao Sep 13, 2024
754dc0f
[CI/Build] Reorganize models tests (#7820)
DarkLight1337 Sep 13, 2024
bf7e710
[Doc] Add oneDNN installation to CPU backend documentation (#8467)
Isotr0py Sep 13, 2024
8d32eaf
[HotFix] Fix final output truncation with stop string + streaming (#8…
njhill Sep 13, 2024
1ed711a
bump version to v0.6.1.post2 (#8473)
simon-mo Sep 13, 2024
93c04f3
Dockerfile.rocm.ubi: cleanup
dtrifiro Sep 6, 2024
66984d4
add vllm-tgis-adapter layer
dtrifiro Sep 11, 2024
f5387d0
Dockerfile.ubi: bump python to 3.12
dtrifiro Sep 12, 2024
b3abd3a
Dockerfile.ubi: bump flashinfer to 0.1.6
dtrifiro Sep 12, 2024
9f85dae
Dockerfile.rocm.ubi: do not use nightly pytorch_triton
dtrifiro Sep 16, 2024
083f0d5
Dockerfile.ubi: fix PYTHON_VERSION arg usage
dtrifiro Sep 17, 2024
c29c9f4
Dockerfile.rocm.ubi: move microdnf update in base stage
dtrifiro Sep 25, 2024
69ac6c1
Dockerfile.rocm.ubi: bump torch version to 2.5.0.dev20240912+rocm6.1
dtrifiro Sep 25, 2024
399c114
Dockerfile.rocm.ubi: get rid of build triton stage
dtrifiro Sep 25, 2024
9ebf28d
Merge pull request #167 from dtrifiro/fix-amd-build
RH-steve-grubb Sep 25, 2024
18d7da7
Sync with upstream @ v0.6.2
dtrifiro Sep 26, 2024
56fdd53
Dockerfile.rocm.ubi: add setuptools-scm build dependency
dtrifiro Sep 26, 2024
d151278
Dockerfile.ubi: add VLLM_FA_CMAKE_GPU_ARCHES
dtrifiro Sep 26, 2024
42a5a5b
:alembic: try public sccache
joerunde Sep 26, 2024
8abfa38
:fire: remove sudo
joerunde Sep 26, 2024
21 changes: 0 additions & 21 deletions .github/workflows/add_label_automerge.yml

This file was deleted.

21 changes: 0 additions & 21 deletions .github/workflows/reminder_comment.yml

This file was deleted.

84 changes: 84 additions & 0 deletions .github/workflows/sync-with-upstream.yml
@@ -0,0 +1,84 @@
name: "Sync with upstream"

on:
schedule:
- cron: 20 4 * * *

workflow_dispatch:


env:
# repo to fetch changes from
UPSTREAM_REPO: vllm-project/vllm
# branch to sync
BRANCH: main

jobs:
upstream-sync:
name: Sync with upstream
runs-on: ubuntu-latest
permissions:
pull-requests: write
contents: write

steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Fetch upstream repo
run: |
git remote add upstream https://github.com/${UPSTREAM_REPO}
git fetch upstream

- name: Check diff
id: diff
shell: bash
run: |
echo 'diff<<EOF' >> $GITHUB_OUTPUT
git diff --stat upstream/${BRANCH} | tee -a >(cat >> $GITHUB_OUTPUT)
echo 'EOF' >> $GITHUB_OUTPUT

- name: Create PR
if: ${{ steps.diff.outputs.diff != '' }}
env:
GH_TOKEN: ${{ github.token }}
run: |
set -xeu

git_hash="$(git rev-parse upstream/${BRANCH})"
echo "git_hash=$git_hash" >> $GITHUB_OUTPUT
git_describe="$(git describe --tags upstream/${BRANCH})"
echo "git_describe=$git_describe" >> $GITHUB_OUTPUT

# echo 'commits<<EOF' >> $GITHUB_OUTPUT
# git log --oneline ..upstream/${BRANCH} >> $GITHUB_OUTPUT
# echo 'EOF' >> $GITHUB_OUTPUT

upstream_url="https://github.com/${UPSTREAM_REPO}"
upstream_branch="$upstream_url/tree/${BRANCH}"

title="Sync with upstream@${git_describe}"
body="Merge [${UPSTREAM_REPO}]($upstream_url):[${BRANCH}]($upstream_branch)@[${git_describe}](${upstream_url}/commit/$git_hash) into $BRANCH"

gh repo set-default $GITHUB_REPOSITORY
pr_number=$(gh pr list -S "Sync with upstream@" --json number --jq '.[0].number')

if [[ -z $pr_number ]]; then
echo "Creating PR"
gh pr create \
--head $(echo $UPSTREAM_REPO | sed 's|/|:|g'):${BRANCH} \
--base ${BRANCH} \
--label code-sync \
--title "$title" \
--body "$body" \
--draft \
--no-maintainer-edit
exit 0
fi

echo "Updating PR \#${pr_number}"
gh pr edit \
$pr_number \
--body "$body" \
--title "$title"
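The workflow_dispatch trigger above means the nightly sync can also be started by hand. A minimal sketch of doing so with the gh CLI (assuming it is authenticated against this fork; the workflow file name is taken from the diff header above):

    # start a sync run manually instead of waiting for the 04:20 UTC cron
    gh workflow run sync-with-upstream.yml --ref main
    # confirm the run was queued
    gh run list --workflow sync-with-upstream.yml --limit 1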
227 changes: 227 additions & 0 deletions Dockerfile.rocm.ubi
@@ -0,0 +1,227 @@
## Global Args #################################################################
ARG BASE_UBI_IMAGE_TAG=9.4
ARG PYTHON_VERSION=3.12
# Default ROCm ARCHes to build vLLM for.
ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
ARG MAX_JOBS=12

FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base

ARG PYTHON_VERSION

ENV VIRTUAL_ENV=/opt/vllm
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

RUN --mount=type=cache,target=/root/.cache/pip \
    microdnf -y update && \
    microdnf install -y --setopt=install_weak_deps=0 --nodocs \
        python${PYTHON_VERSION}-devel \
        python${PYTHON_VERSION}-pip \
        python${PYTHON_VERSION}-wheel && \
    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && \
    pip install -U pip wheel setuptools uv


FROM base AS rocm_base
ENV ROCM_VERSION=6.1.2

RUN printf "[amdgpu]\n\
name=amdgpu\n\
baseurl=https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/9.4/main/x86_64/\n\
enabled=1\n\
priority=50\n\
gpgcheck=1\n\
gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key\n\
[ROCm-${ROCM_VERSION}]\n\
name=ROCm${ROCM_VERSION}\n\
baseurl=https://repo.radeon.com/rocm/rhel9/${ROCM_VERSION}/main\n\
enabled=1\n\
priority=50\n\
gpgcheck=1\n\
gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key" > /etc/yum.repos.d/amdgpu.repo


RUN microdnf -y install \
        rocm-hip-libraries rocm-hip-runtime \
        miopen-hip && \
    microdnf clean all

RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=cache,target=/root/.cache/uv \
    uv pip install -v --index-url "https://download.pytorch.org/whl/nightly/rocm6.1" \
        torch==2.5.0.dev20240912+rocm6.1 \
        torchvision==0.20.0.dev20240912+rocm6.1

FROM rocm_base AS rocm_devel

ENV CCACHE_DIR=/root/.cache/ccache

RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
    rpm -ql epel-release && \
    microdnf -y update && \
    microdnf -y install \
        ccache \
        git \
        rocm \
        hipcc \
        wget \
        which && \
    microdnf clean all

WORKDIR /workspace

ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin
ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include:/opt/rocm/include


FROM rocm_devel AS build_amdsmi

# Build AMD SMI wheel
RUN cd /opt/rocm/share/amd_smi && \
    python3 -m pip wheel . --wheel-dir=/install

##################################################################################################

FROM rocm_devel AS build_flashattention

# Whether to install CK-based flash-attention
ARG BUILD_FA="1"
ARG TRY_FA_WHEEL="1"
# Note: the ROCm fork provides a wheel built for ROCm, but only for flash-attention 2.5.9 on
# Python 3.9, so it is incompatible with the current (Python 3.12) build
ARG FA_WHEEL_URL="https://github.com/ROCm/flash-attention/releases/download/v2.5.9post1-cktile-vllm/flash_attn-2.5.9.post1-cp39-cp39-linux_x86_64.whl"
# only required when not using the triton backend
ARG FA_GFX_ARCHS="gfx90a;gfx942"
ARG FLASH_ATTENTION_USE_TRITON_ROCM="TRUE"
# FA_BRANCH is the main_perf branch as of Sep 4 2024, which includes triton backend support,
# see https://github.com/Dao-AILab/flash-attention/pull/1203
ARG FA_BRANCH="75b5360"
ARG MAX_JOBS
ENV MAX_JOBS=${MAX_JOBS}
ENV FLASH_ATTENTION_USE_TRITON_ROCM=${FLASH_ATTENTION_USE_TRITON_ROCM}

# Build ROCm flash-attention wheel if `BUILD_FA` is set to 1
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=cache,target=/workspace/build \
    if [ "$BUILD_FA" = "1" ]; then \
        if [ "$TRY_FA_WHEEL" = "1" ] && python3 -m pip install "${FA_WHEEL_URL}"; then \
            # If a suitable wheel exists, download it instead of building FA
            mkdir -p /install && wget -N "${FA_WHEEL_URL}" -P /install; \
        else \
            mkdir -p /libs && \
            cd /libs && \
            git clone https://github.com/ROCm/flash-attention.git && \
            cd flash-attention && \
            git checkout ${FA_BRANCH} && \
            git submodule update --init && \
            uv pip install cmake ninja packaging && \
            env \
                GPU_ARCHS="${FA_GFX_ARCHS}" \
                BUILD_TARGET="rocm" \
                python3 setup.py bdist_wheel --dist-dir=/install; \
        fi; \
    else \
        # Create an empty directory, as later build stages expect one
        mkdir -p /install; \
    fi

##################################################################################################

FROM rocm_devel AS build_vllm
ARG PYTORCH_ROCM_ARCH
ARG MAX_JOBS
ENV MAX_JOBS=${MAX_JOBS}
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}


COPY . .


ENV VLLM_TARGET_DEVICE="rocm"
ENV MAX_JOBS=${MAX_JOBS}
# Make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1

RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/pip \
    --mount=type=cache,target=/root/.cache/uv \
    uv pip install -v -U \
        ninja "setuptools-scm>=8" "cmake>=3.26" packaging && \
    python3 setup.py bdist_wheel --dist-dir=dist

##################################################################################################

FROM rocm_base AS vllm-openai
ARG MAX_JOBS

WORKDIR /workspace

ENV VIRTUAL_ENV=/opt/vllm
ENV PATH=$VIRTUAL_ENV/bin:$PATH

# Required for triton
RUN microdnf install -y --setopt=install_weak_deps=0 --nodocs gcc && \
    microdnf clean all

RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install/amdsmi/ \
    --mount=type=bind,from=build_flashattention,src=/install,target=/install/flashattention \
    --mount=type=bind,from=build_vllm,src=/workspace/dist,target=/install/vllm/ \
    --mount=type=cache,target=/root/.cache/pip \
    --mount=type=cache,target=/root/.cache/uv \
    uv pip install -v \
        --index-strategy=unsafe-best-match \
        --extra-index-url "https://download.pytorch.org/whl/nightly/rocm6.1" \
        /install/amdsmi/*.whl \
        /install/flashattention/*.whl \
        /install/vllm/*.whl

# Set up a non-root user for OpenShift
RUN umask 002 && \
    useradd --uid 2000 --gid 0 vllm && \
    mkdir -p /licenses && \
    chmod g+rwx $HOME /usr/src /workspace

COPY LICENSE /licenses/vllm.md
COPY examples/*.jinja /app/data/template/

ENV HF_HUB_OFFLINE=1 \
    PORT=8000 \
    HOME=/home/vllm \
    # Allow requested max length to exceed what is extracted from the
    # config.json
    # see: https://github.com/vllm-project/vllm/pull/7080
    VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
    VLLM_USAGE_SOURCE=production-docker-image \
    VLLM_WORKER_MULTIPROC_METHOD=fork \
    VLLM_NO_USAGE_STATS=1 \
    # Silences the HF Tokenizers warning
    TOKENIZERS_PARALLELISM=false \
    RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 \
    FLASH_ATTENTION_USE_TRITON_ROCM="TRUE" \
    OUTLINES_CACHE_DIR=/tmp/outlines \
    NUMBA_CACHE_DIR=/tmp/numba \
    TRITON_CACHE_DIR=/tmp/triton

# Switch to the non-root user
USER 2000

# Set the entrypoint
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
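# Illustrative only (not part of this PR): a typical ROCm invocation of this image
# passes the KFD/DRI devices through to the container, e.g.
#   docker run --device /dev/kfd --device /dev/dri -p 8000:8000 <image> --model <model-id>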


FROM vllm-openai AS vllm-grpc-adapter

USER root

RUN --mount=type=cache,target=/root/.cache/pip \
    pip install vllm-tgis-adapter==0.4.0

ENV GRPC_PORT=8033 \
    PORT=8000 \
    # As an optimization, vLLM disables logprobs when using spec decoding by
    # default, but this would be unexpected to users of a hosted model that
    # happens to have spec decoding
    # see: https://github.com/vllm-project/vllm/pull/6485
    DISABLE_LOGPROBS_DURING_SPEC_DECODING=false

USER 2000
ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter", "--uvicorn-log-level=warning"]
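For reference, a sketch of how the final stage of this Dockerfile might be built. The arch list, job count, and image tag below are illustrative placeholders, not values taken from this PR:

    # build the gRPC adapter image (the final stage of Dockerfile.rocm.ubi)
    docker build -f Dockerfile.rocm.ubi \
        --target vllm-grpc-adapter \
        --build-arg PYTORCH_ROCM_ARCH="gfx90a;gfx942" \
        --build-arg MAX_JOBS=8 \
        -t vllm-rocm-ubi:dev .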