@@ -31,61 +31,29 @@ COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-dev.txt

-# cuda arch list used by torch
-# can be useful for both `dev` and `test`
-# explicitly set the list to avoid issues with torch 2.2
-# see https://github.com/pytorch/pytorch/pull/123243
-ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
-ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
#################### BASE BUILD IMAGE ####################

-
#################### WHEEL BUILD IMAGE ####################
FROM dev AS build

-# install build dependencies
-COPY requirements-build.txt requirements-build.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-build.txt
-
# install compiler cache to speed up compilation leveraging local or remote caching
RUN apt-get update -y && apt-get install -y ccache

-# files and directories related to build wheels
-COPY csrc csrc
-COPY setup.py setup.py
-COPY cmake cmake
-COPY CMakeLists.txt CMakeLists.txt
-COPY requirements-common.txt requirements-common.txt
-COPY requirements-cuda.txt requirements-cuda.txt
-COPY pyproject.toml pyproject.toml
-COPY vllm vllm
-
-# max jobs used by Ninja to build extensions
-ARG max_jobs=2
-ENV MAX_JOBS=${max_jobs}
-# number of threads used by nvcc
-ARG nvcc_threads=8
-ENV NVCC_THREADS=$nvcc_threads
-# make sure punica kernels are built (for LoRA)
-ENV VLLM_INSTALL_PUNICA_KERNELS=1
-
-ENV CCACHE_DIR=/root/.cache/ccache
-RUN --mount=type=cache,target=/root/.cache/ccache \
-    --mount=type=cache,target=/root/.cache/pip \
-    python3 setup.py bdist_wheel --dist-dir=dist
+#################### EXTENSION Build IMAGE ####################

-# check the size of the wheel, we cannot upload wheels larger than 100MB
-COPY .buildkite/check-wheel-size.py check-wheel-size.py
-RUN python3 check-wheel-size.py dist
+#################### FLASH_ATTENTION Build IMAGE ####################
+FROM dev as flash-attn-builder
+# flash attention version
+ARG flash_attn_version=v2.5.8
+ENV FLASH_ATTN_VERSION=${flash_attn_version}

-# the `vllm_nccl` package must be installed from source distribution
-# pip is too smart to store a wheel in the cache, and other CI jobs
-# will directly use the wheel from the cache, which is not what we want.
-# we need to remove it manually
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip cache remove vllm_nccl*
-#################### EXTENSION Build IMAGE ####################
+WORKDIR /usr/src/flash-attention-v2
+
+# Download the wheel or build it if a pre-compiled release doesn't exist
+RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
+    --no-build-isolation --no-deps --no-cache-dir
+
+#################### FLASH_ATTENTION Build IMAGE ####################

#################### vLLM installation IMAGE ####################
# image with vLLM installed
@@ -101,28 +69,43 @@ RUN apt-get update -y \
# or future versions of triton.
RUN ldconfig /usr/local/cuda-12.4/compat/

-# UPSTREAM SYNC: Install sparsity extras
-RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
+# install nm-vllm wheel first, so that torch etc will be installed
+ARG build_type="nightly"
+ARG build_version="latest"
+ENV INSTALL_TYPE=${build_type}
+ENV INSTALL_VERSION=${build_version}
+# UPSTREAM SYNC: Install nm-vllm with sparsity extras
+# use nm pypi for now for testing
+RUN --mount=type=bind,from=build \
    --mount=type=cache,target=/root/.cache/pip \
-    pip install nm-magic-wand-nightly --extra-index-url https://pypi.neuralmagic.com/simple
-
-# install vllm wheel first, so that torch etc will be installed
-RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
+    if [ "${INSTALL_TYPE}" = "nightly" ]; then \
+        if [ "${INSTALL_VERSION}" = "latest" ]; then \
+            pip install nm-vllm-nightly[sparse] --extra-index-url https://pypi.neuralmagic.com/simple; \
+        else \
+            pip install nm-vllm-nightly[sparse]==${INSTALL_VERSION} --extra-index-url https://pypi.neuralmagic.com/simple; \
+        fi; \
+    else \
+        if [ "${INSTALL_VERSION}" = "latest" ]; then \
+            pip install nm-vllm[sparse] --extra-index-url https://pypi.neuralmagic.com/simple; \
+        else \
+            pip install nm-vllm[sparse]==${INSTALL_VERSION} --extra-index-url https://pypi.neuralmagic.com/simple; \
+        fi; \
+    fi
+
+RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
    --mount=type=cache,target=/root/.cache/pip \
-    pip install dist/*.whl --verbose
+    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir

#################### vLLM installation IMAGE ####################

-
#################### TEST IMAGE ####################
# image to run unit testing suite
# note that this uses vllm installed by `pip`
FROM vllm-base AS test

ADD . /vllm-workspace/

-# install development dependencies (for testing)
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-dev.txt
+# check installed version
+RUN pip freeze | grep -e nm-vllm -e nm-magic-wand

# doc requires source code
# we hide them inside `test_docs/` , so that this source code
@@ -144,4 +127,4 @@ RUN --mount=type=cache,target=/root/.cache/pip \
ENV VLLM_USAGE_SOURCE production-docker-image

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
-#################### OPENAI API SERVER ####################
+#################### OPENAI API SERVER ####################
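Note: the build arguments introduced in this diff (build_type, build_version, and flash_attn_version) can be overridden at image build time with --build-arg. A hypothetical invocation is sketched below; the image tag and the version values are illustrative only, not taken from this change:

    docker build \
        --build-arg build_type=release \
        --build-arg build_version=0.4.0 \
        --build-arg flash_attn_version=v2.5.8 \
        -t nm-vllm-openai .

Leaving the arguments unset uses the defaults declared above (build_type="nightly", build_version="latest", flash_attn_version=v2.5.8), so the image installs the latest nm-vllm-nightly[sparse] wheel.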