This repository was archived by the owner on Oct 11, 2024. It is now read-only.

Commit 97d1a0c

Merge branch 'main' into License_check

2 parents ae527b4 + 0257d9d
7 files changed: +147 −82 lines
.github/actions/nm-build-docker/action.yml (+39)

```diff
@@ -0,0 +1,39 @@
+name: Build docker image
+description: 'build docker image for nm-vllm'
+inputs:
+  docker_tag:
+    description: "tag to be used for the docker image"
+    type: string
+    required: true
+  build_type:
+    description: "type of nm-vllm to install for the docker image: NIGHTLY (default) or RELEASE"
+    type: string
+    default: 'NIGHTLY'
+  build_version:
+    description: "version of nm-vllm to install for the docker image: latest (default) or specific version e.g. 0.4.0, 0.4.0.20240531"
+    type: string
+    default: 'latest'
+runs:
+  using: composite
+  steps:
+    - run: |
+        # clean up
+        docker stop $(docker ps -a -q) || echo 'no container to stop'
+        docker rm $(docker ps -a -q) || echo 'no container to remove'
+        docker rmi -f $(docker images -aq) || echo 'no image to remove'
+        docker system prune --all --force
+        # build
+        status=0
+        docker build --tag ghcr.io/neuralmagic/nm-vllm-openai:${{ inputs.docker_tag }} \
+            --build-arg build_type=${{ inputs.build_type }} \
+            --build-arg build_version=${{ inputs.build_version }} \
+            --target vllm-openai . || status=$?
+        if [ ${status} -eq 0 ] && [[ "${{ inputs.build_type }}" = "RELEASE" ]]; then
+            echo "Also tag image for RELEASE build as latest"
+            docker image tag ghcr.io/neuralmagic/nm-vllm-openai:${{ inputs.docker_tag }} ghcr.io/neuralmagic/nm-vllm-openai:latest || ((status+=$?))
+        fi
+        docker image ls -a
+        echo "status=${status}" >> $GITHUB_OUTPUT
+        echo "status=${status}"
+        exit ${status}
+      shell: bash
```
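For reference, the build this composite action performs is roughly equivalent to the following local invocation; the tag and argument values here are placeholders standing in for the `docker_tag`, `build_type`, and `build_version` inputs:

```bash
# Sketch of the action's build step run by hand; 'my-test-tag' is a placeholder.
docker build \
  --tag ghcr.io/neuralmagic/nm-vllm-openai:my-test-tag \
  --build-arg build_type=NIGHTLY \
  --build-arg build_version=latest \
  --target vllm-openai .
```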
.github/actions/nm-setup-nvidia-container-toolkit/action.yml (+20)

```diff
@@ -0,0 +1,20 @@
+name: set up nvidia-container-toolkit for docker
+description: 'sets up nvidia-container-toolkit for docker'
+runs:
+  using: composite
+  steps:
+    - run: |
+        # install nvidia-container-toolkit
+        curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
+          && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
+            sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
+            sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+        sudo sed -i -e '/experimental/ s/^#//g' /etc/apt/sources.list.d/nvidia-container-toolkit.list
+        sudo killall apt apt-get || echo 'no apt or apt-get process to kill'
+        sudo apt-get update
+        sudo apt-get install -y nvidia-container-toolkit
+        # config and restart docker
+        sudo systemctl stop docker
+        sudo nvidia-ctk runtime configure --runtime=docker
+        sudo systemctl start docker
+      shell: bash
```
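After this action runs, the Docker daemon should be able to hand GPUs to containers. A minimal smoke test, assuming a CUDA base image is available on the runner, might be:

```bash
# Hypothetical check that the nvidia runtime is wired up; the image tag is an assumption.
docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi
```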

.github/workflows/publish-docker.yml (+45 −20)

```diff
@@ -1,26 +1,36 @@
 name: Docker Build + Publish

 on:
-  # For now, just manually trigger
-  # push:
-  #   branches:
-  #     - main
-  # pull_request:
-  #   branches:
-  #     - main
   workflow_dispatch:
+    inputs:
+      docker_tag:
+        description: "tag to be used for the docker image"
+        type: string
+        required: true
+      push_to_repository:
+        description: "whether to push out the docker image: no (default) or yes"
+        type: string
+        default: 'no'
+      gitref:
+        description: "git commit hash or branch name"
+        type: string
+        default: 'main'
+      build_type:
+        description: "type of nm-vllm to install for the docker image: NIGHTLY (default) or RELEASE"
+        type: string
+        default: 'NIGHTLY'
+      build_version:
+        description: "version of nm-vllm to install for the docker image: latest (default) or specific version e.g. 0.4.0, 0.4.0.20240531"
+        type: string
+        default: 'latest'

 jobs:
   build-docker-image:

-    runs-on: aws-avx2-192G-4-a10g-96G
-    timeout-minutes: 240
+    runs-on: aws-avx2-32G-a10g-24G
+    timeout-minutes: 60

     steps:
-
-      - name: Set up Docker Buildx
-        id: buildx
-        uses: docker/setup-buildx-action@v3

       - name: Login to Github Packages
         uses: docker/login-action@v3
@@ -35,17 +45,32 @@ jobs:
           fetch-depth: 1
           submodules: recursive

-      - name: Get version tag
-        id: extract_tag
-        run: echo "tag=$(date +%Y%m%d)" >> $GITHUB_OUTPUT
+      - name: Set up nvidia-container-toolkit
+        id: setup
+        uses: ./.github/actions/nm-setup-nvidia-container-toolkit/

-      - name: Current Version Name
-        run: echo ${{ steps.extract_tag.outputs.tag }}
+      - name: Build image
+        id: build
+        uses: ./.github/actions/nm-build-docker/
+        with:
+          docker_tag: ${{ inputs.docker_tag }}
+          build_type: ${{ inputs.build_type }}
+          build_version: ${{ inputs.build_version }}
+
+      - name: Push image
+        uses: docker/build-push-action@v5
+        if: ${{ inputs.push_to_repository == 'yes' && steps.build.outputs.status == 0 }}
+        with:
+          context: .
+          target: vllm-openai
+          push: true
+          tags: ghcr.io/neuralmagic/nm-vllm-openai:${{ inputs.docker_tag }}

-      - name: nm-vllm latest
+      - name: Push image (latest for RELEASE)
         uses: docker/build-push-action@v5
+        if: ${{ inputs.push_to_repository == 'yes' && steps.build.outputs.status == 0 && inputs.build_type == 'RELEASE' }}
         with:
           context: .
           target: vllm-openai
           push: true
-          tags: ghcr.io/neuralmagic/nm-vllm-openai:${{ steps.extract_tag.outputs.tag }}
+          tags: ghcr.io/neuralmagic/nm-vllm-openai:latest
```
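Since the workflow is now `workflow_dispatch`-only, it has to be started by hand. A sketch of a manual trigger with the GitHub CLI, using placeholder input values:

```bash
# Hypothetical manual run; the docker_tag value is a placeholder.
gh workflow run publish-docker.yml \
  -f docker_tag=20240531 \
  -f push_to_repository=no \
  -f build_type=NIGHTLY \
  -f build_version=latest
```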

.github/workflows/publish.yml (+1 −3)

```diff
@@ -4,9 +4,7 @@
 name: Create Release

 on:
-  push:
-    tags:
-      - v*
+  workflow_dispatch:

 # Needed to create release and upload assets
 permissions:
```
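With the tag-push trigger removed, a release run would likewise be started manually, e.g. (sketch):

```bash
gh workflow run publish.yml
```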

Dockerfile (+40 −57)

```diff
@@ -31,61 +31,29 @@ COPY requirements-dev.txt requirements-dev.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -r requirements-dev.txt

-# cuda arch list used by torch
-# can be useful for both `dev` and `test`
-# explicitly set the list to avoid issues with torch 2.2
-# see https://github.com/pytorch/pytorch/pull/123243
-ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
-ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 #################### BASE BUILD IMAGE ####################

-
 #################### WHEEL BUILD IMAGE ####################
 FROM dev AS build

-# install build dependencies
-COPY requirements-build.txt requirements-build.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-build.txt
-
 # install compiler cache to speed up compilation leveraging local or remote caching
 RUN apt-get update -y && apt-get install -y ccache

-# files and directories related to build wheels
-COPY csrc csrc
-COPY setup.py setup.py
-COPY cmake cmake
-COPY CMakeLists.txt CMakeLists.txt
-COPY requirements-common.txt requirements-common.txt
-COPY requirements-cuda.txt requirements-cuda.txt
-COPY pyproject.toml pyproject.toml
-COPY vllm vllm
-
-# max jobs used by Ninja to build extensions
-ARG max_jobs=2
-ENV MAX_JOBS=${max_jobs}
-# number of threads used by nvcc
-ARG nvcc_threads=8
-ENV NVCC_THREADS=$nvcc_threads
-# make sure punica kernels are built (for LoRA)
-ENV VLLM_INSTALL_PUNICA_KERNELS=1
-
-ENV CCACHE_DIR=/root/.cache/ccache
-RUN --mount=type=cache,target=/root/.cache/ccache \
-    --mount=type=cache,target=/root/.cache/pip \
-    python3 setup.py bdist_wheel --dist-dir=dist
+#################### EXTENSION Build IMAGE ####################

-# check the size of the wheel, we cannot upload wheels larger than 100MB
-COPY .buildkite/check-wheel-size.py check-wheel-size.py
-RUN python3 check-wheel-size.py dist
+#################### FLASH_ATTENTION Build IMAGE ####################
+FROM dev as flash-attn-builder
+# flash attention version
+ARG flash_attn_version=v2.5.8
+ENV FLASH_ATTN_VERSION=${flash_attn_version}

-# the `vllm_nccl` package must be installed from source distribution
-# pip is too smart to store a wheel in the cache, and other CI jobs
-# will directly use the wheel from the cache, which is not what we want.
-# we need to remove it manually
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip cache remove vllm_nccl*
-#################### EXTENSION Build IMAGE ####################
+WORKDIR /usr/src/flash-attention-v2
+
+# Download the wheel or build it if a pre-compiled release doesn't exist
+RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
+    --no-build-isolation --no-deps --no-cache-dir
+
+#################### FLASH_ATTENTION Build IMAGE ####################

 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
@@ -101,28 +69,43 @@ RUN apt-get update -y \
 # or future versions of triton.
 RUN ldconfig /usr/local/cuda-12.4/compat/

-# UPSTREAM SYNC: Install sparsity extras
-RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
+# install nm-vllm wheel first, so that torch etc will be installed
+ARG build_type="nightly"
+ARG build_version="latest"
+ENV INSTALL_TYPE=${build_type}
+ENV INSTALL_VERSION=${build_version}
+# UPSTREAM SYNC: Install nm-vllm with sparsity extras
+# use nm pypi for now for testing
+RUN --mount=type=bind,from=build \
     --mount=type=cache,target=/root/.cache/pip \
-    pip install nm-magic-wand-nightly --extra-index-url https://pypi.neuralmagic.com/simple
-
-# install vllm wheel first, so that torch etc will be installed
-RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
+    if [ "${INSTALL_TYPE}" = "nightly" ]; then \
+        if [ "${INSTALL_VERSION}" = "latest" ]; then \
+            pip install nm-vllm-nightly[sparse] --extra-index-url https://pypi.neuralmagic.com/simple; \
+        else \
+            pip install nm-vllm-nightly[sparse]==${INSTALL_VERSION} --extra-index-url https://pypi.neuralmagic.com/simple; \
+        fi; \
+    else \
+        if [ "${INSTALL_VERSION}" = "latest" ]; then \
+            pip install nm-vllm[sparse] --extra-index-url https://pypi.neuralmagic.com/simple; \
+        else \
+            pip install nm-vllm[sparse]==${INSTALL_VERSION} --extra-index-url https://pypi.neuralmagic.com/simple; \
+        fi; \
+    fi
+
+RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
     --mount=type=cache,target=/root/.cache/pip \
-    pip install dist/*.whl --verbose
+    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
 #################### vLLM installation IMAGE ####################

-
 #################### TEST IMAGE ####################
 # image to run unit testing suite
 # note that this uses vllm installed by `pip`
 FROM vllm-base AS test

 ADD . /vllm-workspace/

-# install development dependencies (for testing)
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-dev.txt
+# check installed version
+RUN pip freeze | grep -e nm-vllm -e nm-magic-wand

 # doc requires source code
 # we hide them inside `test_docs/` , so that this source code
@@ -144,4 +127,4 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 ENV VLLM_USAGE_SOURCE production-docker-image

 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
-#################### OPENAI API SERVER ####################
+#################### OPENAI API SERVER ####################
```
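Putting the new build args together, a local build and run of the resulting image might look like the following; the local tag, version, and model name are placeholders:

```bash
# Sketch: build the RELEASE flavor pinned to a specific nm-vllm version,
# then serve a small model via the OpenAI-compatible entrypoint.
docker build \
  --build-arg build_type=RELEASE \
  --build-arg build_version=0.4.0 \
  --target vllm-openai -t nm-vllm-openai:local .
docker run --gpus all -p 8000:8000 nm-vllm-openai:local --model facebook/opt-125m
```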

README.md (+1 −1)

````diff
@@ -8,7 +8,7 @@
 [vLLM](https://github.com/vllm-project/vllm) is a fast and easy-to-use library for LLM inference that Neural Magic regularly contributes upstream improvements to. This fork, `nm-vllm`, is our opinionated focus on incorporating the latest LLM optimizations like quantization and sparsity for enhanced performance.

 ## Installation
-The [nm-vllm PyPi package](https://pypi.org/project/nm-vllm/) includes pre-compiled binaries for CUDA (version 12.1) kernels, streamlining the setup process. For other PyTorch or CUDA versions, please compile the package from source.
+The [nm-vllm PyPi package](https://pypi.neuralmagic.com/simple/nm-vllm/index.html) includes pre-compiled binaries for CUDA (version 12.1) kernels, streamlining the setup process. For other PyTorch or CUDA versions, please compile the package from source.

 Install it using pip:
 ```bash
````
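The Dockerfile above installs from the same index, so a matching manual install would presumably be:

```bash
# Mirrors the Dockerfile's RELEASE/latest path; [sparse] pulls in the sparsity extras.
pip install nm-vllm[sparse] --extra-index-url https://pypi.neuralmagic.com/simple
```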

vllm/__init__.py (+1 −1)

```diff
@@ -12,7 +12,7 @@
 from vllm.sampling_params import SamplingParams

 # UPSTREAM SYNC: use the current downstream.
-__version__ = "0.4.0"
+__version__ = "0.5.0"

 __all__ = [
     "LLM",
```
