@@ -31,61 +31,29 @@ COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-dev.txt

-# cuda arch list used by torch
-# can be useful for both `dev` and `test`
-# explicitly set the list to avoid issues with torch 2.2
-# see https://github.com/pytorch/pytorch/pull/123243
-ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
-ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
#################### BASE BUILD IMAGE ####################

-
#################### WHEEL BUILD IMAGE ####################
FROM dev AS build

-# install build dependencies
-COPY requirements-build.txt requirements-build.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-build.txt
-
# install compiler cache to speed up compilation leveraging local or remote caching
RUN apt-get update -y && apt-get install -y ccache

-# files and directories related to build wheels
-COPY csrc csrc
-COPY setup.py setup.py
-COPY cmake cmake
-COPY CMakeLists.txt CMakeLists.txt
-COPY requirements-common.txt requirements-common.txt
-COPY requirements-cuda.txt requirements-cuda.txt
-COPY pyproject.toml pyproject.toml
-COPY vllm vllm
-
-# max jobs used by Ninja to build extensions
-ARG max_jobs=2
-ENV MAX_JOBS=${max_jobs}
-# number of threads used by nvcc
-ARG nvcc_threads=8
-ENV NVCC_THREADS=$nvcc_threads
-# make sure punica kernels are built (for LoRA)
-ENV VLLM_INSTALL_PUNICA_KERNELS=1
-
-ENV CCACHE_DIR=/root/.cache/ccache
-RUN --mount=type=cache,target=/root/.cache/ccache \
-    --mount=type=cache,target=/root/.cache/pip \
-    python3 setup.py bdist_wheel --dist-dir=dist
+#################### EXTENSION Build IMAGE ####################

-# check the size of the wheel, we cannot upload wheels larger than 100MB
-COPY .buildkite/check-wheel-size.py check-wheel-size.py
-RUN python3 check-wheel-size.py dist
+#################### FLASH_ATTENTION Build IMAGE ####################
+FROM dev as flash-attn-builder
+# flash attention version
+ARG flash_attn_version=v2.5.8
+ENV FLASH_ATTN_VERSION=${flash_attn_version}

-# the `vllm_nccl` package must be installed from source distribution
-# pip is too smart to store a wheel in the cache, and other CI jobs
-# will directly use the wheel from the cache, which is not what we want.
-# we need to remove it manually
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip cache remove vllm_nccl*
-#################### EXTENSION Build IMAGE ####################
+WORKDIR /usr/src/flash-attention-v2
+
+# Download the wheel or build it if a pre-compiled release doesn't exist
+RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
+    --no-build-isolation --no-deps --no-cache-dir
+
+#################### FLASH_ATTENTION Build IMAGE ####################

#################### vLLM installation IMAGE ####################
# image with vLLM installed
@@ -101,28 +69,43 @@ RUN apt-get update -y \
# or future versions of triton.
RUN ldconfig /usr/local/cuda-12.4/compat/

-# UPSTREAM SYNC: Install sparsity extras
-RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
+# install nm-vllm wheel first, so that torch etc will be installed
+ARG build_type="nightly"
+ARG build_version="latest"
+ENV INSTALL_TYPE=${build_type}
+ENV INSTALL_VERSION=${build_version}
+# UPSTREAM SYNC: Install nm-vllm with sparsity extras
+# use nm pypi for now for testing
+RUN --mount=type=bind,from=build \
    --mount=type=cache,target=/root/.cache/pip \
-    pip install nm-magic-wand-nightly --extra-index-url https://pypi.neuralmagic.com/simple
-
-# install vllm wheel first, so that torch etc will be installed
-RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
+    if [ "${INSTALL_TYPE}" = "nightly" ]; then \
+        if [ "${INSTALL_VERSION}" = "latest" ]; then \
+            pip install nm-vllm-nightly[sparse] --extra-index-url https://pypi.neuralmagic.com/simple; \
+        else \
+            pip install nm-vllm-nightly[sparse]==${INSTALL_VERSION} --extra-index-url https://pypi.neuralmagic.com/simple; \
+        fi; \
+    else \
+        if [ "${INSTALL_VERSION}" = "latest" ]; then \
+            pip install nm-vllm[sparse] --extra-index-url https://pypi.neuralmagic.com/simple; \
+        else \
+            pip install nm-vllm[sparse]==${INSTALL_VERSION} --extra-index-url https://pypi.neuralmagic.com/simple; \
+        fi; \
+    fi
+
+RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
    --mount=type=cache,target=/root/.cache/pip \
-    pip install dist/*.whl --verbose
+    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir

#################### vLLM installation IMAGE ####################

-
#################### TEST IMAGE ####################
# image to run unit testing suite
# note that this uses vllm installed by `pip`
FROM vllm-base AS test

ADD . /vllm-workspace/

-# install development dependencies (for testing)
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-dev.txt
+# check installed version
+RUN pip freeze | grep -e nm-vllm -e nm-magic-wand

# doc requires source code
# we hide them inside `test_docs/` , so that this source code
@@ -144,4 +127,4 @@ RUN --mount=type=cache,target=/root/.cache/pip \
ENV VLLM_USAGE_SOURCE production-docker-image

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
-#################### OPENAI API SERVER ####################
+#################### OPENAI API SERVER ####################
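Note: the build arguments introduced in this diff (build_type, build_version, and flash_attn_version) can be overridden at image build time with --build-arg. A hypothetical invocation is sketched below; the image tag and the version values are illustrative only, not taken from this change:

    docker build \
        --build-arg build_type=release \
        --build-arg build_version=0.4.0 \
        --build-arg flash_attn_version=v2.5.8 \
        -t nm-vllm-openai .

Leaving the arguments unset uses the defaults declared above (build_type="nightly", build_version="latest", flash_attn_version=v2.5.8), so the image installs the latest nm-vllm-nightly[sparse] wheel.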