
♻️ install vllm using wheels
Signed-off-by: Prashant Gupta <prashantgupta@us.ibm.com>
prashantgupta24 committed Apr 19, 2024
1 parent 82d2261 commit 2c9e90a
38 changes: 21 additions & 17 deletions Dockerfile.ubi
@@ -146,6 +146,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \


## Builder #####################################################################
+# need something with python and cuda?
FROM dev AS build

# install build dependencies
@@ -180,6 +181,11 @@ ENV PATH=/usr/local/cuda/bin:$PATH
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH

ENV CCACHE_DIR=/root/.cache/ccache

+# TODO: Try running the build-ext command first to see
+# if it can pick up the *.so files
+# to speed up builds

RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/pip \
    python3 setup.py bdist_wheel --dist-dir=dist
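
The TODO above refers to setuptools' standard build_ext command. A minimal sketch of the idea it floats, assuming plain setuptools behavior and reusing the same cache mounts (hypothetical, not part of this commit):

# Hypothetical variant of the build step sketching the TODO: compile the
# extensions first (so ccache can satisfy repeat builds), then check whether
# bdist_wheel picks the fresh *.so files up instead of recompiling them.
RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/pip \
    python3 setup.py build_ext --inplace \
    && python3 setup.py bdist_wheel --dist-dir=dist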
@@ -195,19 +201,19 @@ RUN --mount=type=cache,target=/root/.cache/pip \
## Extension Cache #############################################################
# Instead of compiling artifacts every build just copy from pre-built wheel
# This might not work if the PyTorch and CUDA versions don't match!
-FROM base as prebuilt-wheel
+# FROM base as prebuilt-wheel

-RUN microdnf install -y \
-    unzip \
-    && microdnf clean all
+# RUN microdnf install -y \
+#     unzip \
+#     && microdnf clean all

-ARG PYTHON_VERSION
-# 0.4.0.post1 is built for CUDA 12.1 and PyTorch 2.1.2
-ARG VLLM_WHEEL_VERSION=0.4.0.post1
+# ARG PYTHON_VERSION
+# # 0.4.0.post1 is built for CUDA 12.1 and PyTorch 2.1.2
+# ARG VLLM_WHEEL_VERSION=0.4.0.post1

-RUN curl -Lo vllm.whl https://github.com/vllm-project/vllm/releases/download/v${VLLM_WHEEL_VERSION}/vllm-${VLLM_WHEEL_VERSION}-cp${PYTHON_VERSION//.}-cp${PYTHON_VERSION//.}-manylinux1_x86_64.whl \
-    && unzip vllm.whl \
-    && rm vllm.whl
+# RUN curl -Lo vllm.whl https://github.com/vllm-project/vllm/releases/download/v${VLLM_WHEEL_VERSION}/vllm-${VLLM_WHEEL_VERSION}-cp${PYTHON_VERSION//.}-cp${PYTHON_VERSION//.}-manylinux1_x86_64.whl \
+#     && unzip vllm.whl \
+#     && rm vllm.whl
# compiled extensions located at /workspace/vllm/*.so

#################### FLASH_ATTENTION Build IMAGE ####################
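
For reference, the commented-out curl command relies on bash pattern substitution to form the CPython ABI tag in the wheel filename: ${PYTHON_VERSION//.} deletes every dot from the version string. A quick illustration, assuming PYTHON_VERSION=3.11 (this substitution is a bash feature and fails under plain POSIX sh):

PYTHON_VERSION=3.11
echo "cp${PYTHON_VERSION//.}"
# prints cp311, selecting vllm-0.4.0.post1-cp311-cp311-manylinux1_x86_64.whl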
@@ -265,7 +271,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \

## vLLM installation IMAGE ##########################################################
# image with vLLM installed
-FROM base AS vllm
+# need something with python and cuda?
+FROM dev AS vllm

WORKDIR /vllm-workspace

@@ -276,7 +283,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist

# COPY files from various places into a staging directory
# COPY --link vllm vllm
-# COPY --from=build --link /workspace/vllm/*.so vllm/
+# COPY --from=prebuilt-wheel --link /workspace/vllm/*.so vllm/
COPY --from=gen-protos --link /workspace/vllm/entrypoints/grpc/pb vllm/entrypoints/grpc/pb

# custom COPY command to use umask to control permissions and grant permissions
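
The hunk header above shows dist/ being bind-mounted from the build stage rather than copied in, so the wheel never persists in an image layer; the install command itself is truncated in this view. A hedged reconstruction of what such a step typically looks like (the pip flags are an assumption, not taken from the file):

RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
    # install the wheel built in the "build" stage; dist/ exists only for this RUN
    pip3 install dist/*.whl --no-cache-dir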
@@ -295,8 +302,8 @@ FROM cuda-runtime AS vllm-openai

WORKDIR /workspace

-# Create release python environment
-COPY --from=python-torch-base --link /opt/vllm /opt/vllm
+# Create release python environment, this should have vllm too?
+COPY --from=vllm --link /opt/vllm /opt/vllm
ENV PATH=/opt/vllm/bin/:$PATH

RUN --mount=type=cache,target=/root/.cache/pip \
@@ -316,9 +323,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \
RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
    pip3 install /usr/src/flash-attention-v2/*.whl --no-cache-dir

-# vLLM will not be installed in site-packages
-COPY --from=vllm --link /workspace/ ./

# Triton needs a CC compiler
RUN microdnf install -y gcc \
    && microdnf clean all
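
Taken together, the commit swaps the copy-loose-*.so-files approach for a conventional wheel flow: the build stage produces dist/*.whl, the vllm stage (now based on dev, per the "need something with python and cuda?" comment) installs the wheel into the Python environment under /opt/vllm, and the release image copies that environment wholesale. A condensed sketch of the resulting stage graph, with stage and path names taken from the diff and the pip install line the same assumption as above:

FROM dev AS build
RUN python3 setup.py bdist_wheel --dist-dir=dist

FROM dev AS vllm
WORKDIR /vllm-workspace
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
    pip3 install dist/*.whl --no-cache-dir

FROM cuda-runtime AS vllm-openai
# the vllm stage's environment, wheel included, becomes the release environment
COPY --from=vllm --link /opt/vllm /opt/vllm
ENV PATH=/opt/vllm/bin/:$PATH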
