
♻️ install vllm using wheels
Signed-off-by: Prashant Gupta <prashantgupta@us.ibm.com>
prashantgupta24 committed Apr 19, 2024
1 parent 82d2261 commit 2c9e90a
38 changes: 21 additions & 17 deletions Dockerfile.ubi
@@ -146,6 +146,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \


## Builder #####################################################################
+# need something with python and cuda?
FROM dev AS build

# install build dependencies
@@ -180,6 +181,11 @@ ENV PATH=/usr/local/cuda/bin:$PATH
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH

ENV CCACHE_DIR=/root/.cache/ccache

+# TODO: Try running the build-ext command first to see
+# if it can pick up the *.so files
+# to speed up builds

RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/pip \
    python3 setup.py bdist_wheel --dist-dir=dist
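
The TODO above refers to setuptools' standard build_ext command. A minimal sketch of the idea it floats, assuming plain setuptools behavior and reusing the same cache mounts (hypothetical, not part of this commit):

# Hypothetical variant of the build step sketching the TODO: compile the
# extensions first (so ccache can satisfy repeat builds), then check whether
# bdist_wheel picks the fresh *.so files up instead of recompiling them.
RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/pip \
    python3 setup.py build_ext --inplace \
    && python3 setup.py bdist_wheel --dist-dir=dist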
@@ -195,19 +201,19 @@ RUN --mount=type=cache,target=/root/.cache/pip \
## Extension Cache #############################################################
# Instead of compiling artifacts every build just copy from pre-built wheel
# This might not work if the PyTorch and CUDA versions don't match!
-FROM base as prebuilt-wheel
+# FROM base as prebuilt-wheel

-RUN microdnf install -y \
-    unzip \
-    && microdnf clean all
+# RUN microdnf install -y \
+#     unzip \
+#     && microdnf clean all

-ARG PYTHON_VERSION
-# 0.4.0.post1 is built for CUDA 12.1 and PyTorch 2.1.2
-ARG VLLM_WHEEL_VERSION=0.4.0.post1
+# ARG PYTHON_VERSION
+# # 0.4.0.post1 is built for CUDA 12.1 and PyTorch 2.1.2
+# ARG VLLM_WHEEL_VERSION=0.4.0.post1

-RUN curl -Lo vllm.whl https://github.com/vllm-project/vllm/releases/download/v${VLLM_WHEEL_VERSION}/vllm-${VLLM_WHEEL_VERSION}-cp${PYTHON_VERSION//.}-cp${PYTHON_VERSION//.}-manylinux1_x86_64.whl \
-    && unzip vllm.whl \
-    && rm vllm.whl
+# RUN curl -Lo vllm.whl https://github.com/vllm-project/vllm/releases/download/v${VLLM_WHEEL_VERSION}/vllm-${VLLM_WHEEL_VERSION}-cp${PYTHON_VERSION//.}-cp${PYTHON_VERSION//.}-manylinux1_x86_64.whl \
+#     && unzip vllm.whl \
+#     && rm vllm.whl
# compiled extensions located at /workspace/vllm/*.so

#################### FLASH_ATTENTION Build IMAGE ####################
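
For reference, the commented-out curl command relies on bash pattern substitution to form the CPython ABI tag in the wheel filename: ${PYTHON_VERSION//.} deletes every dot from the version string. A quick illustration, assuming PYTHON_VERSION=3.11 (this substitution is a bash feature and fails under plain POSIX sh):

PYTHON_VERSION=3.11
echo "cp${PYTHON_VERSION//.}"
# prints cp311, selecting vllm-0.4.0.post1-cp311-cp311-manylinux1_x86_64.whl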
@@ -265,7 +271,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \

## vLLM installation IMAGE ##########################################################
# image with vLLM installed
-FROM base AS vllm
+# need something with python and cuda?
+FROM dev AS vllm

WORKDIR /vllm-workspace

@@ -276,7 +283,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist

# COPY files from various places into a staging directory
# COPY --link vllm vllm
-# COPY --from=build --link /workspace/vllm/*.so vllm/
+# COPY --from=prebuilt-wheel --link /workspace/vllm/*.so vllm/
COPY --from=gen-protos --link /workspace/vllm/entrypoints/grpc/pb vllm/entrypoints/grpc/pb

# custom COPY command to use umask to control permissions and grant permissions
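
The hunk header above shows dist/ being bind-mounted from the build stage rather than copied in, so the wheel never persists in an image layer; the install command itself is truncated in this view. A hedged reconstruction of what such a step typically looks like (the pip flags are an assumption, not taken from the file):

RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
    # install the wheel built in the "build" stage; dist/ exists only for this RUN
    pip3 install dist/*.whl --no-cache-dir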
@@ -295,8 +302,8 @@ FROM cuda-runtime AS vllm-openai

WORKDIR /workspace

-# Create release python environment
-COPY --from=python-torch-base --link /opt/vllm /opt/vllm
+# Create release python environment, this should have vllm too?
+COPY --from=vllm --link /opt/vllm /opt/vllm
ENV PATH=/opt/vllm/bin/:$PATH

RUN --mount=type=cache,target=/root/.cache/pip \
@@ -316,9 +323,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \
RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
    pip3 install /usr/src/flash-attention-v2/*.whl --no-cache-dir

-# vLLM will not be installed in site-packages
-COPY --from=vllm --link /workspace/ ./

# Triton needs a CC compiler
RUN microdnf install -y gcc \
    && microdnf clean all
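
Taken together, the commit swaps the copy-loose-*.so-files approach for a conventional wheel flow: the build stage produces dist/*.whl, the vllm stage (now based on dev, per the "need something with python and cuda?" comment) installs the wheel into the Python environment under /opt/vllm, and the release image copies that environment wholesale. A condensed sketch of the resulting stage graph, with stage and path names taken from the diff and the pip install line the same assumption as above:

FROM dev AS build
RUN python3 setup.py bdist_wheel --dist-dir=dist

FROM dev AS vllm
WORKDIR /vllm-workspace
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
    pip3 install dist/*.whl --no-cache-dir

FROM cuda-runtime AS vllm-openai
# the vllm stage's environment, wheel included, becomes the release environment
COPY --from=vllm --link /opt/vllm /opt/vllm
ENV PATH=/opt/vllm/bin/:$PATH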
