diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index 052ae1b6e..0b2bd225f 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -146,6 +146,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 
 
 ## Builder #####################################################################
+# need something with python and cuda?
 FROM dev AS build
 
 # install build dependencies
@@ -180,6 +181,11 @@ ENV PATH=/usr/local/cuda/bin:$PATH
 ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
 
 ENV CCACHE_DIR=/root/.cache/ccache
+
+# TODO: Try running the build-ext command first to see
+# if it can pick up the *.so files
+# to speed up builds
+
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/pip \
     python3 setup.py bdist_wheel --dist-dir=dist
@@ -195,19 +201,19 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 ## Extension Cache #############################################################
 # Instead of compiling artifacts every build just copy from pre-built wheel
 # This might not work if the PyTorch and CUDA versions don't match!
-FROM base as prebuilt-wheel
+# FROM base as prebuilt-wheel
 
-RUN microdnf install -y \
-    unzip \
-    && microdnf clean all
+# RUN microdnf install -y \
+#     unzip \
+#     && microdnf clean all
 
-ARG PYTHON_VERSION
-# 0.4.0.post1 is built for CUDA 12.1 and PyTorch 2.1.2
-ARG VLLM_WHEEL_VERSION=0.4.0.post1
+# ARG PYTHON_VERSION
+# # 0.4.0.post1 is built for CUDA 12.1 and PyTorch 2.1.2
+# ARG VLLM_WHEEL_VERSION=0.4.0.post1
 
-RUN curl -Lo vllm.whl https://github.com/vllm-project/vllm/releases/download/v${VLLM_WHEEL_VERSION}/vllm-${VLLM_WHEEL_VERSION}-cp${PYTHON_VERSION//.}-cp${PYTHON_VERSION//.}-manylinux1_x86_64.whl \
-    && unzip vllm.whl \
-    && rm vllm.whl
+# RUN curl -Lo vllm.whl https://github.com/vllm-project/vllm/releases/download/v${VLLM_WHEEL_VERSION}/vllm-${VLLM_WHEEL_VERSION}-cp${PYTHON_VERSION//.}-cp${PYTHON_VERSION//.}-manylinux1_x86_64.whl \
+#     && unzip vllm.whl \
+#     && rm vllm.whl
 
 # compiled extensions located at /workspace/vllm/*.so
 
 #################### FLASH_ATTENTION Build IMAGE ####################
@@ -265,7 +271,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 
 ## vLLM installation IMAGE ##########################################################
 # image with vLLM installed
-FROM base AS vllm
+# need something with python and cuda?
+FROM dev AS vllm
 
 WORKDIR /vllm-workspace
 
@@ -276,7 +283,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 
 # COPY files from various places into a staging directory
 # COPY --link vllm vllm
-# COPY --from=build --link /workspace/vllm/*.so vllm/
+# COPY --from=prebuilt-wheel --link /workspace/vllm/*.so vllm/
 COPY --from=gen-protos --link /workspace/vllm/entrypoints/grpc/pb vllm/entrypoints/grpc/pb
 
 # custom COPY command to use umask to control permissions and grant permissions
@@ -295,8 +302,8 @@ FROM cuda-runtime AS vllm-openai
 
 WORKDIR /workspace
 
-# Create release python environment
-COPY --from=python-torch-base --link /opt/vllm /opt/vllm
+# Create release python environment, this should have vllm too?
+COPY --from=vllm --link /opt/vllm /opt/vllm
 ENV PATH=/opt/vllm/bin/:$PATH
 
 RUN --mount=type=cache,target=/root/.cache/pip \
@@ -316,9 +323,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
     pip3 install /usr/src/flash-attention-v2/*.whl --no-cache-dir
 
-# vLLM will not be installed in site-packages
-COPY --from=vllm --link /workspace/ ./
-
 # Triton needs a CC compiler
 RUN microdnf install -y gcc \
     && microdnf clean all