opendatahub-io · dtrifiro · Aug 12, 2024 · Aug 12, 2024 · Aug 12, 2024 · Aug 5, 2024
diff --git a/Dockerfile.ubi b/Dockerfile.ubi
@@ -32,7 +32,7 @@ ENV VIRTUAL_ENV=/opt/vllm
 ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 RUN microdnf install -y \
     python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel && \
-    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel && microdnf clean all
+    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel uv && microdnf clean all
 
 
 ## CUDA Base ###################################################################
@@ -42,7 +42,7 @@ RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
         https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
 
 RUN microdnf install -y \
-        cuda-nvcc-12-4 cuda-nvtx-12-4 cuda-libraries-devel-12-4 && \
+        cuda-nvcc-12-1 cuda-nvtx-12-1 cuda-libraries-devel-12-1 && \
     microdnf clean all
 
 ENV CUDA_HOME="/usr/local/cuda" \
@@ -57,23 +57,26 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
 
 # install cuda and common dependencies
 RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
     --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
-    pip install \
+    uv pip install \
         -r requirements-cuda.txt
 
+
 ## Development #################################################################
 FROM python-cuda-base AS dev
 
 # install build and runtime dependencies
 RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
     --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
     --mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
     --mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \
     --mount=type=bind,source=requirements-adag.txt,target=requirements-adag.txt \
     --mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \
-    pip3 install \
+    uv pip install \
         -r requirements-cuda.txt \
         -r requirements-dev.txt
 
@@ -82,8 +85,9 @@ FROM dev AS build
 
 # install build dependencies
 RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
-    pip install -r requirements-build.txt
+    uv pip install -r requirements-build.txt
 
 # install compiler cache to speed up compilation leveraging local or remote caching
 # git is required for the cutlass kernels
@@ -113,14 +117,14 @@ ENV VLLM_INSTALL_PUNICA_KERNELS=1
 
 # Make sure the cuda environment is in the PATH
 ENV PATH=/usr/local/cuda/bin:$PATH
-ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
 
 # Copy the entire directory before building wheel
 COPY vllm vllm
 
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,src=.git,target=/workspace/.git \
     env CFLAGS="-march=haswell" \
         CXXFLAGS="$CFLAGS $CXXFLAGS" \
@@ -145,28 +149,36 @@ RUN ./configure --prefix="/usr/" && make && make check
 
 ## Release #####################################################################
 FROM python-install AS vllm-openai
+ARG PYTHON_VERSION
 
 WORKDIR /workspace
 
 ENV VIRTUAL_ENV=/opt/vllm
 ENV PATH=$VIRTUAL_ENV/bin/:$PATH
 
+# Prepend the venv's bundled NVIDIA libraries (nvrtc, cuda_runtime, nvtx) to LD_LIBRARY_PATH so they are found before any other CUDA libraries
+ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_nvrtc/lib:${LD_LIBRARY_PATH}"
+ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/cuda_runtime/lib:${LD_LIBRARY_PATH}"
+ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvtx/lib:${LD_LIBRARY_PATH}"
+
 # Triton needs a CC compiler
 RUN microdnf install -y gcc \
     && microdnf clean all
 
 # install vllm wheel first, so that torch etc will be installed
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
     --mount=type=cache,target=/root/.cache/pip \
-    pip install $(echo dist/*.whl)'[tensorizer]' --verbose
+    --mount=type=cache,target=/root/.cache/uv \
+    uv pip install "$(echo dist/*.whl)[tensorizer]" --verbose
 
 # Install libsodium for Tensorizer encryption
 RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
     cd /usr/src/libsodium \
     && make install
 
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.9/flashinfer-0.0.9+cu121torch2.3-cp311-cp311-linux_x86_64.whl
+    --mount=type=cache,target=/root/.cache/uv \
+    uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp311-cp311-linux_x86_64.whl
 
 ENV HF_HUB_OFFLINE=1 \
     PORT=8000 \
@@ -190,7 +202,7 @@ FROM vllm-openai as vllm-grpc-adapter
 USER root
 
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install vllm-tgis-adapter==0.2.3
+    pip install vllm-tgis-adapter==0.2.4
 
 ENV GRPC_PORT=8033
 USER 2000

diff --git a/extras/smoke-test.sh b/extras/smoke-test.sh
@@ -22,6 +22,7 @@ function wait_for(){
         max_retries=$((max_retries-1))
         if [[ max_retries -le 0 ]]; then
             echo "Timed out waiting for $name server" >&2
+            kill -9 ${server_pid}
             exit 1
         fi
     done