🐛 compile it ourselves
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
joerunde committed Apr 11, 2024
1 parent ba27b4d commit e6ec7db
Showing 2 changed files with 11 additions and 6 deletions.
11 changes: 8 additions & 3 deletions Dockerfile.ubi
@@ -175,6 +175,10 @@ ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1

+# Setup path stuff? Ref: https://github.com/vllm-project/vllm/blob/main/.github/workflows/scripts/build.sh#L6-L8
+ENV PATH=/usr/local/cuda/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+
RUN python3 setup.py build_ext --inplace
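The two added ENV lines mirror vLLM's own CI build script (linked in the comment): setup.py shells out to nvcc, so the CUDA toolchain has to be resolvable in this stage before build_ext runs. A minimal sanity check one could place after those lines (hypothetical, not part of this commit):

    # Hypothetical: fail the image build early if the CUDA toolchain
    # is not actually discoverable before compiling the extensions.
    RUN which nvcc && nvcc --version && test -d /usr/local/cuda/lib64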


@@ -257,7 +261,7 @@ FROM base AS vllm
WORKDIR /vllm-staging
# COPY files from various places into a staging directory
COPY --link vllm vllm
-COPY --from=prebuilt-wheel --link /workspace/vllm/*.so vllm/
+COPY --from=build --link /workspace/vllm/*.so vllm/
COPY --from=gen-protos --link /workspace/vllm/entrypoints/grpc/pb vllm/entrypoints/grpc/pb

# custom COPY command to use umask to control permissions and grant permissions
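This changed COPY source is the heart of the fix named in the commit title: the extension .so files are now taken from the build stage, where the previous hunk compiled them in-image, instead of from a stage that unpacked a prebuilt wheel. A rough skeleton of the stage relationship (stage bodies elided; the build stage's base image is an assumption, not shown in this diff):

    FROM ... AS build
    # CUDA toolchain setup from the first hunk, then:
    RUN python3 setup.py build_ext --inplace    # emits /workspace/vllm/*.so

    FROM base AS vllm
    WORKDIR /vllm-staging
    COPY --from=build --link /workspace/vllm/*.so vllm/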
@@ -281,9 +285,10 @@ COPY --from=python-torch-base --link /opt/vllm /opt/vllm
ENV PATH=/opt/vllm/bin/:$PATH

RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,source=requirements.txt,target=requirements.txt \
+    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
+    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
pip3 install \
-    -r requirements.txt \
+    -r requirements-cuda.txt \
# additional dependencies for the TGIS gRPC server
grpcio-tools==1.62.1 \
# additional dependencies for openai api_server
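One nuance behind this hunk (an assumption about vLLM's requirements layout at the time, not visible in the diff itself): requirements-cuda.txt pulls in the shared dependencies via a pip include directive, which is why requirements-common.txt must also be bind-mounted even though only requirements-cuda.txt is passed to pip3 install:

    # requirements-cuda.txt (assumed first line)
    -r requirements-common.txt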
6 changes: 3 additions & 3 deletions vllm/attention/ops/paged_attn.py
@@ -76,7 +76,7 @@ def write_to_paged_cache(
value_cache,
slot_mapping.flatten(),
kv_cache_dtype,
-    # kv_scale,
+    kv_scale,
)

@staticmethod
@@ -123,7 +123,7 @@ def forward_decode(
max_context_len,
alibi_slopes,
kv_cache_dtype,
-    # kv_scale,
+    kv_scale,
)
else:
# Run PagedAttention V2.
@@ -155,7 +155,7 @@ def forward_decode(
max_context_len,
alibi_slopes,
kv_cache_dtype,
-    # kv_scale,
+    kv_scale,
)
return output
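The three uncommented arguments thread the fp8 KV-cache scaling factor down to the compiled kernels, which is presumably what forced compiling the extensions in-image: the prebuilt wheel's kernels would not yet accept the kv_scale parameter. A sketch of the first wrapper after this change (reconstructed from the hunk context; in the source it is a staticmethod on PagedAttention, and the cache_ops import path is an assumption based on vLLM's layout at the time):

    import torch
    from vllm._C import cache_ops  # built by `python3 setup.py build_ext --inplace`

    def write_to_paged_cache(
        key: torch.Tensor,
        value: torch.Tensor,
        key_cache: torch.Tensor,
        value_cache: torch.Tensor,
        slot_mapping: torch.Tensor,
        kv_cache_dtype: str,
        kv_scale: float,
    ) -> None:
        # kv_scale is now forwarded to the kernel instead of being commented out
        cache_ops.reshape_and_cache(
            key, value, key_cache, value_cache,
            slot_mapping.flatten(), kv_cache_dtype, kv_scale,
        )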

