🐛 compile it ourselves
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
joerunde committed Apr 11, 2024
1 parent ba27b4d commit e6ec7db
Showing 2 changed files with 11 additions and 6 deletions.
11 changes: 8 additions & 3 deletions Dockerfile.ubi
@@ -175,6 +175,10 @@ ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1

+# Setup path stuff? Ref: https://github.com/vllm-project/vllm/blob/main/.github/workflows/scripts/build.sh#L6-L8
+ENV PATH=/usr/local/cuda/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+
RUN python3 setup.py build_ext --inplace
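The two added ENV lines mirror vLLM's own CI build script (linked in the comment): setup.py shells out to nvcc, so the CUDA toolchain has to be resolvable in this stage before build_ext runs. A minimal sanity check one could place after those lines (hypothetical, not part of this commit):

    # Hypothetical: fail the image build early if the CUDA toolchain
    # is not actually discoverable before compiling the extensions.
    RUN which nvcc && nvcc --version && test -d /usr/local/cuda/lib64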


@@ -257,7 +261,7 @@ FROM base AS vllm
WORKDIR /vllm-staging
# COPY files from various places into a staging directory
COPY --link vllm vllm
-COPY --from=prebuilt-wheel --link /workspace/vllm/*.so vllm/
+COPY --from=build --link /workspace/vllm/*.so vllm/
COPY --from=gen-protos --link /workspace/vllm/entrypoints/grpc/pb vllm/entrypoints/grpc/pb

# custom COPY command to use umask to control permissions and grant permissions
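This changed COPY source is the heart of the fix named in the commit title: the extension .so files are now taken from the build stage, where the previous hunk compiled them in-image, instead of from a stage that unpacked a prebuilt wheel. A rough skeleton of the stage relationship (stage bodies elided; the build stage's base image is an assumption, not shown in this diff):

    FROM ... AS build
    # CUDA toolchain setup from the first hunk, then:
    RUN python3 setup.py build_ext --inplace    # emits /workspace/vllm/*.so

    FROM base AS vllm
    WORKDIR /vllm-staging
    COPY --from=build --link /workspace/vllm/*.so vllm/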
@@ -281,9 +285,10 @@ COPY --from=python-torch-base --link /opt/vllm /opt/vllm
ENV PATH=/opt/vllm/bin/:$PATH

RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,source=requirements.txt,target=requirements.txt \
+    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
+    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
pip3 install \
-    -r requirements.txt \
+    -r requirements-cuda.txt \
# additional dependencies for the TGIS gRPC server
grpcio-tools==1.62.1 \
# additional dependencies for openai api_server
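One nuance behind this hunk (an assumption about vLLM's requirements layout at the time, not visible in the diff itself): requirements-cuda.txt pulls in the shared dependencies via a pip include directive, which is why requirements-common.txt must also be bind-mounted even though only requirements-cuda.txt is passed to pip3 install:

    # requirements-cuda.txt (assumed first line)
    -r requirements-common.txt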
6 changes: 3 additions & 3 deletions vllm/attention/ops/paged_attn.py
@@ -76,7 +76,7 @@ def write_to_paged_cache(
value_cache,
slot_mapping.flatten(),
kv_cache_dtype,
-    # kv_scale,
+    kv_scale,
)

@staticmethod
@@ -123,7 +123,7 @@ def forward_decode(
max_context_len,
alibi_slopes,
kv_cache_dtype,
-    # kv_scale,
+    kv_scale,
)
else:
# Run PagedAttention V2.
@@ -155,7 +155,7 @@ def forward_decode(
max_context_len,
alibi_slopes,
kv_cache_dtype,
-    # kv_scale,
+    kv_scale,
)
return output
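The three uncommented arguments thread the fp8 KV-cache scaling factor down to the compiled kernels, which is presumably what forced compiling the extensions in-image: the prebuilt wheel's kernels would not yet accept the kv_scale parameter. A sketch of the first wrapper after this change (reconstructed from the hunk context; in the source it is a staticmethod on PagedAttention, and the cache_ops import path is an assumption based on vLLM's layout at the time):

    import torch
    from vllm._C import cache_ops  # built by `python3 setup.py build_ext --inplace`

    def write_to_paged_cache(
        key: torch.Tensor,
        value: torch.Tensor,
        key_cache: torch.Tensor,
        value_cache: torch.Tensor,
        slot_mapping: torch.Tensor,
        kv_cache_dtype: str,
        kv_scale: float,
    ) -> None:
        # kv_scale is now forwarded to the kernel instead of being commented out
        cache_ops.reshape_and_cache(
            key, value, key_cache, value_cache,
            slot_mapping.flatten(), kv_cache_dtype, kv_scale,
        )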

