diff --git a/Dockerfile.ubi b/Dockerfile.ubi index d46650db1..e8567e775 100644 --- a/Dockerfile.ubi +++ b/Dockerfile.ubi @@ -257,7 +257,7 @@ FROM base AS vllm WORKDIR /vllm-staging # COPY files from various places into a staging directory COPY --link vllm vllm -COPY --from=prebuilt-wheel --link /workspace/vllm/*.so vllm/ +COPY --from=build --link /workspace/vllm/*.so vllm/ COPY --from=gen-protos --link /workspace/vllm/entrypoints/grpc/pb vllm/entrypoints/grpc/pb # custom COPY command to use umask to control permissions and grant permissions diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index c6efbf67c..a0837a208 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -49,7 +49,7 @@ def paged_attention_v1( vllm_ops.paged_attention_v1(out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, context_lens, block_size, max_context_len, - alibi_slopes, kv_cache_dtype)#, kv_scale) + alibi_slopes, kv_cache_dtype, kv_scale) def paged_attention_v2( @@ -73,8 +73,8 @@ def paged_attention_v2( vllm_ops.paged_attention_v2(out, exp_sum, max_logits, tmp_out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, context_lens, block_size, - max_context_len, alibi_slopes, kv_cache_dtype) - # kv_scale) + max_context_len, alibi_slopes, kv_cache_dtype, + kv_scale) # pos encoding ops @@ -173,7 +173,7 @@ def reshape_and_cache( kv_scale: float, ) -> None: vllm_cache_ops.reshape_and_cache(key, value, key_cache, value_cache, - slot_mapping, kv_cache_dtype)#, kv_scale) + slot_mapping, kv_cache_dtype, kv_scale) def copy_blocks(key_caches: torch.Tensor, value_caches: torch.Tensor,