opendatahub-io · dtrifiro · Aug 21, 2024 · Aug 20, 2024
diff --git a/Dockerfile.ubi b/Dockerfile.ubi
@@ -181,7 +181,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp311-cp311-linux_x86_64.whl
 
 ENV HF_HUB_OFFLINE=1 \
-    PORT=8000 \
     HOME=/home/vllm \
     # Allow requested max length to exceed what is extracted from the
     # config.json
@@ -208,6 +207,13 @@ USER root
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install vllm-tgis-adapter==0.3.0
 
-ENV GRPC_PORT=8033
+ENV GRPC_PORT=8033 \
+    PORT=8000 \
+    # As an optimization, vLLM disables logprobs by default when speculative
+    # decoding is in use. Users of a hosted model that happens to have spec
+    # decoding enabled would not expect that, so it is set to false here.
+    # see: https://github.com/vllm-project/vllm/pull/6485
+    DISABLE_LOGPROBS_DURING_SPEC_DECODING=false
+
 USER 2000
 ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter", "--uvicorn-log-level=warning"]