diff --git a/Dockerfile.ubi b/Dockerfile.ubi
index e185ac549f51..5308d690015c 100644
--- a/Dockerfile.ubi
+++ b/Dockerfile.ubi
@@ -181,7 +181,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp311-cp311-linux_x86_64.whl
 
 ENV HF_HUB_OFFLINE=1 \
-    PORT=8000 \
     HOME=/home/vllm \
     # Allow requested max length to exceed what is extracted from the
     # config.json
@@ -208,6 +207,13 @@ USER root
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install vllm-tgis-adapter==0.3.0
 
-ENV GRPC_PORT=8033
+ENV GRPC_PORT=8033 \
+    PORT=8000 \
+    # vLLM turns logprobs off by default while speculative decoding is
+    # active, as an optimization. Users of a hosted model that happens to
+    # use speculative decoding would not expect that, so set the flag to
+    # false. See: https://github.com/vllm-project/vllm/pull/6485
+    DISABLE_LOGPROBS_DURING_SPEC_DECODING=false
+
 USER 2000
 ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter", "--uvicorn-log-level=warning"]