Skip to content

Commit 01958b3

Browse files
authored
truss context builder start command (#1312)
* add missing whitespace in command * update max concurrent requests * start command * fmt * add magic constant and rename folder names to BEI * e release version 10 * bump version
1 parent bbba46b commit 01958b3

File tree

2 files changed

+31
-26
lines changed

2 files changed

+31
-26
lines changed

truss/base/constants.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,9 @@
2929
MIN_SUPPORTED_PYTHON_VERSION_IN_CUSTOM_BASE_IMAGE = "3.8"
3030

3131
TRTLLM_PREDICT_CONCURRENCY = 512
32-
ENCODER_TRTLLM_CLIENT_BATCH_SIZE = 128
32+
BEI_TRTLLM_CLIENT_BATCH_SIZE = 128
33+
BEI_MAX_CONCURRENCY_TARGET_REQUESTS = 2048
34+
3335
TRTLLM_MIN_MEMORY_REQUEST_GI = 24
3436
HF_MODELS_API_URL = "https://huggingface.co/api/models"
3537
HF_ACCESS_TOKEN_KEY = "hf_access_token"
@@ -110,8 +112,8 @@
110112
TRTLLM_BASE_IMAGE = "baseten/briton-server:v0.13.0-4fd8a10-5e5c3d7"
111113
TRTLLM_PYTHON_EXECUTABLE = "/usr/bin/python3"
112114
BASE_TRTLLM_REQUIREMENTS = ["briton==0.3.13.dev4"]
113-
ENCODER_TRTLLM_BASE_IMAGE = "baseten/trt_tei_prod:v0.0.9"
114-
ENCODER_TRTLLM_PYTHON_EXECUTABLE = "/usr/bin/python3"
115+
BEI_TRTLLM_BASE_IMAGE = "baseten/trt_tei_prod:v0.0.11"
116+
BEI_TRTLLM_PYTHON_EXECUTABLE = "/usr/bin/python3"
115117

116118
OPENAI_COMPATIBLE_TAG = "openai-compatible"
117119

truss/contexts/image_builder/serving_image_builder.py

+26-23
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,13 @@
1616
from truss.base.constants import (
1717
BASE_SERVER_REQUIREMENTS_TXT_FILENAME,
1818
BASE_TRTLLM_REQUIREMENTS,
19+
BEI_MAX_CONCURRENCY_TARGET_REQUESTS,
20+
BEI_TRTLLM_BASE_IMAGE,
21+
BEI_TRTLLM_CLIENT_BATCH_SIZE,
22+
BEI_TRTLLM_PYTHON_EXECUTABLE,
1923
CHAINS_CODE_DIR,
2024
CONTROL_SERVER_CODE_DIR,
2125
DOCKER_SERVER_TEMPLATES_DIR,
22-
ENCODER_TRTLLM_BASE_IMAGE,
23-
ENCODER_TRTLLM_CLIENT_BATCH_SIZE,
24-
ENCODER_TRTLLM_PYTHON_EXECUTABLE,
2526
FILENAME_CONSTANTS_MAP,
2627
MAX_SUPPORTED_PYTHON_VERSION_IN_CUSTOM_BASE_IMAGE,
2728
MIN_SUPPORTED_PYTHON_VERSION_IN_CUSTOM_BASE_IMAGE,
@@ -369,34 +370,36 @@ def _copy_into_build_dir(
369370
):
370371
copy_tree_or_file(from_path, build_dir / path_in_build_dir) # type: ignore[operator]
371372

372-
def prepare_trtllm_encoder_build_dir(self, build_dir: Path):
373-
"""prepares the build directory for a trtllm ENCODER model"""
373+
def prepare_trtllm_bei_encoder_build_dir(self, build_dir: Path):
374+
"""prepares the build directory for a trtllm ENCODER model to launch a Baseten Embeddings Inference (BEI) server"""
374375
config = self._spec.config
375376
assert (
376377
config.trt_llm
377378
and config.trt_llm.build
378379
and config.trt_llm.build.base_model == TrussTRTLLMModel.ENCODER
379380
), (
380-
"prepare_trtllm_encoder_build_dir should only be called for encoder tensorrt-llm model"
381+
"prepare_trtllm_bei_encoder_build_dir should only be called for ENCODER tensorrt-llm model"
381382
)
382383
# TRTLLM has performance degradation with batch size >> 32, so we limit the runtime settings
383384
# runtime batch size may not be higher than what the build settings of the model allow
384385
# to 32 even if the engine.rank0 allows for higher batch_size
385386
runtime_max_batch_size = min(config.trt_llm.build.max_batch_size, 32)
386387
port = 7997
387-
start_command = (
388-
f"python-truss-download && text-embeddings-router "
389-
f"--port {port} "
390-
f"--max-batch-requests {runtime_max_batch_size} "
391-
# how many sentences can be in a single json payload.
392-
# limited default to improve request based autoscaling.
393-
f"--max-client-batch-size {ENCODER_TRTLLM_CLIENT_BATCH_SIZE} "
394-
# how many concurrent requests can be handled by the server until 429 is returned.
395-
# limited by https://docs.baseten.co/performance/concurrency#concurrency-target
396-
# 16384 is a safe max value for the server
397-
f"--max-concurrent-requests 16384"
398-
# downloaded model path by `python-truss-download` cmd
399-
"--model-id /app/data/tokenization"
388+
start_command = " ".join(
389+
[
390+
"python-truss-download && text-embeddings-router",
391+
f"--port {port}",
392+
f"--max-batch-requests {runtime_max_batch_size}",
393+
# how many sentences can be in a single json payload.
394+
# limited default to improve request based autoscaling.
395+
f"--max-client-batch-size {BEI_TRTLLM_CLIENT_BATCH_SIZE}",
396+
# how many concurrent requests can be handled by the server until 429 is returned.
397+
# limited by https://docs.baseten.co/performance/concurrency#concurrency-target
398+
# 2048 is a safe max value for the server
399+
f"--max-concurrent-requests {BEI_MAX_CONCURRENCY_TARGET_REQUESTS}",
400+
# downloaded model path by `python-truss-download` cmd
401+
"--model-id /app/data/tokenization",
402+
]
400403
)
401404
self._spec.config.docker_server = DockerServer(
402405
start_command=f"/bin/sh -c '{start_command}'",
@@ -408,12 +411,12 @@ def prepare_trtllm_encoder_build_dir(self, build_dir: Path):
408411
copy_tree_path(DOCKER_SERVER_TEMPLATES_DIR, build_dir, ignore_patterns=[])
409412

410413
config.base_image = BaseImage(
411-
image=ENCODER_TRTLLM_BASE_IMAGE,
412-
python_executable_path=ENCODER_TRTLLM_PYTHON_EXECUTABLE,
414+
image=BEI_TRTLLM_BASE_IMAGE,
415+
python_executable_path=BEI_TRTLLM_PYTHON_EXECUTABLE,
413416
)
414417

415418
def prepare_trtllm_decoder_build_dir(self, build_dir: Path):
416-
"""prepares the build directory for a trtllm decoder-like modes"""
419+
"""prepares the build directory for a trtllm decoder-like models to launch BRITON server"""
417420
config = self._spec.config
418421
assert (
419422
config.trt_llm
@@ -481,7 +484,7 @@ def prepare_image_build_dir(
481484
):
482485
if config.trt_llm.build.base_model == TrussTRTLLMModel.ENCODER:
483486
# Run the specific encoder build
484-
self.prepare_trtllm_encoder_build_dir(build_dir=build_dir)
487+
self.prepare_trtllm_bei_encoder_build_dir(build_dir=build_dir)
485488
else:
486489
self.prepare_trtllm_decoder_build_dir(build_dir=build_dir)
487490

0 commit comments

Comments
 (0)