@@ -16,12 +16,13 @@
from truss.base.constants import (
    BASE_SERVER_REQUIREMENTS_TXT_FILENAME,
    BASE_TRTLLM_REQUIREMENTS,
+    BEI_MAX_CONCURRENCY_TARGET_REQUESTS,
+    BEI_TRTLLM_BASE_IMAGE,
+    BEI_TRTLLM_CLIENT_BATCH_SIZE,
+    BEI_TRTLLM_PYTHON_EXECUTABLE,
    CHAINS_CODE_DIR,
    CONTROL_SERVER_CODE_DIR,
    DOCKER_SERVER_TEMPLATES_DIR,
-    ENCODER_TRTLLM_BASE_IMAGE,
-    ENCODER_TRTLLM_CLIENT_BATCH_SIZE,
-    ENCODER_TRTLLM_PYTHON_EXECUTABLE,
    FILENAME_CONSTANTS_MAP,
    MAX_SUPPORTED_PYTHON_VERSION_IN_CUSTOM_BASE_IMAGE,
    MIN_SUPPORTED_PYTHON_VERSION_IN_CUSTOM_BASE_IMAGE,
@@ -369,34 +370,36 @@ def _copy_into_build_dir(
    ):
        copy_tree_or_file(from_path, build_dir / path_in_build_dir)  # type: ignore[operator]

-    def prepare_trtllm_encoder_build_dir(self, build_dir: Path):
-        """prepares the build directory for a trtllm ENCODER model"""
+    def prepare_trtllm_bei_encoder_build_dir(self, build_dir: Path):
+        """prepares the build directory for a trtllm ENCODER model to launch a Baseten Embeddings Inference (BEI) server"""
        config = self._spec.config
        assert (
            config.trt_llm
            and config.trt_llm.build
            and config.trt_llm.build.base_model == TrussTRTLLMModel.ENCODER
        ), (
-            "prepare_trtllm_encoder_build_dir should only be called for encoder tensorrt-llm model"
+            "prepare_trtllm_bei_encoder_build_dir should only be called for ENCODER tensorrt-llm model"
        )
        # TRTLLM has performance degradation with batch size >> 32, so we limit the runtime settings
        # runtime batch size may not be higher than what the build settings of the model allow
        # to 32 even if the engine.rank0 allows for higher batch_size
        runtime_max_batch_size = min(config.trt_llm.build.max_batch_size, 32)
        port = 7997
-        start_command = (
-            f"python-truss-download && text-embeddings-router "
-            f"--port {port} "
-            f"--max-batch-requests {runtime_max_batch_size} "
-            # how many sentences can be in a single json payload.
-            # limited default to improve request based autoscaling.
-            f"--max-client-batch-size {ENCODER_TRTLLM_CLIENT_BATCH_SIZE} "
-            # how many concurrent requests can be handled by the server until 429 is returned.
-            # limited by https://docs.baseten.co/performance/concurrency#concurrency-target
-            # 16384 is a safe max value for the server
-            f"--max-concurrent-requests 16384"
-            # downloaded model path by `python-truss-download` cmd
-            "--model-id /app/data/tokenization"
+        start_command = " ".join(
+            [
+                "python-truss-download && text-embeddings-router",
+                f"--port {port}",
+                f"--max-batch-requests {runtime_max_batch_size}",
+                # how many sentences can be in a single json payload.
+                # limited default to improve request based autoscaling.
+                f"--max-client-batch-size {BEI_TRTLLM_CLIENT_BATCH_SIZE}",
+                # how many concurrent requests can be handled by the server until 429 is returned.
+                # limited by https://docs.baseten.co/performance/concurrency#concurrency-target
+                # 2048 is a safe max value for the server
+                f"--max-concurrent-requests {BEI_MAX_CONCURRENCY_TARGET_REQUESTS}",
+                # downloaded model path by `python-truss-download` cmd
+                "--model-id /app/data/tokenization",
+            ]
        )
        self._spec.config.docker_server = DockerServer(
            start_command=f"/bin/sh -c '{start_command}'",
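Why the `" ".join` refactor matters: with the old implicit f-string concatenation, the final flag had no trailing space, so "--max-concurrent-requests 16384" and "--model-id ..." were glued into one token. A minimal standalone sketch of the difference, using 2048 only because the diff's own comment calls it the safe concurrency target; everything else mirrors the removed and added lines above and is not part of the actual diff:

# Standalone sketch, not part of the diff above.
BEI_MAX_CONCURRENCY_TARGET_REQUESTS = 2048  # per the "safe max" comment in the diff

# Old style: adjacent string literals concatenate with no separator, and the last flag
# carried no trailing space, so the two flags run together.
old_tail = (
    "--max-concurrent-requests 16384"
    "--model-id /app/data/tokenization"
)
# New style: " ".join() inserts exactly one space between elements, so no element has
# to remember its own trailing space.
new_tail = " ".join(
    [
        f"--max-concurrent-requests {BEI_MAX_CONCURRENCY_TARGET_REQUESTS}",
        "--model-id /app/data/tokenization",
    ]
)
print(old_tail)  # --max-concurrent-requests 16384--model-id /app/data/tokenization
print(new_tail)  # --max-concurrent-requests 2048 --model-id /app/data/tokenization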
@@ -408,12 +411,12 @@ def prepare_trtllm_encoder_build_dir(self, build_dir: Path):
        copy_tree_path(DOCKER_SERVER_TEMPLATES_DIR, build_dir, ignore_patterns=[])

        config.base_image = BaseImage(
-            image=ENCODER_TRTLLM_BASE_IMAGE,
-            python_executable_path=ENCODER_TRTLLM_PYTHON_EXECUTABLE,
+            image=BEI_TRTLLM_BASE_IMAGE,
+            python_executable_path=BEI_TRTLLM_PYTHON_EXECUTABLE,
        )

    def prepare_trtllm_decoder_build_dir(self, build_dir: Path):
-        """prepares the build directory for a trtllm decoder-like modes"""
+        """prepares the build directory for a trtllm decoder-like models to launch BRITON server"""
        config = self._spec.config
        assert (
            config.trt_llm
@@ -481,7 +484,7 @@ def prepare_image_build_dir(
        ):
            if config.trt_llm.build.base_model == TrussTRTLLMModel.ENCODER:
                # Run the specific encoder build
-                self.prepare_trtllm_encoder_build_dir(build_dir=build_dir)
+                self.prepare_trtllm_bei_encoder_build_dir(build_dir=build_dir)
            else:
                self.prepare_trtllm_decoder_build_dir(build_dir=build_dir)
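Putting the pieces together: on the ENCODER path, the build now wraps the joined router command in `/bin/sh -c` and hands it to DockerServer on top of the BEI base image. A sketch of the resulting start command, assuming an engine built with max_batch_size >= 32 and assuming BEI_TRTLLM_CLIENT_BATCH_SIZE resolves to 32 (an illustrative guess; the real value lives in truss.base.constants and is not shown in this diff). Port and model path are taken verbatim from the diff, and 2048 again comes from its comment:

# Sketch of the command DockerServer ends up running for a BEI encoder deployment.
expected_start_command = (
    "/bin/sh -c 'python-truss-download && text-embeddings-router "
    "--port 7997 --max-batch-requests 32 --max-client-batch-size 32 "
    "--max-concurrent-requests 2048 --model-id /app/data/tokenization'"
)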