|
51 | 51 |
|
52 | 52 | logger = init_logger(__name__)
|
53 | 53 |
|
| 54 | +# This value is chosen to balance inter-token latency (ITL) against |
| 55 | +# time to first token (TTFT). Note that it is not optimized for throughput. |
| 56 | +_DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048 |
54 | 57 | _POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
|
55 | 58 | _MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120
|
56 | 59 |
|
@@ -1526,15 +1529,17 @@ def __post_init__(self) -> None:
|
1526 | 1529 | # for now. Have max_num_batched_tokens set to max_model_len
|
1527 | 1530 | # so we don't reject sequences on account of a short
|
1528 | 1531 | # max_num_batched_tokens.
|
1529 |
| - self.max_num_batched_tokens = max(self.max_model_len, 2048) |
| 1532 | + self.max_num_batched_tokens = max( |
| 1533 | + self.max_model_len, _DEFAULT_MAX_NUM_BATCHED_TOKENS) |
1530 | 1534 | else:
|
1531 |
| - # This value is chosen to have a balance between ITL |
1532 |
| - # and TTFT. Note it is not optimized for throughput. |
1533 |
| - self.max_num_batched_tokens = 2048 |
| 1535 | + self.max_num_batched_tokens = ( |
| 1536 | + _DEFAULT_MAX_NUM_BATCHED_TOKENS) |
1534 | 1537 | else:
|
1535 |
| - # If max_model_len is too short, use 2048 as the default value |
| 1538 | + # If max_model_len is too short, use |
| 1539 | + # _DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value |
1536 | 1540 | # for higher throughput.
|
1537 |
| - self.max_num_batched_tokens = max(self.max_model_len, 2048) |
| 1541 | + self.max_num_batched_tokens = max( |
| 1542 | + self.max_model_len, _DEFAULT_MAX_NUM_BATCHED_TOKENS) |
1538 | 1543 |
|
1539 | 1544 | if self.runner_type == "pooling":
|
1540 | 1545 | # Choose specific value for higher throughput
|
@@ -3333,6 +3338,9 @@ def __post_init__(self):
|
3333 | 3338 | "caching to be disabled.")
|
3334 | 3339 | self.scheduler_config.enable_chunked_prefill = False
|
3335 | 3340 | self.scheduler_config.chunked_prefill_enabled = False
|
| 3341 | + self.scheduler_config.max_num_batched_tokens = max( |
| 3342 | + self.scheduler_config.max_model_len, |
| 3343 | + _DEFAULT_MAX_NUM_BATCHED_TOKENS) |
3336 | 3344 |
|
3337 | 3345 | if self.cache_config is not None:
|
3338 | 3346 | self.cache_config.enable_prefix_caching = False
|
|
0 commit comments