Skip to content

Commit

Permalink
enable automatic batch size and max tokens tuning
Browse files Browse the repository at this point in the history
  • Loading branch information
mmoskal committed Jan 3, 2025
1 parent 86ceb85 commit 4de7df1
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 5 deletions.
10 changes: 10 additions & 0 deletions llgtrt/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,14 @@ pub struct TrtLlmRuntimeConfig {

/// Host memory to use for KV cache
pub kv_cache_host_memory_megabytes: usize,

/// Control automatic tuning of batch size
/// Defaults to true (unlike trtllm)
pub enable_batch_size_tuning: bool,

/// Control automatic tuning of max num tokens
/// Defaults to true (unlike trtllm)
pub enable_max_num_tokens_tuning: bool,
}

impl Default for TrtLlmRuntimeConfig {
Expand All @@ -52,6 +60,8 @@ impl Default for TrtLlmRuntimeConfig {
enable_kv_cache_reuse: true,
kv_cache_free_gpu_mem_fraction: 0.9,
kv_cache_host_memory_megabytes: 0,
enable_batch_size_tuning: true,
enable_max_num_tokens_tuning: true,
}
}
}
Expand Down
2 changes: 2 additions & 0 deletions llgtrt/src/startup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,8 @@ pub async fn run_server(mut cli_config: CliConfig) -> anyhow::Result<()> {

set_field!(enable_chunked_context);
set_field!(enable_kv_cache_reuse);
set_field!(enable_batch_size_tuning);
set_field!(enable_max_num_tokens_tuning);
set_field!(max_batch_size);
set_field!(max_num_tokens);
set_field!(max_queue_size);
Expand Down
11 changes: 6 additions & 5 deletions trtllm-c/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,11 +92,12 @@ TlcStatus tlc_init(TlcInitParams const* params, TlcExecutor** res)
: std::nullopt,
ep->kv_cache_host_memory_bytes, ep->kv_cache_onboard_blocks);

auto schedulerConfig
= tle::SchedulerConfig(ep->guaranteed_no_evict ? tle::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT
: tle::CapacitySchedulerPolicy::kMAX_UTILIZATION
// tle::ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED // default?
);
tle::DynamicBatchConfig dynamicBatchConfig(ep->enable_batch_size_tuning, ep->enable_max_num_tokens_tuning);

auto policy = ep->guaranteed_no_evict ? tle::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT
: tle::CapacitySchedulerPolicy::kMAX_UTILIZATION;
auto chunking = tle::ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED; // default?
auto schedulerConfig = tle::SchedulerConfig(policy, chunking, dynamicBatchConfig);

executorConfig.setKvCacheConfig(kvConfig);
executorConfig.setSchedulerConfig(schedulerConfig);
Expand Down
4 changes: 4 additions & 0 deletions trtllm-c/tlc.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@ extern "C"
int32_t sink_token_length;
// defaults to false (prefix caching)
bool enable_kv_cache_reuse;

// both default to false in TensorRT-LLM itself; llgtrt's runtime config sets them to true unless overridden
bool enable_batch_size_tuning;
bool enable_max_num_tokens_tuning;
} TlcEngineParams;

typedef struct
Expand Down

0 comments on commit 4de7df1

Please sign in to comment.