enable automatic batch size and max tokens tuning

guidance-ai · Jan 3, 2025 · 4de7df1
1 parent 86ceb85
commit 4de7df1
Show file tree

Hide file tree

Showing 4 changed files with 22 additions and 5 deletions.
diff --git a/llgtrt/src/config.rs b/llgtrt/src/config.rs
@@ -39,6 +39,14 @@ pub struct TrtLlmRuntimeConfig {
 
     /// Host memory to use for KV cache
     pub kv_cache_host_memory_megabytes: usize,
+
+    /// Controls automatic tuning of the batch size.
+    /// Defaults to true (unlike trtllm, where it defaults to false).
+    pub enable_batch_size_tuning: bool,
+
+    /// Controls automatic tuning of max num tokens.
+    /// Defaults to true (unlike trtllm, where it defaults to false).
+    pub enable_max_num_tokens_tuning: bool,
 }
 
 impl Default for TrtLlmRuntimeConfig {
@@ -52,6 +60,8 @@ impl Default for TrtLlmRuntimeConfig {
             enable_kv_cache_reuse: true,
             kv_cache_free_gpu_mem_fraction: 0.9,
             kv_cache_host_memory_megabytes: 0,
+            enable_batch_size_tuning: true,
+            enable_max_num_tokens_tuning: true,
         }
     }
 }

diff --git a/llgtrt/src/startup.rs b/llgtrt/src/startup.rs
@@ -135,6 +135,8 @@ pub async fn run_server(mut cli_config: CliConfig) -> anyhow::Result<()> {
 
     set_field!(enable_chunked_context);
     set_field!(enable_kv_cache_reuse);
+    set_field!(enable_batch_size_tuning);
+    set_field!(enable_max_num_tokens_tuning);
     set_field!(max_batch_size);
     set_field!(max_num_tokens);
     set_field!(max_queue_size);

diff --git a/trtllm-c/main.cpp b/trtllm-c/main.cpp
@@ -92,11 +92,12 @@ TlcStatus tlc_init(TlcInitParams const* params, TlcExecutor** res)
                 : std::nullopt,
             ep->kv_cache_host_memory_bytes, ep->kv_cache_onboard_blocks);
 
-        auto schedulerConfig
-            = tle::SchedulerConfig(ep->guaranteed_no_evict ? tle::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT
-                                                           : tle::CapacitySchedulerPolicy::kMAX_UTILIZATION
-                // tle::ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED // default?
-            );
+        tle::DynamicBatchConfig dynamicBatchConfig(ep->enable_batch_size_tuning, ep->enable_max_num_tokens_tuning);
+
+        auto policy = ep->guaranteed_no_evict ? tle::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT
+                                              : tle::CapacitySchedulerPolicy::kMAX_UTILIZATION;
+        auto chunking = tle::ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED; // default?
+        auto schedulerConfig = tle::SchedulerConfig(policy, chunking, dynamicBatchConfig);
 
         executorConfig.setKvCacheConfig(kvConfig);
         executorConfig.setSchedulerConfig(schedulerConfig);

diff --git a/trtllm-c/tlc.h b/trtllm-c/tlc.h
@@ -75,6 +75,10 @@ extern "C"
         int32_t sink_token_length;
         // defaults to false (prefix caching)
         bool enable_kv_cache_reuse;
+
+        // both default to false in trtllm (the llgtrt Rust config enables them by default)
+        bool enable_batch_size_tuning;
+        bool enable_max_num_tokens_tuning;
     } TlcEngineParams;
 
     typedef struct