Skip to content

Commit

Permalink
Update TensorRT-LLM (#1891)
Browse files Browse the repository at this point in the history
* Update TensorRT-LLM

---------

Co-authored-by: Marks101 <markus.schnoes@gmx.de>
Co-authored-by: lkm2835 <lkm2835@gmail.com>
  • Loading branch information
3 people authored Jul 4, 2024
1 parent 9691e12 commit 9dbc5b3
Show file tree
Hide file tree
Showing 216 changed files with 6,178 additions and 3,563 deletions.
7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ TensorRT-LLM
[![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/)
[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
[![cuda](https://img.shields.io/badge/cuda-12.4.1-green)](https://developer.nvidia.com/cuda-downloads)
[![trt](https://img.shields.io/badge/TRT-10.0.1-green)](https://developer.nvidia.com/tensorrt)
[![trt](https://img.shields.io/badge/TRT-10.1.0-green)](https://developer.nvidia.com/tensorrt)
[![version](https://img.shields.io/badge/release-0.11.0.dev-green)](./tensorrt_llm/version.py)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

Expand All @@ -17,7 +17,10 @@ TensorRT-LLM
<div align="left">

## Latest News
* [*Weekly*] Check out **[@NVIDIAAIDev](https://twitter.com/nvidiaaidev?lang=en)** & **[NVIDIA AI](https://www.linkedin.com/showcase/nvidia-ai/)** LinkedIn for the latest updates!
* [2024/07/02] Let the @MistralAI MoE tokens fly 📈 🚀 #Mixtral 8x7B with NVIDIA #TensorRT #LLM on #H100.
[➡️ Tech blog](https://developer.nvidia.com/blog/achieving-high-mixtral-8x7b-performance-with-nvidia-h100-tensor-core-gpus-and-tensorrt-llm?ncid=so-twit-928467)
![Example Image](docs/source/media/picture-07-02-2024.png)

* [2024/02/06] [🚀 Speed up inference with SOTA quantization techniques in TRT-LLM](./docs/source/blogs/quantization-in-TRT-LLM.md)
* [2024/01/30] [New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget](./docs/source/blogs/XQA-kernel.md)
* [2023/12/04] [Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100](./docs/source/blogs/Falcon180B-H200.md)
Expand Down
122 changes: 76 additions & 46 deletions benchmarks/cpp/gptManagerBenchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,9 @@ struct BenchmarkParams
bool streaming{false};
bool enableExpDelays{false};
std::optional<float> requestRate{std::nullopt};
std::optional<int> concurrency{std::nullopt};
std::optional<SizeType32> maxBatchSize{std::nullopt};
std::optional<SizeType32> maxNumTokens{std::nullopt};
int randomSeed = 430;
std::optional<int> maxAttentionWindow{std::nullopt};

Expand Down Expand Up @@ -773,7 +775,9 @@ class ExecutorServer
: mRecorder(std::move(recorder))
, mWaitSleep(waitSleep)
, mStaticEmulatedBatchSize(staticEmulatedBatchSize)
, mConcurrency(benchmarkParams.concurrency)
, mActiveCount(0)
, mNumFinished(0)
, mShutdown(false)
{

Expand All @@ -793,6 +797,10 @@ class ExecutorServer
{
executorConfig.setMaxBatchSize(benchmarkParams.maxBatchSize.value());
}
if (benchmarkParams.maxNumTokens)
{
executorConfig.setMaxNumTokens(benchmarkParams.maxNumTokens.value());
}

executorConfig.setDecodingConfig(texec::DecodingConfig(
benchmarkParams.medusaChoices.has_value() ? texec::DecodingMode::Medusa() : texec::DecodingMode::Auto(),
Expand Down Expand Up @@ -843,10 +851,19 @@ class ExecutorServer
}
}

/// Sets the counter of finished requests back to zero.
void resetNumFinished()
{
    mNumFinished.store(0);
}

/// Reports whether one more request may be submitted.
///
/// @param numSentRequests Number of requests enqueued so far.
/// @return true when no concurrency limit is configured, or when the number of
///         outstanding requests (sent minus finished) is below that limit.
bool canEnqueue(int numSentRequests) const
{
    if (!mConcurrency.has_value())
    {
        return true;
    }
    // mNumFinished is an unsigned 64-bit atomic, so both the subtraction and the
    // comparison against the limit are evaluated in unsigned 64-bit arithmetic.
    auto const numOutstanding = static_cast<uint64_t>(numSentRequests) - mNumFinished.load();
    return numOutstanding < static_cast<uint64_t>(mConcurrency.value());
}

void waitForResponses(SizeType32 numRequests, bool warmup = false)
{
SizeType32 numFinished = 0;
while (mActiveCount || (numFinished < numRequests))
while (mActiveCount || (mNumFinished < numRequests))
{
auto responses = mExecutor->awaitResponses(mWaitSleep);
for (auto const& response : responses)
Expand All @@ -856,7 +873,7 @@ class ExecutorServer
if (response.getResult().isFinal)
{
mActiveCount--;
numFinished++;
mNumFinished++;
if (!warmup)
{
mRecorder->recordEnd(reqId, response);
Expand All @@ -873,7 +890,7 @@ class ExecutorServer
}
}

void collectStats()
void collectStats() const
{
while (!mShutdown)
{
Expand All @@ -893,7 +910,9 @@ class ExecutorServer
std::shared_ptr<Recorder> mRecorder;
std::chrono::milliseconds mWaitSleep;
std::optional<int> mStaticEmulatedBatchSize;
std::optional<int> mConcurrency;
std::atomic<uint64_t> mActiveCount;
std::atomic<uint64_t> mNumFinished;
std::atomic<bool> mShutdown;
}; // class ExecutorServer

Expand All @@ -914,9 +933,7 @@ class GptServer
, mInferReqSyncSndHdl(nullptr)
{
auto const jsonConfig = GptJsonConfig::parse(trtEnginePath / "config.json");
SizeType32 deviceCount{0};
TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
mWorldConfig = WorldConfig::mpi(deviceCount, jsonConfig.getTensorParallelism(),
mWorldConfig = WorldConfig::mpi(jsonConfig.getGpusPerNode(), jsonConfig.getTensorParallelism(),
jsonConfig.getPipelineParallelism(), optionalParams.deviceIds);
auto& comm = COMM_SESSION;
mCommTensorParallel = std::make_shared<tensorrt_llm::mpi::MpiComm>(
Expand Down Expand Up @@ -1352,16 +1369,15 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
optionalParams.gpuWeightsPercent = benchmarkParams.gpuWeightsPercent;
optionalParams.maxBeamWidth = beamWidth;
optionalParams.maxBatchSize = benchmarkParams.maxBatchSize;
optionalParams.maxNumTokens = benchmarkParams.maxNumTokens;
optionalParams.schedulerConfig = texec::SchedulerConfig{capacitySchedulerPolicy};
optionalParams.decodingConfig = texec::DecodingConfig(
benchmarkParams.medusaChoices.has_value() ? texec::DecodingMode::Medusa() : texec::DecodingMode::Auto(),
std::nullopt, benchmarkParams.medusaChoices);

auto const jsonConfig = GptJsonConfig::parse(engineDir / "config.json");
SizeType32 deviceCount{0};
TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
auto const worldConfig = WorldConfig::mpi(
deviceCount, jsonConfig.getTensorParallelism(), jsonConfig.getPipelineParallelism(), optionalParams.deviceIds);
auto const worldConfig = WorldConfig::mpi(jsonConfig.getGpusPerNode(), jsonConfig.getTensorParallelism(),
jsonConfig.getPipelineParallelism(), optionalParams.deviceIds);

BufferManager bufferManager{std::make_shared<CudaStream>()}; // the stream is not used

Expand Down Expand Up @@ -1551,53 +1567,49 @@ void benchmarkExecutor(std::filesystem::path const& engineDir, TrtGptModelType m
benchmarkParams.streaming, returnContextLogits, returnGenerationLogits, loraConfig));
}

bool hasDelay
bool const hasDelay
= std::any_of(timeDelays.begin(), timeDelays.end(), [](auto const& delay) { return delay > 0.0; });
if (hasDelay && staticEmulatedBatchSize)
executorServer->resetNumFinished();
if (!staticEmulatedBatchSize)
{
TLLM_THROW("Executor benchmark doesn't support delays with emulated static batch sizes");
}
// Launch a thread that will wait for responses
std::thread waitThread(
[numSamples, executorServer]() { executorServer->waitForResponses(numSamples); });

if (!hasDelay)
{
if (!staticEmulatedBatchSize)
{
executorServer->enqueue(std::move(requests));
executorServer->waitForResponses(numSamples);
}
else
// Enqueue requests one by one
int numSentRequests = 0;
while (numSentRequests < numSamples)
{
SizeType32 numRequests = requests.size();
SizeType32 maxBatchSize = staticEmulatedBatchSize.value();
for (SizeType32 req = 0; req < numRequests; req += maxBatchSize)
if (executorServer->canEnqueue(numSentRequests))
{
auto batchSize = std::min(maxBatchSize, numRequests - req);

std::vector<texec::Request> requestsBatch(std::make_move_iterator(requests.begin() + req),
std::make_move_iterator(requests.begin() + req + batchSize));
// Enqueue in batches
executorServer->enqueue(std::move(requestsBatch));
// Wait for current batch to be done
executorServer->waitForResponses(batchSize);
executorServer->enqueue({requests.at(numSentRequests)});
if (hasDelay && numSentRequests < numSamples - 1)
{
std::this_thread::sleep_for(
std::chrono::milliseconds(static_cast<int>(timeDelays.at(numSentRequests) * 1000)));
}
numSentRequests += 1;
}
}
waitThread.join();
}
else
{
// Launch a thread that will wait for responses
std::thread waitThread(
[numSamples, executorServer]() { executorServer->waitForResponses(numSamples); });
// Enqueue requests one by one
for (std::size_t i = 0; i < numSamples; ++i)
TLLM_CHECK_WITH_INFO(
!hasDelay, "Executor benchmark doesn't support delays with emulated static batch sizes");
SizeType32 numRequests = requests.size();
SizeType32 maxBatchSize = staticEmulatedBatchSize.value();
for (SizeType32 req = 0; req < numRequests; req += maxBatchSize)
{
executorServer->enqueue({std::move(requests.at(i))});
if (i < numSamples - 1)
{
std::this_thread::sleep_for(
std::chrono::milliseconds(static_cast<int>(timeDelays.at(i) * 1000)));
}
auto batchSize = std::min(maxBatchSize, numRequests - req);

std::vector<texec::Request> requestsBatch(std::make_move_iterator(requests.begin() + req),
std::make_move_iterator(requests.begin() + req + batchSize));
// Enqueue in batches
executorServer->enqueue(std::move(requestsBatch));
// Wait for current batch to be done
executorServer->waitForResponses(batchSize);
}
waitThread.join();
}
}
recorder->finalize();
Expand Down Expand Up @@ -1670,7 +1682,10 @@ int main(int argc, char* argv[])
options.add_options()("request_rate",
"request rate in reqs/sec. Skipping this arg or negative value will trigger offline/0-delay.",
cxxopts::value<float>());
options.add_options()("concurrency", "Concurrent number of connections with the server.", cxxopts::value<int>());
options.add_options()("max_batch_size", "The max runtime batch size when benchmarking", cxxopts::value<int>());
options.add_options()(
"max_num_tokens", "The max runtime number of tokens per batch when benchmarking", cxxopts::value<int>());
options.add_options()("enable_trt_overlap", "Overlap TRT context preparation and execution",
cxxopts::value<bool>()->default_value("false"));
options.add_options()("enable_exp_delays", "Enables exponential delay distr to mimic real world request arrival",
Expand Down Expand Up @@ -1816,18 +1831,33 @@ int main(int argc, char* argv[])
// Argument: streaming
benchmarkParams.streaming = result["streaming"].as<bool>();

TLLM_CHECK_WITH_INFO(!(result.count("request_rate") && result.count("concurrency")),
"request_rate and concurrency cannot be specified at the same time.");

// Argument: request rate
if (result.count("request_rate"))
{
benchmarkParams.requestRate = result["request_rate"].as<float>();
}

// Argument: concurrency
if (result.count("concurrency"))
{
benchmarkParams.concurrency = result["concurrency"].as<int>();
}

// Argument: max batch size
if (result.count("max_batch_size"))
{
benchmarkParams.maxBatchSize = result["max_batch_size"].as<int>();
}

// Argument: max num tokens
if (result.count("max_num_tokens"))
{
benchmarkParams.maxNumTokens = result["max_num_tokens"].as<int>();
}

benchmarkParams.enableExpDelays = result["enable_exp_delays"].as<bool>();

// Argument: Enable batch stats output
Expand Down
5 changes: 2 additions & 3 deletions benchmarks/cpp/gptSessionBenchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,8 @@ void benchmarkGptSession(std::filesystem::path const& dataPath, std::vector<int>
auto const json = GptJsonConfig::parse(jsonFileName);
auto const modelConfig = json.getModelConfig();
auto const inputPacked = modelConfig.usePackedInput();
SizeType32 deviceCount{0};
TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
auto const worldConfig = WorldConfig::mpi(deviceCount, json.getTensorParallelism(), json.getPipelineParallelism());
auto const worldConfig
= WorldConfig::mpi(json.getGpusPerNode(), json.getTensorParallelism(), json.getPipelineParallelism());
auto& comm = COMM_SESSION;
auto const enginePath = dataPath / json.engineFilename(worldConfig);
auto const dtype = modelConfig.getDataType();
Expand Down
4 changes: 0 additions & 4 deletions benchmarks/python/allowed_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ class BuildConfig:
type_vocab_size: Optional[int] = None
pre_norm: Optional[bool] = None
do_layer_norm_before: Optional[bool] = None
enable_qk_half_accum: bool = False
enable_context_fmha: bool = True
enable_multi_block_mode: bool = False
# The enum name of PositionEmbeddingType
Expand Down Expand Up @@ -651,7 +650,6 @@ class ModelConfig:
max_batch_size=256,
max_input_len=512,
builder_opt=None,
enable_qk_half_accum=False,
enable_context_fmha=False,
)),
"bert_large":
Expand All @@ -669,7 +667,6 @@ class ModelConfig:
max_batch_size=64,
max_input_len=512,
builder_opt=None,
enable_qk_half_accum=False,
enable_context_fmha=False,
)),
"roberta_base":
Expand All @@ -687,7 +684,6 @@ class ModelConfig:
max_batch_size=64,
max_input_len=512,
builder_opt=None,
enable_qk_half_accum=False,
enable_context_fmha=False,
)),
"falcon_rw_1b":
Expand Down
31 changes: 0 additions & 31 deletions benchmarks/python/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,21 +264,6 @@ def build_gpt(args):
max_input_len = build_config['max_input_len'] \
if args.max_input_len is None else args.max_input_len

if args.max_output_len:
logger.warning(
'--max_output_len has been deprecated in favor of --max_seq_len')
if args.max_input_len:
if args.max_seq_len:
logger.warning(
'--max_seq_len has been overwritten due to --max_output_len being specified'
)
args.max_seq_len = args.max_input_len + args.max_output_len
else:
raise Exception(
f"max_output_len is specified but not max_input_len")

del args.max_output_len

max_seq_len = build_config['max_seq_len'] \
if args.max_seq_len is None else args.max_seq_len
max_beam_width = build_config['max_beam_width'] \
Expand Down Expand Up @@ -1113,7 +1098,6 @@ def build_bert(args):
if args.mode == 'plugin':
network.plugin_config.bert_attention_plugin = args.dtype
network.plugin_config.gemm_plugin = args.dtype
network.plugin_config.attention_qk_half_accumulation = True
network.plugin_config.set_context_fmha(ContextFMHAType.enabled)
elif args.mode == 'ootb-except-mha':
network.plugin_config.bert_attention_plugin = args.dtype
Expand Down Expand Up @@ -1573,21 +1557,6 @@ def build_enc_dec(args):
if args.max_input_len is None else args.max_input_len
build_config['max_decoder_input_len'] = 1

if args.max_output_len:
logger.warning(
'--max_output_len has been deprecated in favor of --max_seq_len')
if args.max_input_len:
if args.max_seq_len:
logger.warning(
'--max_seq_len has been overwritten due to --max_output_len being specified'
)
args.max_seq_len = args.max_input_len + args.max_output_len
else:
raise Exception(
f"max_output_len is specified but not max_input_len")

del args.max_output_len

build_config['max_seq_len'] = build_config['max_seq_len'] \
if args.max_seq_len is None else args.max_seq_len
build_config[
Expand Down
9 changes: 4 additions & 5 deletions benchmarks/suite/tensorrt_llm_bench/benchmarkers/static.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import platform
from pathlib import Path
from subprocess import CompletedProcess
from typing import Dict, List
Expand Down Expand Up @@ -143,11 +144,9 @@ def benchmark(self):
"""Benchmarks a TRT-LLM for a configured instance."""

# Compile the command for running
cmd = [
"mpirun",
"-allow-run-as-root",
"-n",
self.config.world_size,
cmd = ["mpiexec", "-n", self.config.world_size]
cmd += ["-allow-run-as-root"] if platform.system() != "Windows" else ""
cmd += [
self.gpt_session_path,
"--engine_dir",
self.config.engine_path,
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/suite/tensorrt_llm_bench/ifb.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ def executor_benchmark(
# the
logger.info("Launching benchmark...")
bench_cmd = \
["mpirun", "-n", f"{benchmark_cfg.world_size}", "python"] + \
["mpiexec", "-n", f"{benchmark_cfg.world_size}", "python"] + \
sys.argv + ["--run"]
process = subprocess.Popen(
bench_cmd,
Expand Down
Loading

0 comments on commit 9dbc5b3

Please sign in to comment.