From 06083b3fbda7dfe06103b6e33ad672c320f4a8ba Mon Sep 17 00:00:00 2001
From: Varun Puri
Date: Thu, 7 Mar 2024 15:20:45 -0800
Subject: [PATCH] Add time to first token for llama runner (#2141)

Summary:
Add time to first generated token & other timing features. Since we now
measure time to first token, the token rate is reported both for the total
inference window and for the generated (non-prompt) tokens on their own:

* Model Load Time - just a timer around ET_CHECK_OK_OR_RETURN_ERROR(load());
* Total inference time - immediately after model load until the end of the
  inference loop
  * First token time - from immediately after the model load until the first
    generated (not prompt) token is printed
    * Prompt eval - (comparable to llama.cpp prompt_eval_time) prompt array
      allocation and tokenization; ends right before the inference loop starts
  * Remaining tokens - immediately after the first token is output until the
    end of the inference loop
  * Net eval time - (comparable to llama.cpp eval_time) total time spent
    generating tokens

To implement:
* Sample time - amount of time spent sampling per token (present in llama.cpp)

Differential Revision: D54223564
---
 examples/models/llama2/runner/runner.cpp | 117 +++++++++++++++++++----
 examples/models/llama2/runner/runner.h   |  30 ++++++
 2 files changed, 130 insertions(+), 17 deletions(-)

diff --git a/examples/models/llama2/runner/runner.cpp b/examples/models/llama2/runner/runner.cpp
index 38e9d77a0da..3b2d67d3e9b 100644
--- a/examples/models/llama2/runner/runner.cpp
+++ b/examples/models/llama2/runner/runner.cpp
@@ -14,6 +14,7 @@
 
 #include 
 #include 
+#include 
 
 #ifdef USE_ATEN_LIB
 #include 
@@ -161,8 +162,16 @@ Error Runner::generate(
   // Prepare the inputs.
   // Use ones-initialized inputs.
   ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null");
-  ET_CHECK_OK_OR_RETURN_ERROR(load());
+  if (!is_loaded()) {
+    timers_.model_load_start_ms = util::time_in_ms();
+    ET_CHECK_OK_OR_RETURN_ERROR(load());
+    timers_.model_load_end_ms = util::time_in_ms();
+  }
+
+  // First token time only measures the time it takes to encode the prompt and
+  // return a response token.
+  timers_.inference_start_ms = util::time_in_ms();
 
   shouldStop_ = false;
 
   // encode the (string) prompt into tokens sequence
@@ -179,6 +188,7 @@
       append_eos_ ? n_eos_ : 0,
       prompt_tokens,
       &num_prompt_tokens);
+
   for (int i = 0; i < num_prompt_tokens; i++) {
     ET_LOG(Info, "prompt_tokens[%d]: %d", i, prompt_tokens[i]);
   }
@@ -192,8 +202,6 @@
       "Sequence length exceeded - please increase the seq_len value passed to generate()");
 
   // start the main loop
-  long start =
-      0; // used to time our code, only initialized after first iteration
   int next; // will store the next token in the sequence
   int64_t pos = num_prompt_tokens - 1; // position in the sequence
   int token = prompt_tokens[pos]; // prefill starts from 0 to num_prompt_tokens
@@ -254,6 +262,7 @@
           tokenizer_->decode(prompt_tokens[i - 1], prompt_tokens[i])));
     }
   }
+
   // create a 1xN int tensor with next as value
   while (pos < seq_len) {
     // ET_LOG(Info, "Generating step %d...", pos);
@@ -289,10 +298,14 @@
         outputs.size() > 0,
        "Expecting output to have at least one evalue. Got %zu",
Got %zu", outputs.size()); - + if (pos == num_prompt_tokens) { + timers_.first_token_ms = util::time_in_ms(); + } else if (pos == num_prompt_tokens - 1) { + timers_.prompt_eval_end_ms = util::time_in_ms(); + } int32_t next_tok; exec_aten::Tensor logits_tensor = outputs.at(logits_index).toTensor(); - + long sample_start_time_ms = util::time_in_ms(); switch (logits_tensor.scalar_type()) { case ScalarType::Float: { next_tok = logitsToToken(logits_tensor, pos, 0); @@ -308,6 +321,8 @@ Error Runner::generate( "Unsupported dtype output %hhd", static_cast(logits_tensor.scalar_type())); } + timers_.aggregate_sampling_time_ms += + util::time_in_ms() - sample_start_time_ms; // advance the state machine if (pos < num_prompt_tokens - 1) { @@ -339,16 +354,13 @@ Error Runner::generate( // data-dependent terminating condition: we have n_eos_ number of EOS if (pos >= num_prompt_tokens && next == eos_id_) { - ET_LOG(Info, "Reached to the end of generation"); + printf("\n"); + ET_LOG(Info, "\nReached to the end of generation"); break; } token = next; - // init the timer here because the first iteration can be slower - if (start == 0) { - start = util::time_in_ms(); - } if (use_kv_cache_) { // outputs: [k_cache, v_cache, logits, k_cache, v_cache] memcpy( @@ -361,23 +373,94 @@ Error Runner::generate( v_data.size()); } } + timers_.inference_end_ms = util::time_in_ms(); printf("\n"); if (pos == seq_len) { ET_LOG(Info, "Sequence length (%i tokens) reached!", seq_len); } - // report achieved tok/s (pos-1 because the timer starts after first - // iteration) - if (pos >= 1) { - long end = util::time_in_ms(); - ET_LOG( - Info, "Achieved tok/s: %f\n", (pos - 1) / (double)(end - start) * 1000); - } + + timers_.printReport(num_prompt_tokens, pos - num_prompt_tokens); delete[] prompt_tokens; return Error::Ok; } +void Runner::TimeStamps::printReport( + const int64_t& num_prompt_tokens, + const int64_t& num_generated_tokens) { + ET_LOG( + Info, + "\tPrompt Tokens: %ld Generated Tokens: %ld", + num_prompt_tokens, + num_generated_tokens); + + ET_LOG( + Info, + "\tModel Load Time:\t\t%f (seconds)", + ((double)(model_load_end_ms - model_load_start_ms) / + SCALING_FACTOR_UNITS_PER_SECOND)); + double inference_time_ms = (double)(inference_end_ms - inference_start_ms); + ET_LOG( + Info, + "\tTotal inference time:\t\t%f (seconds)\t\t Rate: \t%f (tokens/second)", + inference_time_ms / SCALING_FACTOR_UNITS_PER_SECOND, + + (num_generated_tokens) / (double)(inference_end_ms - inference_start_ms) * + SCALING_FACTOR_UNITS_PER_SECOND); + double prompt_eval_time = (double)(prompt_eval_end_ms - inference_start_ms); + ET_LOG( + Info, + "\t\tPrompt evaluation:\t%f (seconds)\t\t Rate: \t%f (tokens/second)", + prompt_eval_time / SCALING_FACTOR_UNITS_PER_SECOND, + (num_prompt_tokens) / prompt_eval_time * SCALING_FACTOR_UNITS_PER_SECOND); + + double eval_time = (double)(inference_end_ms - prompt_eval_end_ms); + ET_LOG( + Info, + "\t\tGenerated %ld tokens:\t%f (seconds)\t\t Rate: \t%f (tokens/second)", + num_generated_tokens, + eval_time / SCALING_FACTOR_UNITS_PER_SECOND, + num_generated_tokens / eval_time * SCALING_FACTOR_UNITS_PER_SECOND); + + // Time to first token is measured from the start of inference, excluding + // model load time. 
+  ET_LOG(
+      Info,
+      "\tTime to first generated token:\t%f (seconds)",
+      ((double)(first_token_ms - inference_start_ms) /
+       SCALING_FACTOR_UNITS_PER_SECOND));
+
+  ET_LOG(
+      Info,
+      "\tSampling time over %ld tokens:\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
+      num_prompt_tokens + num_generated_tokens,
+      (double)aggregate_sampling_time_ms / SCALING_FACTOR_UNITS_PER_SECOND,
+      (num_prompt_tokens + num_generated_tokens) /
+          (double)aggregate_sampling_time_ms * SCALING_FACTOR_UNITS_PER_SECOND);
+
+  printf(
+      "[llama_runner_perf_data] %s\n",
+      toJsonString(num_prompt_tokens, num_generated_tokens).c_str());
+}
+
+const std::string Runner::TimeStamps::toJsonString(
+    const int64_t& num_prompt_tokens,
+    const int64_t& num_generated_tokens) {
+  std::stringstream ss;
+  ss << "{\"prompt_tokens\":" << num_prompt_tokens << ","
+     << "\"generated_tokens\":" << num_generated_tokens << ","
+     << "\"model_load_start_ms\":" << model_load_start_ms << ","
+     << "\"model_load_end_ms\":" << model_load_end_ms << ","
+     << "\"inference_start_ms\":" << inference_start_ms << ","
+     << "\"inference_end_ms\":" << inference_end_ms << ","
+     << "\"prompt_eval_end_ms\":" << prompt_eval_end_ms << ","
+     << "\"first_token_ms\":" << first_token_ms << ","
+     << "\"aggregate_sampling_time_ms\":" << aggregate_sampling_time_ms << ","
+     << "\"SCALING_FACTOR_UNITS_PER_SECOND\":"
+     << SCALING_FACTOR_UNITS_PER_SECOND << "}";
+  return ss.str();
+}
+
 void Runner::stop() {
   shouldStop_ = true;
 }
diff --git a/examples/models/llama2/runner/runner.h b/examples/models/llama2/runner/runner.h
index cc254cb5348..69aac699d4a 100644
--- a/examples/models/llama2/runner/runner.h
+++ b/examples/models/llama2/runner/runner.h
@@ -63,6 +63,36 @@ class Runner {
   std::unique_ptr tokenizer_;
   std::unique_ptr sampler_;
   bool shouldStop_{false};
+
+  struct TimeStamps {
+    // Scaling factor for timestamps - in this case, we use ms.
+    const long SCALING_FACTOR_UNITS_PER_SECOND = 1000;
+    // Time stamps for the different stages of the execution:
+    // model_load_start_ms: Start of model loading.
+    long model_load_start_ms;
+    // model_load_end_ms: End of model loading.
+    long model_load_end_ms;
+    // inference_start_ms: Immediately after the model is loaded (or the load
+    // check), when inference timing begins.
+    long inference_start_ms;
+    // prompt_eval_end_ms: Prompt array allocation and tokenization. Ends right
+    // before the inference loop starts.
+    long prompt_eval_end_ms;
+    // first_token_ms: Timestamp when the first generated token is emitted.
+    long first_token_ms;
+    // inference_end_ms: End of inference/generation.
+    long inference_end_ms;
+    // Keep a running total of the time spent in sampling.
+    long aggregate_sampling_time_ms;
+
+    void printReport(
+        const int64_t& num_prompt_tokens,
+        const int64_t& num_generated_tokens);
+    const std::string toJsonString(
+        const int64_t& num_prompt_tokens,
+        const int64_t& num_generated_tokens);
+  };
+  TimeStamps timers_;
 };
 
 } // namespace torch::executor
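
Reviewer note (not part of the patch): a minimal, self-contained C++ sketch of
how the reported rates fall out of the TimeStamps fields added above. The
timestamp and token-count values below are made up for illustration; only the
field names and the milliseconds-per-second scaling mirror the patch.

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Mirrors TimeStamps::SCALING_FACTOR_UNITS_PER_SECOND (timestamps in ms).
      const double kMsPerSecond = 1000.0;
      // Hypothetical timestamps for a run with 8 prompt and 32 generated tokens.
      int64_t inference_start_ms = 0, prompt_eval_end_ms = 150,
              first_token_ms = 180, inference_end_ms = 2180;
      int64_t num_prompt_tokens = 8, num_generated_tokens = 32;

      double total_s = (inference_end_ms - inference_start_ms) / kMsPerSecond;
      double prompt_s = (prompt_eval_end_ms - inference_start_ms) / kMsPerSecond;
      double eval_s = (inference_end_ms - prompt_eval_end_ms) / kMsPerSecond;
      double ttft_s = (first_token_ms - inference_start_ms) / kMsPerSecond;

      // Same derived metrics that printReport() logs.
      printf("overall rate:        %f tokens/s\n", num_generated_tokens / total_s);
      printf("prompt eval rate:    %f tokens/s\n", num_prompt_tokens / prompt_s);
      printf("generation rate:     %f tokens/s\n", num_generated_tokens / eval_s);
      printf("time to first token: %f s\n", ttft_s);
      return 0;
    }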