Add time to first token for llama runner (#2141)
Summary:

Add time to first generated token & other features



- Since we're measuring the first token time, the token rate is reported both over the full inference window and over the generated tokens alone (excluding prompt evaluation); a minimal sketch of how these timestamps combine into the reported rates follows the list below.

* Model Load Time - just a timer around `ET_CHECK_OK_OR_RETURN_ERROR(load());`
* Total inference time - immediately after model load until the end of the inference loop
  * First token time - from immediately after model load until the first generated (not prompt) token is printed
    * Prompt eval - (comparable to llama.cpp `prompt_eval_time`) prompt array allocation and tokenization; ends right before the inference loop starts
  * Remaining tokens - immediately after the first token is emitted until the end of the inference loop
  * Net eval time - (comparable to llama.cpp `eval_time`) total time spent generating tokens
* Sample time - time spent sampling, per token (also reported in llama.cpp)
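
As a rough illustration (not part of this commit), the sketch below shows how the reported tokens/second figures are derived from these timestamps. The `Timings` struct and `report_rates` helper are hypothetical names used only for the example; the arithmetic mirrors `TimeStamps::printReport` in the diff below.

```cpp
#include <cstdint>
#include <cstdio>

// Hypothetical mirror of the timestamps added in this commit (all in ms).
struct Timings {
  long model_load_start_ms;
  long model_load_end_ms;
  long inference_start_ms;
  long prompt_eval_end_ms;
  long first_token_ms;
  long inference_end_ms;
  long aggregate_sampling_time_ms;
};

// Illustrative only: how the tokens/second rates in printReport are computed.
void report_rates(
    const Timings& t,
    int64_t num_prompt_tokens,
    int64_t num_generated_tokens) {
  const double kMsPerSec = 1000.0;  // SCALING_FACTOR_UNITS_PER_SECOND
  const double prompt_eval_s =
      (t.prompt_eval_end_ms - t.inference_start_ms) / kMsPerSec;
  const double eval_s = (t.inference_end_ms - t.prompt_eval_end_ms) / kMsPerSec;
  const double total_s = (t.inference_end_ms - t.inference_start_ms) / kMsPerSec;
  std::printf("prompt eval: %f tok/s\n", num_prompt_tokens / prompt_eval_s);
  std::printf("generation:  %f tok/s\n", num_generated_tokens / eval_s);
  std::printf(
      "overall:     %f tok/s (generated tokens over total inference time)\n",
      num_generated_tokens / total_s);
}
```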

bypass-github-executorch-ci-checks
bypass-github-pytorch-ci-checks

Reviewed By: digantdesai, Jack-Khuu

Differential Revision: D54223564
Varun Puri authored and facebook-github-bot committed Mar 13, 2024
1 parent d0512b6 commit 8c6cb45
Showing 2 changed files with 133 additions and 17 deletions.
120 changes: 103 additions & 17 deletions examples/models/llama2/runner/runner.cpp
@@ -14,6 +14,7 @@

#include <ctime>
#include <memory>
#include <sstream>

#ifdef USE_ATEN_LIB
#include <torch/torch.h>
@@ -161,8 +162,16 @@ Error Runner::generate(
// Prepare the inputs.
// Use ones-initialized inputs.
ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null");
ET_CHECK_OK_OR_RETURN_ERROR(load());
if (!is_loaded()) {
timers_.model_load_start_ms = util::time_in_ms();
ET_CHECK_OK_OR_RETURN_ERROR(load());
timers_.model_load_end_ms = util::time_in_ms();
}

// First token time only measures the time it takes to encode the prompt and
// return a response token.

timers_.inference_start_ms = util::time_in_ms();
shouldStop_ = false;

// encode the (string) prompt into tokens sequence
@@ -179,6 +188,7 @@
append_eos_ ? n_eos_ : 0,
prompt_tokens,
&num_prompt_tokens);

for (int i = 0; i < num_prompt_tokens; i++) {
ET_LOG(Info, "prompt_tokens[%d]: %d", i, prompt_tokens[i]);
}
@@ -192,8 +202,6 @@
"Sequence length exceeded - please increase the seq_len value passed to generate()");

// start the main loop
long start =
0; // used to time our code, only initialized after first iteration
int next; // will store the next token in the sequence
int64_t pos = num_prompt_tokens - 1; // position in the sequence
int token = prompt_tokens[pos]; // prefill starts from 0 to num_prompt_tokens
@@ -254,6 +262,7 @@
tokenizer_->decode(prompt_tokens[i - 1], prompt_tokens[i])));
}
}

// create a 1xN int tensor with next as value
while (pos < seq_len) {
// ET_LOG(Info, "Generating step %d...", pos);
@@ -289,10 +298,14 @@
outputs.size() > 0,
"Expecting output to have at least one evalue. Got %zu",
outputs.size());

if (pos == num_prompt_tokens) {
timers_.first_token_ms = util::time_in_ms();
} else if (pos == num_prompt_tokens - 1) {
timers_.prompt_eval_end_ms = util::time_in_ms();
}
int32_t next_tok;
exec_aten::Tensor logits_tensor = outputs.at(logits_index).toTensor();

long sample_start_time_ms = util::time_in_ms();
switch (logits_tensor.scalar_type()) {
case ScalarType::Float: {
next_tok = logitsToToken<float>(logits_tensor, pos, 0);
@@ -308,6 +321,8 @@
"Unsupported dtype output %hhd",
static_cast<int8_t>(logits_tensor.scalar_type()));
}
timers_.aggregate_sampling_time_ms +=
util::time_in_ms() - sample_start_time_ms;

// advance the state machine
if (pos < num_prompt_tokens - 1) {
@@ -339,16 +354,13 @@

// data-dependent terminating condition: we have n_eos_ number of EOS
if (pos >= num_prompt_tokens && next == eos_id_) {
ET_LOG(Info, "Reached to the end of generation");
printf("\n");
ET_LOG(Info, "\nReached to the end of generation");
break;
}

token = next;

// init the timer here because the first iteration can be slower
if (start == 0) {
start = util::time_in_ms();
}
if (use_kv_cache_) {
// outputs: [k_cache, v_cache, logits, k_cache, v_cache]
memcpy(
@@ -361,23 +373,97 @@
v_data.size());
}
}
timers_.inference_end_ms = util::time_in_ms();
printf("\n");

if (pos == seq_len) {
ET_LOG(Info, "Sequence length (%i tokens) reached!", seq_len);
}
// report achieved tok/s (pos-1 because the timer starts after first
// iteration)
if (pos >= 1) {
long end = util::time_in_ms();
ET_LOG(
Info, "Achieved tok/s: %f\n", (pos - 1) / (double)(end - start) * 1000);
}

timers_.printReport(num_prompt_tokens, pos - num_prompt_tokens);

delete[] prompt_tokens;
return Error::Ok;
}

void Runner::TimeStamps::printReport(
const int64_t& num_prompt_tokens,
const int64_t& num_generated_tokens) {
printf(
"PyTorchObserver %s\n",
toJsonString(num_prompt_tokens, num_generated_tokens).c_str());

ET_LOG(
Info,
"\tPrompt Tokens: %" PRIu64 " Generated Tokens: %" PRIu64,
num_prompt_tokens,
num_generated_tokens);

ET_LOG(
Info,
"\tModel Load Time:\t\t%f (seconds)",
((double)(model_load_end_ms - model_load_start_ms) /
SCALING_FACTOR_UNITS_PER_SECOND));
double inference_time_ms = (double)(inference_end_ms - inference_start_ms);
ET_LOG(
Info,
"\tTotal inference time:\t\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
inference_time_ms / SCALING_FACTOR_UNITS_PER_SECOND,

(num_generated_tokens) / (double)(inference_end_ms - inference_start_ms) *
SCALING_FACTOR_UNITS_PER_SECOND);
double prompt_eval_time = (double)(prompt_eval_end_ms - inference_start_ms);
ET_LOG(
Info,
"\t\tPrompt evaluation:\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
prompt_eval_time / SCALING_FACTOR_UNITS_PER_SECOND,
(num_prompt_tokens) / prompt_eval_time * SCALING_FACTOR_UNITS_PER_SECOND);

double eval_time = (double)(inference_end_ms - prompt_eval_end_ms);
ET_LOG(
Info,
"\t\tGenerated %" PRIu64
" tokens:\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
num_generated_tokens,
eval_time / SCALING_FACTOR_UNITS_PER_SECOND,
num_generated_tokens / eval_time * SCALING_FACTOR_UNITS_PER_SECOND);

// Time to first token is measured from the start of inference, excluding
// model load time.
ET_LOG(
Info,
"\tTime to first generated token:\t%f (seconds)",
((double)(first_token_ms - inference_start_ms) /
SCALING_FACTOR_UNITS_PER_SECOND));

ET_LOG(
Info,
"\tSampling time over %" PRIu64
" tokens:\t%f (seconds)\t\t Rate: \t%f (tokens/second)",
num_prompt_tokens + num_generated_tokens,
(double)aggregate_sampling_time_ms / SCALING_FACTOR_UNITS_PER_SECOND,
(num_prompt_tokens + num_generated_tokens) /
(double)aggregate_sampling_time_ms * SCALING_FACTOR_UNITS_PER_SECOND);
}

const std::string Runner::TimeStamps::toJsonString(
const int64_t& num_prompt_tokens,
const int64_t& num_generated_tokens) {
std::stringstream ss;
ss << "{\"prompt_tokens\":" << num_prompt_tokens << ","
<< "\"generated_tokens\":" << num_generated_tokens << ","
<< "\"model_load_start_ms\":" << model_load_start_ms << ","
<< "\"model_load_end_ms\":" << model_load_end_ms << ","
<< "\"inference_start_ms\":" << inference_start_ms << ","
<< "\"inference_end_ms\":" << inference_end_ms << ","
<< "\"prompt_eval_end_ms\":" << prompt_eval_end_ms << ","
<< "\"first_token_ms\":" << first_token_ms << ","
<< "\"aggregate_sampling_time_ms\":" << aggregate_sampling_time_ms << ","
<< "\"SCALING_FACTOR_UNITS_PER_SECOND\":"
<< SCALING_FACTOR_UNITS_PER_SECOND << "}";
return ss.str();
}

void Runner::stop() {
shouldStop_ = true;
}
30 changes: 30 additions & 0 deletions examples/models/llama2/runner/runner.h
@@ -63,6 +63,36 @@ class Runner {
std::unique_ptr<Tokenizer> tokenizer_;
std::unique_ptr<Sampler> sampler_;
bool shouldStop_{false};

struct TimeStamps {
// Scaling factor for timestamps - in this case, we use ms.
const long SCALING_FACTOR_UNITS_PER_SECOND = 1000;
// Time stamps for the different stages of the execution
// model_load_start_ms: Start of model loading.
long model_load_start_ms;
// model_load_end_ms: End of model loading.
long model_load_end_ms;
// inference_start_ms: Immediately after the model is loaded (or we check
// for model load), measure the inference time.
long inference_start_ms;
// prompt_eval_end_ms: Prompt array allocation and tokenization. Ends right
// before the inference loop starts
long prompt_eval_end_ms;
// first_token: Timestamp when the first generated token is emitted
long first_token_ms;
// inference_end_ms: End of inference/generation.
long inference_end_ms;
// Keep a running total of the time spent in sampling.
long aggregate_sampling_time_ms;

void printReport(
const int64_t& num_prompt_tokens,
const int64_t& num_generated_tokens);
const std::string toJsonString(
const int64_t& num_prompt_tokens,
const int64_t& num_generated_tokens);
};
TimeStamps timers_;
};

} // namespace torch::executor
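
For completeness, here is a hypothetical downstream consumer of the `PyTorchObserver` line emitted by `printReport`. This is a minimal sketch, not part of the commit: it assumes the fixed key layout produced by `toJsonString` above and uses plain string scanning rather than a real JSON parser.

```cpp
#include <iostream>
#include <string>

// Pull one numeric field out of the single-line JSON toJsonString produces.
long extract_long(const std::string& json, const std::string& key) {
  const std::string needle = "\"" + key + "\":";
  const size_t pos = json.find(needle);
  if (pos == std::string::npos) {
    return -1;  // key not present
  }
  return std::stol(json.substr(pos + needle.size()));
}

int main() {
  const std::string prefix = "PyTorchObserver ";
  std::string line;
  while (std::getline(std::cin, line)) {
    if (line.rfind(prefix, 0) != 0) {
      continue;  // not the observer line; skip ordinary runner output
    }
    const std::string json = line.substr(prefix.size());
    const long prompt_eval_end_ms = extract_long(json, "prompt_eval_end_ms");
    const long inference_end_ms = extract_long(json, "inference_end_ms");
    const long generated_tokens = extract_long(json, "generated_tokens");
    std::cout << "generation rate: "
              << generated_tokens * 1000.0 /
                     static_cast<double>(inference_end_ms - prompt_eval_end_ms)
              << " tok/s\n";
  }
  return 0;
}
```

Piping the runner's stdout through this program would print the generation-phase rate, the same figure `printReport` logs for the generated tokens.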
