From 00c63907931bb08a0ed2b7e38cf44dd290143cb9 Mon Sep 17 00:00:00 2001 From: Justine Tunney Date: Sat, 25 May 2024 05:04:03 -0400 Subject: [PATCH] main : don't print special tokens with --grammar (#6923) * main : don't print special tokens with --grammar The CLI interface was recently changed to print special control tokens like the stop message one. This token shouldn't be printed if the grammar flag was passed, unless the grammar specifies it, because that breaks shell-scriptability. * main: use seperate stream for control characters * main: use dprintf and add --ctrl-token-no-out and --ctrl-token-fd-out * main: dprintf isn't part of the IEEE POSIX standard. Just use write(). * main: remove --ctrl-token-fd-out in favor for fcntl() based detection * common.cpp: accidentally removed --interactive-first * main: only merge stdout and control token if not in conversation or grammar mode * main: rejig control token descriptor handling * main: must check pipe status on very top of program * main: renamed --no-special from --ctrl-token-no-out and other refactoring * main: refactor ctrl_token_no_out --> no_special * llama: rename llama_token_is_control_token() to llama_token_is_control() * main: remove special token file descriptor feature (#5) --------- Co-authored-by: Brian --- common/common.cpp | 5 +++++ common/common.h | 1 + examples/main/main.cpp | 20 +++++++++++++++++--- llama.cpp | 4 ++++ llama.h | 3 +++ 5 files changed, 30 insertions(+), 3 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index c6459038560f1..781f2166bb66a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -904,6 +904,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.interactive_specials = true; return true; } + if (arg == "--no-special") { + params.no_special = true; + return true; + } if (arg == "--embedding") { params.embedding = true; return true; @@ -1364,6 +1368,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param printf(" -i, --interactive run in interactive mode\n"); printf(" --interactive-specials allow special tokens in user text, in interactive mode\n"); printf(" --interactive-first run in interactive mode and wait for input right away\n"); + printf(" --no-special control tokens output disabled\n"); printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n"); printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n"); printf(" -cml, --chatml run in chatml mode (use with ChatML-compatible models)\n"); diff --git a/common/common.h b/common/common.h index f68f3c2979b94..5388f6b68973c 100644 --- a/common/common.h +++ b/common/common.h @@ -146,6 +146,7 @@ struct gpt_params { bool use_color = false; // use color to distinguish generations and inputs bool interactive = false; // interactive mode bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode + bool no_special = false; // disable control token output bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix) bool chatml = false; // chatml mode (used for models trained on chatml syntax) bool prompt_cache_all = false; // save user input and generations to prompt cache diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 09fa85fce0ee3..ac35772f1e133 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -740,18 +740,32 @@ int main(int argc, char ** argv) { // display text if (input_echo && display) { for (auto id : embd) { - const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation); - printf("%s", token_str.c_str()); + const std::string token_str = llama_token_to_piece(ctx, id); + + // Console/Stream Output + if (!llama_token_is_control(llama_get_model(ctx), id)) { + // Stream Output Token To Standard Output + fprintf(stdout, "%s", token_str.c_str()); + } else if (!params.no_special && !params.conversation) { + // Stream Control Token To Standard Output Stream + fprintf(stdout, "%s", token_str.c_str()); + } + // Record Displayed Tokens To Log + // Note: Generated tokens are created one by one hence this check if (embd.size() > 1) { + // Incoming Requested Tokens input_tokens.push_back(id); } else { + // Outgoing Generated Tokens output_tokens.push_back(id); output_ss << token_str; } + + fflush(stdout); } - fflush(stdout); } + // reset color to default if there is no pending user input if (input_echo && (int) embd_inp.size() == n_consumed) { console::set_display(console::reset); diff --git a/llama.cpp b/llama.cpp index 85cb3140d945b..989d27b9dfb3a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -17861,6 +17861,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) { ); } +bool llama_token_is_control(const struct llama_model * model, llama_token token) { + return llama_is_control_token(model->vocab, token); +} + llama_token llama_token_bos(const struct llama_model * model) { return model->vocab.special_bos_id; } diff --git a/llama.h b/llama.h index 16cece5db0e78..16676269dd38a 100644 --- a/llama.h +++ b/llama.h @@ -823,6 +823,9 @@ extern "C" { // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.) LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token); + // Identify if Token Id is a control token or a render-able token + LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token); + // Special tokens LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence