From c2d0041704cd883b5171dc8a123988333c2135a0 Mon Sep 17 00:00:00 2001
From: Branden Butler
Date: Fri, 10 Nov 2023 09:59:22 -0600
Subject: [PATCH 1/3] Support special tokens and not adding BOS to prompt in
 speculative

---
 examples/speculative/speculative.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 3a8e278110c20..0aae69d16f7b3 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -94,9 +94,13 @@ int main(int argc, char ** argv) {
         }
     }
 
-    // tokenize the prompt
+
+    // Tokenize the prompt
+    const bool add_bos = llama_vocab_type(llama_get_model(ctx_tgt)) == LLAMA_VOCAB_TYPE_SPM;
+    LOG("add_bos: %d\n", add_bos);
+
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx_tgt, params.prompt, true);
+    inp = ::llama_tokenize(ctx_tgt, params.prompt, add_bos, true);
 
     const int max_context_size     = llama_n_ctx(ctx_tgt);
     const int max_tokens_list_size = max_context_size - 4;

From e778ce4a4cb28f2b68629610ece50832d9704139 Mon Sep 17 00:00:00 2001
From: Branden Butler
Date: Sat, 18 Nov 2023 12:23:33 -0600
Subject: [PATCH 2/3] Adapt to new should_add_bos function

---
 examples/speculative/speculative.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 0aae69d16f7b3..3bd572a7de81a 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -96,7 +96,7 @@ int main(int argc, char ** argv) {
 
 
     // Tokenize the prompt
-    const bool add_bos = llama_vocab_type(llama_get_model(ctx_tgt)) == LLAMA_VOCAB_TYPE_SPM;
+    const bool add_bos = llama_should_add_bos_token(model_tgt);
     LOG("add_bos: %d\n", add_bos);
 
     std::vector<llama_token> inp;

From 9cfc5e216007f4b86c504c5596c9880e37dd869d Mon Sep 17 00:00:00 2001
From: Branden Butler
Date: Sat, 18 Nov 2023 12:49:35 -0600
Subject: [PATCH 3/3] Ensure tgt and dft have same add_bos setting

---
 examples/speculative/speculative.cpp | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 3bd572a7de81a..ace755c51d8a3 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -96,11 +96,20 @@ int main(int argc, char ** argv) {
 
 
     // Tokenize the prompt
-    const bool add_bos = llama_should_add_bos_token(model_tgt);
-    LOG("add_bos: %d\n", add_bos);
+    const bool add_bos_tgt = llama_should_add_bos_token(model_tgt);
+    LOG("add_bos tgt: %d\n", add_bos_tgt);
+
+    const bool add_bos_dft = llama_should_add_bos_token(model_dft);
+    LOG("add_bos dft: %d\n", add_bos_dft);
+
+    if (add_bos_tgt != add_bos_dft) {
+        fprintf(stderr, "%s: error: draft model add_bos must match target model to use speculation but ", __func__);
+        fprintf(stderr, "add_bos_dft = %d while add_bos_tgt = %d\n", add_bos_dft, add_bos_tgt);
+        return 1;
+    }
 
     std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx_tgt, params.prompt, add_bos, true);
+    inp = ::llama_tokenize(ctx_tgt, params.prompt, add_bos_tgt, true);
 
     const int max_context_size     = llama_n_ctx(ctx_tgt);
     const int max_tokens_list_size = max_context_size - 4;
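
Note: the following is not part of the patches above. It is a minimal standalone sketch of
the guard PATCH 3/3 introduces, assuming the llama.cpp API as these patches use it
(llama_should_add_bos_token from llama.h, ::llama_tokenize from common.h). The helper name
tokenize_prompt and the exact error wording are illustrative; model_tgt, model_dft, and
ctx_tgt are assumed to be loaded as in examples/speculative/speculative.cpp.

    // Sketch: tokenize the prompt only if target and draft agree on BOS.
    #include <cstdio>
    #include <string>
    #include <vector>

    #include "common.h"
    #include "llama.h"

    static bool tokenize_prompt(const llama_model * model_tgt,
                                const llama_model * model_dft,
                                llama_context * ctx_tgt,
                                const std::string & prompt,
                                std::vector<llama_token> & inp) {
        // A BOS mismatch would make the two models tokenize the same prompt
        // differently, misaligning draft and target from the first position.
        const bool add_bos_tgt = llama_should_add_bos_token(model_tgt);
        const bool add_bos_dft = llama_should_add_bos_token(model_dft);
        if (add_bos_tgt != add_bos_dft) {
            fprintf(stderr, "add_bos mismatch: tgt = %d, dft = %d\n", add_bos_tgt, add_bos_dft);
            return false;
        }

        // The final 'true' enables parsing of special tokens in the prompt,
        // matching the tokenize call in the patched speculative.cpp.
        inp = ::llama_tokenize(ctx_tgt, prompt, add_bos_tgt, true);
        return true;
    }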