--- llama.cpp	2023-09-15 10:34:56
+++ llama.cpp.new	2023-09-15 10:34:54
@@ -1765,9 +1765,15 @@
             vocab.type = LLAMA_VOCAB_TYPE_SPM;
 
             // default special tokens
+#ifdef XGEN
+            vocab.special_bos_id = 50256;
+            vocab.special_eos_id = 50256;
+            vocab.special_unk_id = 2954;
+#else
             vocab.special_bos_id = 1;
             vocab.special_eos_id = 2;
             vocab.special_unk_id = 0;
+#endif
             vocab.special_sep_id = -1;
             vocab.special_pad_id = -1;
         } else if (tokenizer_name == "gpt2") {
@@ -3525,10 +3531,14 @@
 }
 
 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
+#ifdef XGEN
+    return vocab.token_to_id.at(std::string(1, ch));
+#else
     char buf[7];
     int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
     GGML_ASSERT(0 <= result && result < 7);
     return vocab.token_to_id.at(buf);
+#endif
 }
 
 static void llama_escape_whitespace(std::string & text) {
@@ -3903,7 +3913,9 @@
                 raw_text = " " + raw_text;
 
                 llm_tokenizer_spm tokenizer(vocab);
+#ifndef XGEN
                 llama_escape_whitespace(raw_text);
+#endif
                 tokenizer.tokenize(raw_text, output);
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
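For readers tracing the llama_byte_to_token hunk by hand, here is a minimal standalone sketch of the two lookup conventions it switches between. The stock SPM path maps a raw byte to SentencePiece's reserved "<0xNN>" byte-fallback token, while the XGEN path looks the byte up directly as a one-character string, consistent with a GPT-2-style BPE vocabulary that has no "<0xNN>" entries (note the patch's BOS/EOS ID 50256, GPT-2's <|endoftext|>). The std::map and the token IDs below are hypothetical stand-ins for llama.cpp's vocab.token_to_id, not values from a real vocabulary.

// Standalone sketch; build with: g++ -std=c++17 byte_token_sketch.cpp
// token_to_id mimics llama.cpp's vocab.token_to_id; all IDs are made up.
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>

// Stock SPM path: encode the byte as SentencePiece's "<0xNN>" fallback token.
static int spm_byte_to_token(const std::map<std::string, int> & token_to_id, uint8_t ch) {
    char buf[7];
    snprintf(buf, sizeof(buf), "<0x%02X>", ch);   // e.g. 'A' -> "<0x41>"
    return token_to_id.at(buf);
}

// XGEN path: the byte itself, as a one-character string, is the vocab key.
static int xgen_byte_to_token(const std::map<std::string, int> & token_to_id, uint8_t ch) {
    return token_to_id.at(std::string(1, ch));    // e.g. 'A' -> "A"
}

int main() {
    // Made-up vocabulary fragments, one per convention.
    const std::map<std::string, int> spm_vocab  = { { "<0x41>", 70 } };
    const std::map<std::string, int> xgen_vocab = { { "A",      32 } };

    printf("SPM  token id for byte 'A': %d\n", spm_byte_to_token(spm_vocab, 'A'));
    printf("XGEN token id for byte 'A': %d\n", xgen_byte_to_token(xgen_vocab, 'A'));
    return 0;
}

The third hunk follows the same logic for whitespace: llama_escape_whitespace rewrites spaces as SentencePiece's U+2581 "▁" marker, which a GPT-2-style vocabulary would not recognize, so it is skipped under #ifndef XGEN. Note also that .at() throws std::out_of_range for any byte missing from the vocabulary, whereas the stock path only asserts on the snprintf result.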