diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index f76f2457920ff..cdccbb7dbcc3c 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -2394,6 +2394,12 @@ def set_gguf_parameters(self): self.gguf_writer.add_key_length(hparams["head_dim"]) self.gguf_writer.add_value_length(hparams["head_dim"]) self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_attn_logit_softcapping( + self.hparams["attn_logit_softcapping"] + ) + self.gguf_writer.add_final_logit_softcapping( + self.hparams["final_logit_softcapping"] + ) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unusem diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index f3e4295291185..ad1aa978d9ad5 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -50,6 +50,8 @@ class LLM: POOLING_TYPE = "{arch}.pooling_type" LOGIT_SCALE = "{arch}.logit_scale" DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id" + ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping" + FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping" class Attention: HEAD_COUNT = "{arch}.attention.head_count" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 9869f6fe3445a..1aeb0d9b08685 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -516,6 +516,12 @@ def add_clamp_kqv(self, value: float) -> None: def add_logit_scale(self, value: float) -> None: self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value) + def add_attn_logit_softcapping(self, value: float) -> None: + self.add_float32(Keys.LLM.ATTN_LOGIT_SOFTCAPPING.format(arch=self.arch), value) + + def add_final_logit_softcapping(self, value: float) -> None: + self.add_float32(Keys.LLM.FINAL_LOGIT_SOFTCAPPING.format(arch=self.arch), value) + def add_expert_count(self, count: int) -> None: self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count) diff --git a/llama.cpp b/llama.cpp index 1e8d41b52cf3e..96200312d5bad 100644 --- a/llama.cpp +++ b/llama.cpp @@ -326,6 +326,8 @@ enum llm_kv { LLM_KV_POOLING_TYPE, LLM_KV_LOGIT_SCALE, LLM_KV_DECODER_START_TOKEN_ID, + LLM_KV_ATTN_LOGIT_SOFTCAPPING, + LLM_KV_FINAL_LOGIT_SOFTCAPPING, LLM_KV_ATTENTION_HEAD_COUNT, LLM_KV_ATTENTION_HEAD_COUNT_KV, @@ -416,6 +418,8 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_POOLING_TYPE , "%s.pooling_type" }, { LLM_KV_LOGIT_SCALE, "%s.logit_scale" }, { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" }, + { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" }, + { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" }, { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, @@ -2127,6 +2131,9 @@ struct llama_hparams { float f_norm_eps; float f_norm_rms_eps; + float f_attn_logit_softcapping = 50.0f; + float f_final_logit_softcapping = 30.0f; + float rope_attn_factor = 1.0f; float rope_freq_base_train; float rope_freq_scale_train; @@ -2143,8 +2150,9 @@ struct llama_hparams { float f_max_alibi_bias = 0.0f; float f_logit_scale = 0.0f; - bool causal_attn = true; - bool use_alibi = false; + bool causal_attn = true; + bool use_alibi = false; + bool attn_soft_cap = false; enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE; enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE; @@ -4822,6 +4830,9 @@ static void llm_load_hparams( case LLM_ARCH_GEMMA2: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false); + ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false); + hparams.attn_soft_cap = true; switch (hparams.n_layer) { case 42: model.type = e_model::MODEL_9B; break; @@ -7737,6 +7748,12 @@ static struct ggml_tensor * llm_build_kqv( kq = ggml_scale(ctx, kq, 30); } + if (hparams.attn_soft_cap) { + kq = ggml_scale(ctx, kq, 1.0f / hparams.f_attn_logit_softcapping); + kq = ggml_tanh(ctx, kq); + kq = ggml_scale(ctx, kq, hparams.f_attn_logit_softcapping); + } + kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias); cb(kq, "kq_soft_max_ext", il); @@ -11197,7 +11214,7 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur", il); - Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k))); + Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd / n_head))); cb(Qcur, "Qcur_scaled", il); Kcur = ggml_rope_ext( @@ -11264,6 +11281,12 @@ struct llm_build_context { // lm_head cur = ggml_mul_mat(ctx0, model.output, cur); + + // final logit soft-capping + cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping); + cur = ggml_tanh(ctx0, cur); + cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping); + cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -20022,6 +20045,12 @@ struct llama_context * llama_new_context_with_model( params.flash_attn = false; } + if (params.flash_attn && model->hparams.attn_soft_cap) { + LLAMA_LOG_WARN("%s: flash_attn is not compatible with attn_soft_cap - forcing off\n", __func__); + params.flash_attn = false; + } + + if (params.flash_attn && model->hparams.n_embd_head_k != model->hparams.n_embd_head_v) { LLAMA_LOG_WARN("%s: flash_attn requires n_embd_head_k == n_embd_head_v - forcing off\n", __func__); params.flash_attn = false;