Skip to content

Commit

Permalink
Fix ffn_down quantization mix for MoE models
Browse files Browse the repository at this point in the history
In #4872 I did not consider the part where every third
tensor is quantized with more bits. Fir MoE this leads to tensors
of the same layer being quantized with different number of bits,
which is not considered as a possibility in the inference implementation
(it is assumed all experts use the same quantization).
  • Loading branch information
Kawrakow committed Jan 14, 2024
1 parent 76484fb commit 998b635
Showing 1 changed file with 11 additions and 8 deletions.
19 changes: 11 additions & 8 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8462,28 +8462,31 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
new_type = GGML_TYPE_Q8_0;
}
} else if (name.find("ffn_down") != std::string::npos) {
const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
const int i_layer = qs.i_feed_forward_w2 / n_expert;
const int n_layer = qs.i_feed_forward_w2 / n_expert;
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K;
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K
: arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
: arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
: GGML_TYPE_Q3_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
if (arch == LLM_ARCH_FALCON) {
new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K :
use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
} else {
if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
}
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) {
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
new_type = GGML_TYPE_Q5_K;
}
++qs.i_feed_forward_w2;
Expand Down

0 comments on commit 998b635

Please sign in to comment.