ggml : new Q4 and Q5 quantization formats + backward ops #154

Merged: 5 commits, May 14, 2023

Changes from all commits
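For reference, the new Q5_0 and Q5_1 formats named in the PR title are the types that the q5_0/q5_1 entries in examples/common-ggml.cpp below map to. A minimal sketch of their block layouts, assuming they match the block_q5_0/block_q5_1 structs in ggml.c of this period; the field names and sizes here are written from memory and are illustrative only, not copied from this PR:

#include <stdint.h>

typedef uint16_t ggml_fp16_t;     // half-precision value stored as raw 16 bits

#define QK5_0 32
// Q5_0: fp16 scale + 32 x 5-bit quants = 22 bytes per 32 weights (~5.5 bits/weight)
typedef struct {
    ggml_fp16_t d;                // scale (delta)
    uint8_t     qh[4];            // the 5th (high) bit of each quant, packed into 32 bits
    uint8_t     qs[QK5_0/2];      // the low 4 bits, two quants per byte
} block_q5_0;

#define QK5_1 32
// Q5_1: like Q5_0 but with an additional fp16 minimum = 24 bytes per 32 weights (~6 bits/weight)
typedef struct {
    ggml_fp16_t d;                // scale (delta)
    ggml_fp16_t m;                // minimum of the block
    uint8_t     qh[4];            // high bits of the quants
    uint8_t     qs[QK5_1/2];      // low 4 bits, two quants per byte
} block_q5_1;

Compared to Q4_0/Q4_1, each quant gains one extra bit stored in qh; the experimental Q4_2 type is removed in the first file below.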
6 changes: 0 additions & 6 deletions examples/common-ggml.cpp
@@ -6,7 +6,6 @@
static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
{"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
{"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
{"q4_2", GGML_FTYPE_MOSTLY_Q4_2},
{"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
{"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
{"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
@@ -46,7 +45,6 @@ bool ggml_common_quantize_0(
switch (ftype) {
case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
case GGML_FTYPE_MOSTLY_Q4_2: qtype = GGML_TYPE_Q4_2; break;
case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
@@ -171,10 +169,6 @@ bool ggml_common_quantize_0(
{
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q4_2:
{
cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q5_0:
{
cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
1 change: 0 additions & 1 deletion examples/common.cpp
@@ -212,7 +212,6 @@ void gpt_vocab::add_special_token(const std::string & token) {
special_tokens.push_back(token);
}


std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
std::vector<std::string> words;

17 changes: 9 additions & 8 deletions examples/dolly-v2/main.cpp
@@ -7,6 +7,7 @@
#include <cmath>
#include <cstdio>
#include <cstring>
#include <cinttypes>
#include <fstream>
#include <map>
#include <string>
@@ -199,7 +200,7 @@ bool dollyv2_model_load(const std::string & fname, dollyv2_model & model, gpt_vo
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v

ctx_size += (6 + 16*n_layer)*256; // object overhead
ctx_size += (6 + 16*n_layer)*512; // object overhead

printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}
@@ -307,7 +308,7 @@ bool dollyv2_model_load(const std::string & fname, dollyv2_model & model, gpt_vo

const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);

printf("%s: memory_size = %8.2f MB, n_mem = %ld\n", __func__, memory_size/1024.0/1024.0, n_mem);
printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem);
}

// load weights
@@ -478,8 +479,8 @@ bool dollyv2_eval(
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head));

// using mode = 2 for GPT-NeoX mode
Qcur = ggml_rope(ctx0, Qcur, n_past, n_rot, 2);
Kcur = ggml_rope(ctx0, Kcur, n_past, n_rot, 2);
Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, n_rot, 2);
Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, n_rot, 2);

// store key and value to memory
{
@@ -513,16 +514,16 @@ bool dollyv2_eval(

// KQ_scaled = KQ / sqrt(n_embd/n_head)
struct ggml_tensor * KQ_scaled =
ggml_scale(ctx0,
ggml_scale_inplace(ctx0,
KQ,
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
);

// KQ_masked = mask_past(KQ_scaled)
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);

// KQ = soft_max(KQ_masked)
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);

// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
struct ggml_tensor * V =
@@ -621,7 +622,7 @@ bool dollyv2_eval(
}

// logits -> probs
//inpL = ggml_soft_max(ctx0, inpL);
//inpL = ggml_soft_max_inplace(ctx0, inpL);

// run the computation
ggml_build_forward_expand(&gf, inpL);
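The dolly-v2 hunks above, like the analogous ones in the gpt-2, gpt-j, gpt-neox and starcoder examples below, switch rope, scale, diag_mask_inf and soft_max to the new _inplace variants: the inplace form returns a view that reuses the source tensor's data instead of allocating a fresh buffer in the context. A minimal, hedged sketch of the difference, assuming the ggml C API of this period (the tensor shape and memory size are arbitrary):

#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,   // arena for tensors and the graph
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx0 = ggml_init(params);

    struct ggml_tensor * x = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 8, 8);

    // non-inplace: the result is a new tensor with its own data buffer in ctx0
    struct ggml_tensor * y      = ggml_soft_max(ctx0, x);

    // inplace: the result is a view over x's data, so building the evaluation
    // graph consumes less memory from ctx0
    struct ggml_tensor * y_view = ggml_soft_max_inplace(ctx0, x);

    (void) y; (void) y_view;
    ggml_free(ctx0);
    return 0;
}

The same pattern applies to ggml_rope_inplace, ggml_scale_inplace and ggml_diag_mask_inf_inplace in the diffs.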
11 changes: 6 additions & 5 deletions examples/gpt-2/main.cpp
@@ -186,8 +186,9 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v

ctx_size += (6 + 12*n_layer)*256; // object overhead
ctx_size += (6 + 12*n_layer)*512; // object overhead

printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor));
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}

@@ -526,18 +527,18 @@ bool gpt2_eval(
// KQ_scaled = KQ / sqrt(n_embd/n_head)
// [n_past + N, N, 12]
struct ggml_tensor * KQ_scaled =
ggml_scale(ctx0,
ggml_scale_inplace(ctx0,
KQ,
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
);

// KQ_masked = mask_past(KQ_scaled)
// [n_past + N, N, 12]
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);

// KQ = soft_max(KQ_masked)
// [n_past + N, N, 12]
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);

// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
// [n_past + N, 64, 12]
@@ -664,7 +665,7 @@ bool gpt2_eval(
inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);

// logits -> probs
//inpL = ggml_soft_max(ctx0, inpL);
//inpL = ggml_soft_max_inplace(ctx0, inpL);

// run the computation
ggml_build_forward_expand(&gf, inpL);
16 changes: 8 additions & 8 deletions examples/gpt-j/main.cpp
@@ -185,7 +185,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v

ctx_size += (5 + 10*n_layer)*256; // object overhead
ctx_size += (5 + 10*n_layer)*512; // object overhead

printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}
@@ -446,8 +446,8 @@ bool gptj_eval(

// self-attention
{
struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);

// store key and value to memory
{
@@ -481,16 +481,16 @@ bool gptj_eval(

// KQ_scaled = KQ / sqrt(n_embd/n_head)
struct ggml_tensor * KQ_scaled =
ggml_scale(ctx0,
ggml_scale_inplace(ctx0,
KQ,
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
);

// KQ_masked = mask_past(KQ_scaled)
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);

// KQ = soft_max(KQ_masked)
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);

// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
struct ggml_tensor * V =
@@ -574,15 +574,15 @@ bool gptj_eval(
}

// logits -> probs
//inpL = ggml_soft_max(ctx0, inpL);
//inpL = ggml_soft_max_inplace(ctx0, inpL);

// run the computation
ggml_build_forward_expand(&gf, inpL);
ggml_graph_compute (ctx0, &gf);

//if (n_past%100 == 0) {
// ggml_graph_print (&gf);
// ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
// ggml_graph_dump_dot(&gf, NULL, "gpt-j.dot");
//}

//embd_w.resize(n_vocab*N);
17 changes: 9 additions & 8 deletions examples/gpt-neox/main.cpp
@@ -7,6 +7,7 @@
#include <cmath>
#include <cstdio>
#include <cstring>
#include <cinttypes>
#include <fstream>
#include <map>
#include <string>
@@ -188,7 +189,7 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v

ctx_size += (6 + 16*n_layer)*256; // object overhead
ctx_size += (6 + 16*n_layer)*512; // object overhead

printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}
@@ -293,7 +294,7 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_

const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);

printf("%s: memory_size = %8.2f MB, n_mem = %ld\n", __func__, memory_size/1024.0/1024.0, n_mem);
printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem);
}

// load weights
@@ -501,8 +502,8 @@ bool gpt_neox_eval(
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head));

// using mode = 2 for GPT-NeoX mode
Qcur = ggml_rope(ctx0, Qcur, n_past, n_rot, 2);
Kcur = ggml_rope(ctx0, Kcur, n_past, n_rot, 2);
Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, n_rot, 2);
Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, n_rot, 2);

// store key and value to memory
{
@@ -536,16 +537,16 @@ bool gpt_neox_eval(

// KQ_scaled = KQ / sqrt(n_embd/n_head)
struct ggml_tensor * KQ_scaled =
ggml_scale(ctx0,
ggml_scale_inplace(ctx0,
KQ,
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
);

// KQ_masked = mask_past(KQ_scaled)
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);

// KQ = soft_max(KQ_masked)
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);

// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
struct ggml_tensor * V =
@@ -620,7 +621,7 @@ bool gpt_neox_eval(
}

// logits -> probs
//inpL = ggml_soft_max(ctx0, inpL);
//inpL = ggml_soft_max_inplace(ctx0, inpL);

// run the computation
ggml_build_forward_expand(&gf, inpL);
1 change: 0 additions & 1 deletion examples/mnist/main.cpp
@@ -205,7 +205,6 @@ int main(int argc, char ** argv) {
exit(0);
}

mnist_hparams params;
int64_t t_load_us = 0;

mnist_model model;
10 changes: 5 additions & 5 deletions examples/starcoder/main.cpp
@@ -193,7 +193,7 @@ bool starcoder_model_load(const std::string & fname, starcoder_model & model, gp
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v

ctx_size += (6 + 12*n_layer)*256; // object overhead
ctx_size += (6 + 12*n_layer)*512; // object overhead

printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}
@@ -537,18 +537,18 @@ bool starcoder_eval(
// KQ_scaled = KQ / sqrt(n_embd/n_head)
// [n_past + N, N, 12]
struct ggml_tensor * KQ_scaled =
ggml_scale(ctx0,
ggml_scale_inplace(ctx0,
KQ,
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
);

// KQ_masked = mask_past(KQ_scaled)
// [n_past + N, N, 12]
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);

// KQ = soft_max(KQ_masked)
// [n_past + N, N, 12]
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);

// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
// [n_past + N, 64, 12]
@@ -675,7 +675,7 @@ bool starcoder_eval(
inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);

// logits -> probs
//inpL = ggml_soft_max(ctx0, inpL);
//inpL = ggml_soft_max_inplace(ctx0, inpL);

// run the computation
ggml_build_forward_expand(&gf, inpL);
2 changes: 1 addition & 1 deletion examples/whisper/quantize.cpp
@@ -109,7 +109,7 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
fout.write((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
fout.write((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
fout.write((char *) &hparams.n_mels, sizeof(hparams.n_mels));
fout.write((char *) &ftype, sizeof(hparams.ftype));
fout.write((char *) &ftype_dst, sizeof(hparams.ftype));
}

// load mel filters