llama : add llm_build_k_shift
ggerganov committed Oct 29, 2023
1 parent dbf836b commit c6ae530
Showing 1 changed file with 52 additions and 64 deletions.
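In short, the commit replaces four near-identical copies of the K-cache RoPE-shift loop (in the LLaMA, Baichuan, Falcon, and Persimmon graph builders) with a single helper, llm_build_k_shift, parameterized by the number of rotated dimensions. A minimal sketch of the resulting call sites, using the builder-local names (do_rope_shift, ctx0, gf, n_embd_head, cb) that appear in the hunks below:

    // sketch of the refactored call sites, not literal source
    if (do_rope_shift) {
        llm_build_k_shift(lctx, ctx0, gf, n_embd_head,     cb);  // LLaMA, Baichuan, Falcon: n_rot = n_embd_head
     // llm_build_k_shift(lctx, ctx0, gf, n_embd_head / 2, cb);  // Persimmon: rotate only the first half of each head
    }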
116 changes: 52 additions & 64 deletions llama.cpp
@@ -3230,6 +3230,51 @@ static struct ggml_tensor * llm_build_ffn(
     return cur;
 }
 
+// Persimmon: n_rot = n_embd_head/2
+// Other:     n_rot = n_embd_head
+static void llm_build_k_shift(
+      const llama_context & lctx,
+       struct ggml_context * ctx,
+        struct ggml_cgraph * graph,
+                    int64_t   n_rot,
+         const llm_build_cb & cb) {
+    const auto & model   = lctx.model;
+    const auto & kv_self = lctx.kv_self;
+    const auto & cparams = lctx.cparams;
+
+    const auto & hparams = model.hparams;
+
+    const int64_t n_head      = hparams.n_head;
+    const int64_t n_layer     = hparams.n_layer;
+    const int64_t n_embd_gqa  = hparams.n_embd_gqa();
+    const int64_t n_embd_head = hparams.n_embd_head();
+
+    const int64_t n_ctx = lctx.cparams.n_ctx;
+
+    const float freq_base  = cparams.rope_freq_base;
+    const float freq_scale = cparams.rope_freq_scale;
+
+    GGML_ASSERT(n_embd_head % n_rot == 0);
+
+    struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
+    cb(K_shift, "K_shift", -1);
+
+    for (int il = 0; il < n_layer; ++il) {
+        struct ggml_tensor * tmp =
+            // we rotate only the first n_rot dimensions
+            ggml_rope_custom_inplace(ctx,
+                    ggml_view_3d(ctx, kv_self.k,
+                        n_rot, n_head, n_ctx,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il)
+                        ),
+                    K_shift, n_rot, 2, 0, freq_base, freq_scale);
+        cb(tmp, "K_shifted", il);
+        ggml_build_forward_expand(graph, tmp);
+    }
+}
+
 static struct ggml_cgraph * llm_build_llama(
          llama_context & lctx,
     const llama_batch & batch,
@@ -3308,21 +3353,7 @@ static struct ggml_cgraph * llm_build_llama(
 
     // shift the entire K-cache if needed
     if (do_rope_shift) {
-        struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
-        cb(K_shift, "K_shift", -1);
-
-        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * tmp =
-                ggml_rope_custom_inplace(ctx0,
-                        ggml_view_3d(ctx0, kv_self.k,
-                            n_embd_head, n_head_kv, n_ctx,
-                            ggml_element_size(kv_self.k)*n_embd_head,
-                            ggml_element_size(kv_self.k)*n_embd_gqa,
-                            ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
-                        K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
-            cb(tmp, "K_shifted", il);
-            ggml_build_forward_expand(gf, tmp);
-        }
+        llm_build_k_shift(lctx, ctx0, gf, n_embd_head, cb);
     }
 
     for (int il = 0; il < n_layer; ++il) {
@@ -3557,21 +3588,7 @@ static struct ggml_cgraph * llm_build_baichaun(
 
     // shift the entire K-cache if needed
     if (do_rope_shift) {
-        struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
-        cb(K_shift, "K_shift", -1);
-
-        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * tmp =
-                ggml_rope_custom_inplace(ctx0,
-                        ggml_view_3d(ctx0, kv_self.k,
-                            n_embd_head, n_head_kv, n_ctx,
-                            ggml_element_size(kv_self.k)*n_embd_head,
-                            ggml_element_size(kv_self.k)*n_embd_gqa,
-                            ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
-                        K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
-            cb(tmp, "K_shifted", il);
-            ggml_build_forward_expand(gf, tmp);
-        }
+        llm_build_k_shift(lctx, ctx0, gf, n_embd_head, cb);
     }
 
     for (int il = 0; il < n_layer; ++il) {
@@ -3830,21 +3847,7 @@ static struct ggml_cgraph * llm_build_falcon(
 
     // shift the entire K-cache if needed
     if (do_rope_shift) {
-        struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
-        cb(K_shift, "K_shift", -1);
-
-        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * tmp =
-                ggml_rope_custom_inplace(ctx0,
-                        ggml_view_3d(ctx0, kv_self.k,
-                            n_embd_head, n_head_kv, n_ctx,
-                            ggml_element_size(kv_self.k)*n_embd_head,
-                            ggml_element_size(kv_self.k)*n_embd_gqa,
-                            ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
-                        K_shift, n_embd_head, 2, 0, freq_base, freq_scale);
-            cb(tmp, "K_shifted", il);
-            ggml_build_forward_expand(gf, tmp);
-        }
+        llm_build_k_shift(lctx, ctx0, gf, n_embd_head, cb);
     }
 
     for (int il = 0; il < n_layer; ++il) {
@@ -4243,14 +4246,15 @@ static struct ggml_cgraph * llm_build_persimmon(
     GGML_ASSERT(!!kv_self.ctx);
 
     const auto & cparams = lctx.cparams;
+
     const int64_t n_embd      = hparams.n_embd;
     const int64_t n_layer     = hparams.n_layer;
     const int64_t n_ctx       = cparams.n_ctx;
     const int64_t n_head_kv   = hparams.n_head_kv;
     const int64_t n_head      = hparams.n_head;
     const int64_t n_embd_head = hparams.n_embd_head();
     const int64_t n_embd_gqa  = hparams.n_embd_gqa();
-    const size_t n_rot = n_embd_head / 2;
+    const int64_t n_rot = n_embd_head / 2;
 
     const float freq_base  = cparams.rope_freq_base;
     const float freq_scale = cparams.rope_freq_scale;
@@ -4297,23 +4301,7 @@ static struct ggml_cgraph * llm_build_persimmon(
     cb(KQ_mask, "KQ_mask", -1);
 
     if (do_rope_shift) {
-        struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
-        cb(K_shift, "K_shift", -1);
-
-        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * tmp =
-                // we rotate only the first n_rot dimensions.
-                ggml_rope_custom_inplace(ctx0,
-                        ggml_view_3d(ctx0, kv_self.k,
-                            n_rot, n_head, n_ctx,
-                            ggml_element_size(kv_self.k)*n_embd_gqa,
-                            ggml_element_size(kv_self.k)*n_embd_head,
-                            ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il)
-                            ),
-                        K_shift, n_rot, 2, 0, freq_base, freq_scale);
-            cb(tmp, "K_shifted", il);
-            ggml_build_forward_expand(gf, tmp);
-        }
+        llm_build_k_shift(lctx, ctx0, gf, n_rot, cb);
     }
 
     for (int il = 0; il < n_layer; ++il) {
@@ -5534,7 +5522,7 @@ static struct ggml_cgraph * llama_build_graph(
 #ifdef GGML_USE_CUBLAS
     const bool do_offload = true;
 #else
-    const bool do_offload = false;
+    const bool do_offload = true; // TODO: set to false after finishing refactoring
 #endif
 
     if (!do_offload) {
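Two smaller changes ride along with the refactor: Persimmon's n_rot becomes int64_t so it matches the helper's parameter type, and on non-CUBLAS builds do_offload is temporarily left at true per the TODO above. A hypothetical check of the helper's new assertion for both supported configurations (the head size here is illustrative, not taken from the diff):

    // illustrative only: any n_rot that divides n_embd_head passes the new assert
    const int64_t n_embd_head = 128;                     // hypothetical head size
    GGML_ASSERT(n_embd_head % (n_embd_head)     == 0);   // full-head rotation (LLaMA, Baichuan, Falcon)
    GGML_ASSERT(n_embd_head % (n_embd_head / 2) == 0);   // half-head rotation (Persimmon)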