From d84aa7f76c5c842b9f2e89b3cf5b659bed6cba8f Mon Sep 17 00:00:00 2001 From: Dan Forbes Date: Sat, 15 Apr 2023 12:51:01 -0700 Subject: [PATCH 01/35] Create a Model trait --- Cargo.lock | 18 +- Cargo.toml | 3 + llama-cli/Cargo.toml | 3 +- llama-cli/src/cli_args.rs | 6 +- llama-cli/src/main.rs | 4 +- llama-cli/src/snapshot.rs | 4 +- llama-rs/Cargo.toml | 16 +- llama-rs/src/inference_session.rs | 32 +- llama-rs/src/lib.rs | 13 +- llama-rs/src/model.rs | 468 +---------------- llama-rs/src/util.rs | 96 +--- llama-rs/src/vocabulary.rs | 8 +- llama/Cargo.toml | 23 + {llama-rs => llama}/src/convert.rs | 4 +- .../src/loader.rs => llama/src/ggml_loader.rs | 400 +++++++++------ llama/src/lib.rs | 475 ++++++++++++++++++ 16 files changed, 826 insertions(+), 747 deletions(-) create mode 100644 llama/Cargo.toml rename {llama-rs => llama}/src/convert.rs (97%) rename llama-rs/src/loader.rs => llama/src/ggml_loader.rs (82%) create mode 100644 llama/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index f46addca..475a847d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -456,6 +456,20 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" +[[package]] +name = "llama" +version = "0.1.0" +dependencies = [ + "bytemuck", + "ggml", + "llama-rs", + "protobuf", + "rust_tokenizers", + "serde", + "serde_json", + "thiserror", +] + [[package]] name = "llama-cli" version = "0.1.0" @@ -463,6 +477,7 @@ dependencies = [ "bincode", "clap", "env_logger", + "llama", "llama-rs", "log", "num_cpus", @@ -480,12 +495,9 @@ dependencies = [ "bytemuck", "ggml", "partial_sort", - "protobuf", "rand", - "rust_tokenizers", "serde", "serde_bytes", - "serde_json", "thiserror", ] diff --git a/Cargo.toml b/Cargo.toml index 8ea220d8..2621b08f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,6 +2,7 @@ members = [ "ggml-sys", "ggml", + "llama", "llama-rs", "llama-cli", "generate-ggml-bindings" @@ -12,4 +13,6 @@ resolver = "2" version = "0.1.0" [workspace.dependencies] +bytemuck = "1.13.1" rand = "0.8.5" +serde = { version = "1.0", features = ["derive"] } diff --git a/llama-cli/Cargo.toml b/llama-cli/Cargo.toml index 2eff43b7..c5d93022 100644 --- a/llama-cli/Cargo.toml +++ b/llama-cli/Cargo.toml @@ -6,7 +6,8 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -llama-rs = { path = "../llama-rs", features = ["convert"] } +llama = { path = "../llama", features = ["convert"] } +llama-rs = { path = "../llama-rs" } rand = { workspace = true } diff --git a/llama-cli/src/cli_args.rs b/llama-cli/src/cli_args.rs index ff9556c5..9c21c07e 100644 --- a/llama-cli/src/cli_args.rs +++ b/llama-cli/src/cli_args.rs @@ -260,9 +260,9 @@ pub struct ModelLoad { pub num_ctx_tokens: usize, } impl ModelLoad { - pub fn load(&self) -> llama_rs::Model { - let model = llama_rs::Model::load(&self.model_path, self.num_ctx_tokens, |progress| { - use llama_rs::LoadProgress; + pub fn load(&self) -> llama::Llama { + let model = llama::Llama::load(&self.model_path, self.num_ctx_tokens, |progress| { + use llama::LoadProgress; match progress { LoadProgress::HyperparametersLoaded(hparams) => { log::debug!("Loaded hyperparameters {hparams:#?}") diff --git a/llama-cli/src/main.rs b/llama-cli/src/main.rs index f0f072b8..3bddc674 100644 --- a/llama-cli/src/main.rs +++ b/llama-cli/src/main.rs @@ -2,9 +2,11 @@ use std::{convert::Infallible, io::Write}; use clap::Parser; use cli_args::Args; -use 
llama_rs::{convert::convert_pth_to_ggml, InferenceError}; use rustyline::error::ReadlineError; +use llama::convert::convert_pth_to_ggml; +use llama_rs::{InferenceError, Model}; + mod cli_args; mod snapshot; diff --git a/llama-cli/src/snapshot.rs b/llama-cli/src/snapshot.rs index 3601de76..0dbe7de2 100644 --- a/llama-cli/src/snapshot.rs +++ b/llama-cli/src/snapshot.rs @@ -13,12 +13,12 @@ use zstd::{ const SNAPSHOT_COMPRESSION_LEVEL: CompressionLevel = 1; pub fn read_or_create_session( - model: &Model, + model: &llama::Llama, persist_session: Option<&Path>, load_session: Option<&Path>, inference_session_params: InferenceSessionParameters, ) -> (InferenceSession, bool) { - fn load(model: &Model, path: &Path) -> InferenceSession { + fn load(model: &llama::Llama, path: &Path) -> InferenceSession { let file = unwrap_or_exit(File::open(path), || format!("Could not open file {path:?}")); let decoder = unwrap_or_exit(Decoder::new(BufReader::new(file)), || { format!("Could not create decoder for {path:?}") diff --git a/llama-rs/Cargo.toml b/llama-rs/Cargo.toml index 076dd7bc..26f863b1 100644 --- a/llama-rs/Cargo.toml +++ b/llama-rs/Cargo.toml @@ -9,18 +9,10 @@ rust-version = "1.65" [dependencies] ggml = { path = "../ggml" } +bytemuck = { workspace = true } rand = { workspace = true } +serde = { workspace = true } -bytemuck = "1.13.1" -partial_sort = "0.2.0" thiserror = "1.0" -serde = { version = "1.0", features = ["derive"] } -serde_bytes = "0.11" - -# Used for the `convert` feature -serde_json = { version = "1.0", optional = true } -protobuf = { version = "= 2.14.0", optional = true } -rust_tokenizers = { version = "3.1.2", optional = true } - -[features] -convert = ["dep:serde_json", "dep:protobuf", "dep:rust_tokenizers"] \ No newline at end of file +partial_sort = "0.2.0" +serde_bytes = "0.11" \ No newline at end of file diff --git a/llama-rs/src/inference_session.rs b/llama-rs/src/inference_session.rs index 3af27812..65d96899 100644 --- a/llama-rs/src/inference_session.rs +++ b/llama-rs/src/inference_session.rs @@ -5,7 +5,7 @@ use rand::{distributions::WeightedIndex, prelude::Distribution}; use thiserror::Error; use crate::{ - util::mulf, EvaluateOutputRequest, InferenceError, InferenceParameters, Model, TokenId, + mulf, EvaluateOutputRequest, InferenceError, InferenceParameters, Model, TokenId, TokenUtf8Buffer, EOT_TOKEN_ID, }; @@ -28,33 +28,36 @@ pub struct InferenceSession { // Parameters for the session. pub(crate) params: InferenceSessionParameters, - pub(crate) memory_k: ggml::Tensor, - pub(crate) memory_v: ggml::Tensor, + /// Memory K + pub memory_k: ggml::Tensor, + + /// Memory M + pub memory_v: ggml::Tensor, /// How many tokens have been fed into the model's working memory so far. - pub(crate) n_past: usize, + pub n_past: usize, /// How much memory is required per token for the temporary context used /// during inference. - pub(crate) mem_per_token: usize, + pub mem_per_token: usize, /// All tokens generated by this inference session pub(crate) tokens: Vec, /// The logits that were last predicted by the network. Zeroed out otherwise. - pub(crate) last_logits: Vec, + pub last_logits: Vec, /// Scratch buffers used during inference. /// /// The number of scratch buffers was copied from `llama.cpp`. /// There is no specific reason for this number, but one is insufficient. - pub(crate) scratch: [ggml::Buffer; 2], + pub scratch: [ggml::Buffer; 2], } impl InferenceSession { /// Feed a prompt to the model for this session. 
pub fn feed_prompt( &mut self, - model: &Model, + model: &impl Model, params: &InferenceParameters, prompt: &str, mut callback: impl FnMut(&[u8]) -> Result<(), E>, @@ -68,7 +71,7 @@ impl InferenceSession { .map(|(_, tok)| *tok) .collect(); - if self.n_past + prompt_tokens.len() >= model.hparams.n_ctx { + if self.n_past + prompt_tokens.len() >= model.n_ctx() { return Err(InferenceError::ContextFull); } @@ -92,11 +95,11 @@ impl InferenceSession { /// Infer the next token for this session. pub fn infer_next_token<'v>( &mut self, - model: &'v Model, + model: &'v impl Model, params: &InferenceParameters, rng: &mut impl rand::Rng, ) -> Result<&'v [u8], InferenceError> { - if self.n_past + 1 >= model.hparams.n_ctx { + if self.n_past + 1 >= model.n_ctx() { return Err(InferenceError::ContextFull); } @@ -129,7 +132,7 @@ impl InferenceSession { /// If `params.play_back_previous_tokens` is specified, this will "play back" all existing tokens in the session. pub fn inference_with_prompt( &mut self, - model: &Model, + model: &impl Model, params: &InferenceParameters, prompt: &str, maximum_token_count: Option, @@ -311,7 +314,7 @@ impl InferenceSession { /// Creates an [InferenceSession] from a snapshot. pub fn from_snapshot( snapshot: InferenceSnapshot, - model: &Model, + model: &impl Model, ) -> Result { let mut session = model.start_session(snapshot.session_params); @@ -340,7 +343,8 @@ impl InferenceSession { } } impl InferenceSession { - pub(crate) fn new( + /// Create a new InferenceSession + pub fn new( params: InferenceSessionParameters, n_ctx: usize, n_layer: usize, diff --git a/llama-rs/src/lib.rs b/llama-rs/src/lib.rs index 51218cb8..b7888184 100644 --- a/llama-rs/src/lib.rs +++ b/llama-rs/src/lib.rs @@ -3,13 +3,11 @@ use thiserror::Error; -#[cfg(feature = "convert")] -pub mod convert; - mod inference_session; -mod loader; -mod model; -mod util; +/// Large language model +pub mod model; +/// Utilities +pub mod util; mod vocabulary; pub use ggml::Type as ElementType; @@ -17,8 +15,7 @@ pub use inference_session::{ InferenceSession, InferenceSessionParameters, InferenceSnapshot, ModelKVMemoryType, SnapshotError, }; -pub use loader::{LoadError, LoadProgress}; -pub use model::{Hyperparameters, Model}; +pub use model::Model; pub use util::TokenUtf8Buffer; pub use vocabulary::{TokenBias, TokenId, Vocabulary}; diff --git a/llama-rs/src/model.rs b/llama-rs/src/model.rs index 370e62df..91a502a1 100644 --- a/llama-rs/src/model.rs +++ b/llama-rs/src/model.rs @@ -1,469 +1,37 @@ -use std::{collections::HashMap, path::Path}; - -use serde::Deserialize; - use crate::{ - loader, vocabulary::TokenId, EvaluateOutputRequest, InferenceParameters, InferenceSession, - InferenceSessionParameters, LoadError, LoadProgress, Vocabulary, + vocabulary::TokenId, EvaluateOutputRequest, InferenceParameters, InferenceSession, + InferenceSessionParameters, Vocabulary, }; -/// The weights for the LLaMA model. All the mutable state is split into a -/// separate struct `InferenceSession`. 
-pub struct Model { - pub(crate) hparams: Hyperparameters, - - vocabulary: Vocabulary, - - tok_embeddings: ggml::Tensor, - - norm: ggml::Tensor, - output: ggml::Tensor, - - layers: Vec, - - tensors: HashMap, - - // Must be kept alive for the model - _context: ggml::Context, -} -impl Model { - pub(crate) fn new( - context: ggml::Context, - hparams: Hyperparameters, - vocabulary: Vocabulary, - n_ff: usize, - wtype: ggml::Type, - ) -> Model { - let n_embd = hparams.n_embd; - let n_layer = hparams.n_layer; - let n_vocab = hparams.n_vocab; - - let mut tensors = HashMap::new(); - - let tok_embeddings = context.new_tensor_2d(wtype, n_embd, n_vocab); - let norm = context.new_tensor_1d(ggml::Type::F32, n_embd); - let output = context.new_tensor_2d(wtype, n_embd, n_vocab); - - tensors.insert("tok_embeddings.weight".to_owned(), tok_embeddings.share()); - tensors.insert("norm.weight".to_owned(), norm.share()); - tensors.insert("output.weight".to_owned(), output.share()); - - let mut layers = Vec::new(); - for i in 0..n_layer { - let layer = Layer { - attention_norm: context.new_tensor_1d(ggml::Type::F32, n_embd), - wq: context.new_tensor_2d(wtype, n_embd, n_embd), - wk: context.new_tensor_2d(wtype, n_embd, n_embd), - wv: context.new_tensor_2d(wtype, n_embd, n_embd), - wo: context.new_tensor_2d(wtype, n_embd, n_embd), - ffn_norm: context.new_tensor_1d(ggml::Type::F32, n_embd), - w1: context.new_tensor_2d(wtype, n_embd, n_ff), - w2: context.new_tensor_2d(wtype, n_ff, n_embd), - w3: context.new_tensor_2d(wtype, n_embd, n_ff), - }; - - tensors.insert( - format!("layers.{i}.attention_norm.weight"), - layer.attention_norm.share(), - ); - - tensors.insert(format!("layers.{i}.attention.wq.weight"), layer.wq.share()); - tensors.insert(format!("layers.{i}.attention.wk.weight"), layer.wk.share()); - tensors.insert(format!("layers.{i}.attention.wv.weight"), layer.wv.share()); - tensors.insert(format!("layers.{i}.attention.wo.weight"), layer.wo.share()); - - tensors.insert( - format!("layers.{i}.ffn_norm.weight"), - layer.ffn_norm.share(), - ); - - tensors.insert( - format!("layers.{i}.feed_forward.w1.weight"), - layer.w1.share(), - ); - tensors.insert( - format!("layers.{i}.feed_forward.w2.weight"), - layer.w2.share(), - ); - tensors.insert( - format!("layers.{i}.feed_forward.w3.weight"), - layer.w3.share(), - ); - - layers.push(layer); - } - - Model { - hparams, - vocabulary, - tok_embeddings, - norm, - output, - layers, - tensors, - _context: context, - } - } - - /// Load the model from `path` with `n_context_tokens` context tokens. - /// - /// The status of the loading process will be reported through `load_progress_callback`. - pub fn load( - path: impl AsRef, - n_context_tokens: usize, - load_progress_callback: impl FnMut(LoadProgress), - ) -> Result { - loader::load(path, n_context_tokens, load_progress_callback) - } +/// A large language model. +pub trait Model { + /// The model type. + type Model; + /// Hyperparameters for the model + type Hyperparameters; + /// Layer for the model + type Layer; /// Starts a new `InferenceSession` for this model. - pub fn start_session(&self, params: InferenceSessionParameters) -> InferenceSession { - InferenceSession::new( - params, - self.hparams.n_ctx, - self.hparams.n_layer, - self.hparams.n_embd, - self.hparams.n_vocab, - ) - } - + fn start_session(&self, params: InferenceSessionParameters) -> InferenceSession; + /// Evaluates the transformer. 
/// /// The provided `output_request` struct lets you specify which additional /// data you are interested in fetching from the transformer. Setting a /// field to a `Some` value will clear and fill the provided vector with /// data. The provided vector will be resized to the exact output size. - pub fn evaluate( + fn evaluate( &self, session: &mut InferenceSession, params: &InferenceParameters, input_tokens: &[TokenId], output_request: &mut EvaluateOutputRequest, - ) { - let n = input_tokens.len(); - let n_past = session.n_past; - let n_threads = params.n_threads; - - let memk_elsize = session.memory_k.element_size(); - let memv_elsize = session.memory_v.element_size(); - - let Hyperparameters { - n_vocab, - n_ctx, - n_embd, - n_mult: _, - n_head, - n_layer, - n_rot, - f16_: _, - } = self.hparams; - - // For the first run, we need to guess a maximum buffer size so we can measure - // the actual memory consumption of the temporary ggml context. - // - // These numbers are from `llama.cpp`, and could potentially be more efficient. - let mut buf_size = { - let buf_size_mb = if n_layer >= 80 { - 1536 - } else if n_layer >= 60 { - 1280 - } else { - 1024 - }; - buf_size_mb * 1024 * 1024 - }; - if session.mem_per_token > 0 && session.mem_per_token * n > buf_size { - // add 10% to account for ggml object overhead - buf_size = (1.1f64 * session.mem_per_token as f64 * n as f64) as usize; - }; - let ctx0 = ggml::Context::init(buf_size); - - let mut gf = ggml::ComputationGraph::new(n_threads); - - let embd = ctx0.new_tensor_1d(ggml::Type::I32, n); - unsafe { embd.write_data(bytemuck::cast_slice(input_tokens)) }; - - let mut input_layer = ctx0.op_get_rows(&self.tok_embeddings, &embd); - - for il in 0..n_layer { - let input_self_attention = input_layer.share(); - let mut current: ggml::Tensor; - - ctx0.use_scratch(Some(&mut session.scratch[0])); - - // norm - { - current = ctx0.op_rms_norm(&input_layer); - - // cur = attention_norm * cur - current = ctx0.op_mul( - &ctx0.op_repeat(&self.layers[il].attention_norm, ¤t), - ¤t, - ); - } - - // self-attention - { - // compute Q and K and RoPE them - let q_current = ctx0.op_rope( - &ctx0.op_reshape_3d( - &ctx0.op_mul_mat(&self.layers[il].wq, ¤t), - n_embd / n_head, - n_head, - n, - ), - n_past, - n_rot, - 0, - ); - let k_current = ctx0.op_rope( - &ctx0.op_reshape_3d( - &ctx0.op_mul_mat(&self.layers[il].wk, ¤t), - n_embd / n_head, - n_head, - n, - ), - n_past, - n_rot, - 0, - ); - - // store key and value to memory - { - // compute the transposed [N, n_embd] V matrix - let v_current = ctx0.op_transpose(&ctx0.op_reshape_2d( - &ctx0.op_mul_mat(&self.layers[il].wv, ¤t), - n_embd, - n, - )); - - let k = ctx0.op_view_1d( - &session.memory_k, - n * n_embd, - (memk_elsize * n_embd) * (il * n_ctx + n_past), - ); - - let v = ctx0.op_view_2d( - &session.memory_v, - n, - n_embd, - n_ctx * memv_elsize, - (il * n_ctx) * memv_elsize * n_embd + n_past * memv_elsize, - ); - - // important: storing RoPE-ed version of K in the KV cache! 
- gf.build_forward_expand(&ctx0.op_cpy(&k_current, &k)); - gf.build_forward_expand(&ctx0.op_cpy(&v_current, &v)); - } - - let q = ctx0.op_permute(&q_current, 0, 2, 1, 3); - - let k = ctx0.op_permute( - &ctx0.op_reshape_3d( - &ctx0.op_view_1d( - &session.memory_k, - (n_past + n) * n_embd, - il * n_ctx * memk_elsize * n_embd, - ), - n_embd / n_head, - n_head, - n_past + n, - ), - 0, - 2, - 1, - 3, - ); - - // K * Q - let k_q = ctx0.op_mul_mat(&k, &q); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - let k_q_scaled = ctx0.op_scale( - &k_q, - &ctx0.new_f32(1.0 / f32::sqrt(n_embd as f32 / n_head as f32)), - ); - - // KQ_masked = mask_past(KQ_scaled) - let k_q_masked = ctx0.op_diag_mask_inf(&k_q_scaled, n_past); - - // KQ = soft_max(KQ_masked) - let k_q_soft_max = ctx0.op_soft_max(&k_q_masked); - - // split cached V into n_head heads - let v = ctx0.op_view_3d( - &session.memory_v, - n_past + n, - n_embd / n_head, - n_head, - n_ctx * memv_elsize, - n_ctx * memv_elsize * n_embd / n_head, - il * n_ctx * memv_elsize * n_embd, - ); - - let k_q_v = ctx0.op_mul_mat(&v, &k_q_soft_max); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - let k_q_v_merged = ctx0.op_permute(&k_q_v, 0, 2, 1, 3); - - // cur = KQV_merged.contiguous().view(n_embd, N) - current = ctx0.op_cpy( - &k_q_v_merged, - &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, n), - ); - - // projection (no bias) - current = ctx0.op_mul_mat(&self.layers[il].wo, ¤t); - } - - ctx0.use_scratch(Some(&mut session.scratch[1])); - - let input_feed_forward = ctx0.op_add(¤t, &input_self_attention); - - // feed-forward network - { - // norm - { - current = ctx0.op_rms_norm(&input_feed_forward); - - // cur = ffn_norm*cur - current = ctx0.op_mul( - &ctx0.op_repeat(&self.layers[il].ffn_norm, ¤t), - ¤t, - ); - } - - let tmp = ctx0.op_mul_mat(&self.layers[il].w3, ¤t); - - current = ctx0.op_mul_mat(&self.layers[il].w1, ¤t); - - // SILU activation - current = ctx0.op_silu(¤t); - - current = ctx0.op_mul(¤t, &tmp); - - current = ctx0.op_mul_mat(&self.layers[il].w2, ¤t); - } - - current = ctx0.op_add(¤t, &input_feed_forward); - - // input for next layer - input_layer = current; - } - - ctx0.use_scratch(Some(&mut session.scratch[0])); - - // Used at the end to optionally extract the embeddings. - let embeddings_tensor; - - // norm - { - input_layer = ctx0.op_rms_norm(&input_layer); - - // inpL = norm*inpL - input_layer = ctx0.op_mul(&ctx0.op_repeat(&self.norm, &input_layer), &input_layer); - embeddings_tensor = input_layer.share(); - } - - // lm_head - { - input_layer = ctx0.op_mul_mat(&self.output, &input_layer); - } - - ctx0.use_scratch(None); - - // logits -> probs - // inpL = ctx0.op_soft_max(&inpL); - - // run the computation - gf.build_forward_expand(&input_layer); - ctx0.graph_compute(&mut gf); - - // return result for just the last token - // SAFETY: yolo - assert_eq!(session.last_logits.len(), n_vocab); - unsafe { - input_layer.read_data( - n_vocab * (n - 1) * std::mem::size_of::(), - bytemuck::cast_slice_mut(&mut session.last_logits), - ) - }; - - // Extract logits - if let Some(all_logits) = &mut output_request.all_logits { - all_logits.resize(n_vocab * n, 0.0); - // SAFETY: Tensor data can be read (properly aligned, initialized, - // data will not be mutated or otherwise aliased during the copy), - // and we're not reading past the end of the tensor data. 
- assert_eq!(input_layer.nelements(), n_vocab * n); - unsafe { - input_layer.read_data(0, bytemuck::cast_slice_mut(all_logits)); - } - } - - // Extract embeddings - if let Some(embeddings) = &mut output_request.embeddings { - embeddings.resize(n_embd * n, 0.0); - // SAFETY: Same rationale as for the "Extract logits" section applies. - assert_eq!(embeddings_tensor.nelements(), n_embd * n); - unsafe { - embeddings_tensor.read_data(0, bytemuck::cast_slice_mut(embeddings)); - } - } - - // Adjust the required memory per token if we didn't know that already - if session.mem_per_token == 0 { - session.mem_per_token = ctx0.used_mem() / n; - } - - // Adjust n_past to new length. - session.n_past += input_tokens.len(); - } - - /// Returns the vocabulary used by this model. - pub fn vocabulary(&self) -> &Vocabulary { - &self.vocabulary - } - - pub(crate) fn tensors(&self) -> &HashMap { - &self.tensors - } -} - -/// The hyperparameters of the model. -#[derive(Debug, Default, PartialEq, Eq, PartialOrd, Ord, Deserialize)] -pub struct Hyperparameters { - /// n_vocab - pub n_vocab: usize, - /// n_ctx - pub n_ctx: usize, - /// n_embd - pub n_embd: usize, - /// n_mult - pub n_mult: usize, - /// n_head - pub n_head: usize, - /// n_layer - pub n_layer: usize, - /// n_rot - pub n_rot: usize, - /// f16_ - pub f16_: u32, -} - -struct Layer { - attention_norm: ggml::Tensor, - - wq: ggml::Tensor, - wk: ggml::Tensor, - wv: ggml::Tensor, - wo: ggml::Tensor, + ); - // normalization - ffn_norm: ggml::Tensor, + /// Model vocabulary + fn vocabulary(&self) -> &Vocabulary; - // ff - w1: ggml::Tensor, - w2: ggml::Tensor, - w3: ggml::Tensor, + /// Model context size + fn n_ctx(&self) -> usize; } diff --git a/llama-rs/src/util.rs b/llama-rs/src/util.rs index 3eb8f06d..42fd5cdd 100644 --- a/llama-rs/src/util.rs +++ b/llama-rs/src/util.rs @@ -1,7 +1,3 @@ -use std::path::{Path, PathBuf}; - -use crate::LoadError; - /// NOTE: The original code relies in promotion rules and automatic cast between /// int to float. What we do instead is use this macro to convert every term of /// the multiplication to f64, which should have enough precision bits to hold @@ -9,14 +5,13 @@ use crate::LoadError; /// the ctx_size found using this code, and the one in llama.cpp. The number for /// rust ends up being slightly lower, but no "out of memory" errors are /// reported by ggml. +#[macro_export] macro_rules! mulf { ($term:expr, $($terms:expr),*) => { usize::try_from((($term as f64) $(* ($terms as f64))*) as u64).unwrap() }; } -pub(crate) use mulf; - /// Used to buffer incoming tokens until they produce a valid string of UTF-8 text. /// /// Tokens are *not* valid UTF-8 by themselves. However, the LLM will produce valid UTF-8 @@ -68,92 +63,3 @@ impl TokenUtf8Buffer { } } } - -pub(crate) fn find_all_model_files(main_path: &Path) -> Result, LoadError> { - Ok(collect_related_paths( - main_path, - std::fs::read_dir(main_path.parent().ok_or_else(|| LoadError::NoParentPath { - path: main_path.to_owned(), - })?)? 
- .filter_map(Result::ok) - .map(|de| de.path()), - )) -} - -fn collect_related_paths( - main_path: &Path, - directory_paths: impl Iterator, -) -> Vec { - let main_filename = main_path.file_name().and_then(|p| p.to_str()); - - let mut paths: Vec = directory_paths - .filter(|p| { - p.file_name() - .and_then(|p| p.to_str()) - .zip(main_filename) - .map(|(part_filename, main_filename)| { - match part_filename.strip_prefix(main_filename) { - Some(suffix) => { - suffix.is_empty() - || (suffix - .strip_prefix('.') - .map(|s| s.parse::().is_ok()) - .unwrap_or(false)) - } - None => false, - } - }) - .unwrap_or(false) - }) - .collect(); - paths.sort(); - paths -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_collect_related_paths() { - let main_path = PathBuf::from("/models/llama.bin"); - let directory_paths = [ - "/models/llama.bin", - "/models/llama.bin.1", - "/models/llama.bin.2", - "/models/llama.bin.tmp", - ] - .map(PathBuf::from); - let expected_paths = [ - "/models/llama.bin", - "/models/llama.bin.1", - "/models/llama.bin.2", - ] - .map(PathBuf::from); - - let output_paths = collect_related_paths(&main_path, directory_paths.into_iter()); - assert_eq!(expected_paths.as_slice(), output_paths); - } - - #[test] - fn test_valid_utf8() { - let mut buffer = TokenUtf8Buffer::new(); - assert_eq!(buffer.push(b"hello").as_deref(), Some("hello")); - assert_eq!(buffer.push(&[0xE2, 0x82, 0xAC]).as_deref(), Some("€")); - } - - #[test] - fn test_partial_utf8() { - let mut buffer = TokenUtf8Buffer::new(); - assert_eq!(buffer.push(&[0xE2, 0x82]).as_deref(), None); - assert_eq!(buffer.push(&[0xAC]).as_deref(), Some("€")); - } - - #[test] - fn test_invalid_prelude_for_valid_utf8() { - let mut buffer = TokenUtf8Buffer::new(); - assert_eq!(buffer.push(&[0xD8]).as_deref(), None); - assert_eq!(buffer.push(&[0xE2, 0x82]).as_deref(), None); - assert_eq!(buffer.push(&[0xAC]).as_deref(), Some("€")); - } -} diff --git a/llama-rs/src/vocabulary.rs b/llama-rs/src/vocabulary.rs index 80e619c7..cf4a6df8 100644 --- a/llama-rs/src/vocabulary.rs +++ b/llama-rs/src/vocabulary.rs @@ -11,16 +11,16 @@ pub(crate) type TokenScore = f32; #[derive(Debug, Clone)] pub struct Vocabulary { /// Maps every integer (index) token id to its corresponding token - pub(crate) id_to_token: Vec, + pub id_to_token: Vec, /// Maps every integer (index) token id to corresponding score - pub(crate) id_to_token_score: Vec, + pub id_to_token_score: Vec, /// Maps a token to a token id - pub(crate) token_to_id: HashMap, + pub token_to_id: HashMap, /// The longest token in this vocabulary - pub(crate) max_token_length: usize, + pub max_token_length: usize, } impl Vocabulary { pub(crate) fn token(&self, idx: usize) -> &[u8] { diff --git a/llama/Cargo.toml b/llama/Cargo.toml new file mode 100644 index 00000000..6fbf1740 --- /dev/null +++ b/llama/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "llama" +version = { workspace = true } +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +bytemuck = { workspace = true } +ggml = { path = "../ggml" } +llama-rs = { path = "../llama-rs" } + +serde = { workspace = true } + +thiserror = "1.0" + +# Used for the `convert` feature +serde_json = { version = "1.0", optional = true } +protobuf = { version = "= 2.14.0", optional = true } +rust_tokenizers = { version = "3.1.2", optional = true } + +[features] +convert = ["dep:serde_json", "dep:protobuf", "dep:rust_tokenizers"] diff --git a/llama-rs/src/convert.rs 
b/llama/src/convert.rs similarity index 97% rename from llama-rs/src/convert.rs rename to llama/src/convert.rs index fc562d48..679fcb42 100644 --- a/llama-rs/src/convert.rs +++ b/llama/src/convert.rs @@ -16,7 +16,7 @@ use std::{ vec, }; -use crate::{util, Hyperparameters, Vocabulary}; +use crate::{ggml_loader::find_all_model_files, Hyperparameters, Vocabulary}; /// Converts a `pth` file to a `ggml` file. pub fn convert_pth_to_ggml(model_directory: &Path, element_type: ggml::Type) { @@ -25,7 +25,7 @@ pub fn convert_pth_to_ggml(model_directory: &Path, element_type: ggml::Type) { let hparams = load_hyperparameters(model_directory, element_type, &vocab); - let model_files = util::find_all_model_files(model_directory).unwrap(); + let model_files = find_all_model_files(model_directory).unwrap(); for (i, _file) in model_files.iter().enumerate() { let fname_out = model_directory.join(format!("rust-model-{}.bin", element_type)); diff --git a/llama-rs/src/loader.rs b/llama/src/ggml_loader.rs similarity index 82% rename from llama-rs/src/loader.rs rename to llama/src/ggml_loader.rs index f99344dd..903e2f56 100644 --- a/llama-rs/src/loader.rs +++ b/llama/src/ggml_loader.rs @@ -1,143 +1,20 @@ use std::{ - collections::HashMap, - io::{BufRead, Read, Seek, SeekFrom}, - path::{Path, PathBuf}, + collections::HashMap, + io::{BufRead, Read, Seek, SeekFrom}, + path::{Path, PathBuf}, }; +use llama_rs::{mulf, TokenId, Vocabulary}; use thiserror::Error; -use crate::{ - util::{self, mulf}, - vocabulary::TokenId, - Hyperparameters, Model, Vocabulary, -}; - -/// Each variant represents a step within the process of loading the model. -/// These can be used to report progress to the user. -#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug)] -pub enum LoadProgress<'a> { - /// The hyperparameters have been loaded from the model. - HyperparametersLoaded(&'a Hyperparameters), - /// The context has been created. - ContextSize { - /// The size of the context. - bytes: usize, - }, - /// A part of the model is being loaded. - PartLoading { - /// The path to the model part. - file: &'a Path, - /// The current part (0-indexed). - current_part: usize, - /// The number of total parts. - total_parts: usize, - }, - /// A tensor from the current part has been loaded. - PartTensorLoaded { - /// The path to the model part. - file: &'a Path, - /// The current tensor (0-indexed). - current_tensor: usize, - /// The number of total tensors. - tensor_count: usize, - }, - /// A model part has finished fully loading. - PartLoaded { - /// The path to the model part. - file: &'a Path, - /// The number of bytes in the part. - byte_size: usize, - /// The number of tensors in the part. - tensor_count: usize, - }, -} - -#[derive(Error, Debug)] -/// Errors encountered during the loading process. -pub enum LoadError { - #[error("could not open file {path:?}")] - /// A file failed to open. - OpenFileFailed { - /// The original error. - source: std::io::Error, - /// The path that failed. - path: PathBuf, - }, - #[error("no parent path for {path:?}")] - /// There is no parent path for a given path. - NoParentPath { - /// The path without a parent. - path: PathBuf, - }, - #[error("unable to read exactly {bytes} bytes")] - /// Reading exactly `bytes` from a file failed. - ReadExactFailed { - /// The original error. - source: std::io::Error, - /// The number of bytes that were attempted to be read. - bytes: usize, - }, - #[error("non-specific I/O error")] - /// A non-specific IO error. 
- IO(#[from] std::io::Error), - #[error("could not convert bytes to a UTF-8 string")] - /// One of the strings encountered was not valid UTF-8. - InvalidUtf8(#[from] std::string::FromUtf8Error), - #[error("invalid integer conversion")] - /// One of the integers encountered could not be converted to a more appropriate type. - InvalidIntegerConversion(#[from] std::num::TryFromIntError), - #[error("invalid magic number for {path:?}")] - /// An invalid magic number was encountered during the loading process. - InvalidMagic { - /// The path that failed. - path: PathBuf, - }, - #[error("invalid file format version {value}")] - /// The version of the format is not supported by this version of `llama-rs`. - InvalidFormatVersion { - /// The version that was encountered. - value: u32, - }, - #[error("invalid value {ftype} for `f16` in hyperparameters")] - /// The `f16` hyperparameter had an invalid value. - HyperparametersF16Invalid { - /// The format type that was encountered. - ftype: u32, - }, - #[error("unknown tensor `{tensor_name}` in {path:?}")] - /// The tensor `tensor_name` was encountered during the loading of `path`, but was not seen during - /// the model prelude. - UnknownTensor { - /// The name of the tensor. - tensor_name: String, - /// The path that failed. - path: PathBuf, - }, - #[error("the tensor `{tensor_name}` has the wrong size in {path:?}")] - /// The tensor `tensor_name` did not match its expected size. - TensorWrongSize { - /// The name of the tensor. - tensor_name: String, - /// The path that failed. - path: PathBuf, - }, - /// The tensor `tensor_name` did not have the expected format type. - #[error("invalid ftype {ftype} for tensor `{tensor_name}` in {path:?}")] - InvalidFtype { - /// The name of the tensor. - tensor_name: String, - /// The format type that was encountered. - ftype: u32, - /// The path that failed. - path: PathBuf, - }, -} +use crate::{Hyperparameters, Llama}; +/// Load a model from disk pub fn load( path: impl AsRef, n_context_tokens: usize, mut load_progress_callback: impl FnMut(LoadProgress), -) -> Result { +) -> Result { use std::fs::File; use std::io::BufReader; @@ -276,14 +153,14 @@ pub fn load( // Initialize the context let context = ggml::Context::init(ctx_size); - let model = Model::new(context, hparams, vocabulary, n_ff, wtype); + let model = Llama::new(context, hparams, vocabulary, n_ff, wtype); // Close the file, but keep its offset. That way we know how to skip the // metadata when loading the parts. let file_offset = reader.stream_position()?; drop(reader); - let paths = util::find_all_model_files(main_path)?; + let paths = find_all_model_files(main_path)?; let n_parts = paths.len(); for (i, part_path) in paths.into_iter().enumerate() { @@ -514,41 +391,260 @@ pub fn load( Ok(model) } +/// Each variant represents a step within the process of loading the model. +/// These can be used to report progress to the user. +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug)] +pub enum LoadProgress<'a> { + /// The hyperparameters have been loaded from the model. + HyperparametersLoaded(&'a Hyperparameters), + /// The context has been created. + ContextSize { + /// The size of the context. + bytes: usize, + }, + /// A part of the model is being loaded. + PartLoading { + /// The path to the model part. + file: &'a Path, + /// The current part (0-indexed). + current_part: usize, + /// The number of total parts. + total_parts: usize, + }, + /// A tensor from the current part has been loaded. + PartTensorLoaded { + /// The path to the model part. 
+ file: &'a Path, + /// The current tensor (0-indexed). + current_tensor: usize, + /// The number of total tensors. + tensor_count: usize, + }, + /// A model part has finished fully loading. + PartLoaded { + /// The path to the model part. + file: &'a Path, + /// The number of bytes in the part. + byte_size: usize, + /// The number of tensors in the part. + tensor_count: usize, + }, +} + +#[derive(Error, Debug)] +/// Errors encountered during the loading process. +pub enum LoadError { + #[error("could not open file {path:?}")] + /// A file failed to open. + OpenFileFailed { + /// The original error. + source: std::io::Error, + /// The path that failed. + path: PathBuf, + }, + #[error("no parent path for {path:?}")] + /// There is no parent path for a given path. + NoParentPath { + /// The path without a parent. + path: PathBuf, + }, + #[error("unable to read exactly {bytes} bytes")] + /// Reading exactly `bytes` from a file failed. + ReadExactFailed { + /// The original error. + source: std::io::Error, + /// The number of bytes that were attempted to be read. + bytes: usize, + }, + #[error("non-specific I/O error")] + /// A non-specific IO error. + IO(#[from] std::io::Error), + #[error("could not convert bytes to a UTF-8 string")] + /// One of the strings encountered was not valid UTF-8. + InvalidUtf8(#[from] std::string::FromUtf8Error), + #[error("invalid integer conversion")] + /// One of the integers encountered could not be converted to a more appropriate type. + InvalidIntegerConversion(#[from] std::num::TryFromIntError), + #[error("invalid magic number for {path:?}")] + /// An invalid magic number was encountered during the loading process. + InvalidMagic { + /// The path that failed. + path: PathBuf, + }, + #[error("invalid file format version {value}")] + /// The version of the format is not supported by this version of `llama-rs`. + InvalidFormatVersion { + /// The version that was encountered. + value: u32, + }, + #[error("invalid value {ftype} for `f16` in hyperparameters")] + /// The `f16` hyperparameter had an invalid value. + HyperparametersF16Invalid { + /// The format type that was encountered. + ftype: u32, + }, + #[error("unknown tensor `{tensor_name}` in {path:?}")] + /// The tensor `tensor_name` was encountered during the loading of `path`, but was not seen during + /// the model prelude. + UnknownTensor { + /// The name of the tensor. + tensor_name: String, + /// The path that failed. + path: PathBuf, + }, + #[error("the tensor `{tensor_name}` has the wrong size in {path:?}")] + /// The tensor `tensor_name` did not match its expected size. + TensorWrongSize { + /// The name of the tensor. + tensor_name: String, + /// The path that failed. + path: PathBuf, + }, + /// The tensor `tensor_name` did not have the expected format type. + #[error("invalid ftype {ftype} for tensor `{tensor_name}` in {path:?}")] + InvalidFtype { + /// The name of the tensor. + tensor_name: String, + /// The format type that was encountered. + ftype: u32, + /// The path that failed. 
+ path: PathBuf, + }, +} + +/// Read bytes pub fn read_bytes(reader: &mut impl BufRead) -> Result<[u8; N], LoadError> { - let mut bytes = [0u8; N]; - reader - .read_exact(&mut bytes) - .map_err(|e| LoadError::ReadExactFailed { - source: e, - bytes: N, - })?; - Ok(bytes) + let mut bytes = [0u8; N]; + reader + .read_exact(&mut bytes) + .map_err(|e| LoadError::ReadExactFailed { + source: e, + bytes: N, + })?; + Ok(bytes) } +/// Ready bytes with length pub fn read_bytes_with_len(reader: &mut impl BufRead, len: usize) -> Result, LoadError> { - let mut bytes = vec![0u8; len]; - reader - .read_exact(&mut bytes) - .map_err(|e| LoadError::ReadExactFailed { - source: e, - bytes: len, - })?; - Ok(bytes) + let mut bytes = vec![0u8; len]; + reader + .read_exact(&mut bytes) + .map_err(|e| LoadError::ReadExactFailed { + source: e, + bytes: len, + })?; + Ok(bytes) } +/// Read an i32 pub fn read_i32(reader: &mut impl BufRead) -> Result { - Ok(i32::from_le_bytes(read_bytes::<4>(reader)?)) + Ok(i32::from_le_bytes(read_bytes::<4>(reader)?)) } +/// Read a u32 pub fn read_u32(reader: &mut impl BufRead) -> Result { - Ok(u32::from_le_bytes(read_bytes::<4>(reader)?)) + Ok(u32::from_le_bytes(read_bytes::<4>(reader)?)) } +/// Read an f32 pub fn read_f32(reader: &mut impl BufRead) -> Result { - Ok(f32::from_le_bytes(read_bytes::<4>(reader)?)) + Ok(f32::from_le_bytes(read_bytes::<4>(reader)?)) } /// Helper function. Reads a string from the buffer and returns it. pub fn read_string(reader: &mut impl BufRead, len: usize) -> Result { - Ok(String::from_utf8(read_bytes_with_len(reader, len)?)?) + Ok(String::from_utf8(read_bytes_with_len(reader, len)?)?) +} + + + +pub fn find_all_model_files(main_path: &Path) -> Result, LoadError> { + Ok(collect_related_paths( + main_path, + std::fs::read_dir(main_path.parent().ok_or_else(|| LoadError::NoParentPath { + path: main_path.to_owned(), + })?)? 
+ .filter_map(Result::ok) + .map(|de| de.path()), + )) +} + +fn collect_related_paths( + main_path: &Path, + directory_paths: impl Iterator, +) -> Vec { + let main_filename = main_path.file_name().and_then(|p| p.to_str()); + + let mut paths: Vec = directory_paths + .filter(|p| { + p.file_name() + .and_then(|p| p.to_str()) + .zip(main_filename) + .map(|(part_filename, main_filename)| { + match part_filename.strip_prefix(main_filename) { + Some(suffix) => { + suffix.is_empty() + || (suffix + .strip_prefix('.') + .map(|s| s.parse::().is_ok()) + .unwrap_or(false)) + } + None => false, + } + }) + .unwrap_or(false) + }) + .collect(); + paths.sort(); + paths +} + +#[cfg(test)] +mod tests { + use super::*; + + use llama_rs::util::TokenUtf8Buffer; + + #[test] + fn test_collect_related_paths() { + let main_path = PathBuf::from("/models/llama.bin"); + let directory_paths = [ + "/models/llama.bin", + "/models/llama.bin.1", + "/models/llama.bin.2", + "/models/llama.bin.tmp", + ] + .map(PathBuf::from); + let expected_paths = [ + "/models/llama.bin", + "/models/llama.bin.1", + "/models/llama.bin.2", + ] + .map(PathBuf::from); + + let output_paths = collect_related_paths(&main_path, directory_paths.into_iter()); + assert_eq!(expected_paths.as_slice(), output_paths); + } + + #[test] + fn test_valid_utf8() { + let mut buffer = TokenUtf8Buffer::new(); + assert_eq!(buffer.push(b"hello").as_deref(), Some("hello")); + assert_eq!(buffer.push(&[0xE2, 0x82, 0xAC]).as_deref(), Some("€")); + } + + #[test] + fn test_partial_utf8() { + let mut buffer = TokenUtf8Buffer::new(); + assert_eq!(buffer.push(&[0xE2, 0x82]).as_deref(), None); + assert_eq!(buffer.push(&[0xAC]).as_deref(), Some("€")); + } + + #[test] + fn test_invalid_prelude_for_valid_utf8() { + let mut buffer = TokenUtf8Buffer::new(); + assert_eq!(buffer.push(&[0xD8]).as_deref(), None); + assert_eq!(buffer.push(&[0xE2, 0x82]).as_deref(), None); + assert_eq!(buffer.push(&[0xAC]).as_deref(), Some("€")); + } } diff --git a/llama/src/lib.rs b/llama/src/lib.rs new file mode 100644 index 00000000..bdb79ce6 --- /dev/null +++ b/llama/src/lib.rs @@ -0,0 +1,475 @@ +use std::collections::HashMap; + +use serde::Deserialize; + +use llama_rs::{InferenceSession, InferenceSessionParameters, model::Model, Vocabulary}; + +#[cfg(feature = "convert")] +pub mod convert; + +mod ggml_loader; + +pub use ggml_loader::{load, LoadError, LoadProgress}; + +pub struct Llama { + pub(crate) hparams: Hyperparameters, + + vocabulary: Vocabulary, + + tok_embeddings: ggml::Tensor, + + norm: ggml::Tensor, + output: ggml::Tensor, + + layers: Vec, + + tensors: HashMap, + + // Must be kept alive for the model + _context: ggml::Context, +} + +impl Model for Llama { + type Model = Llama; + type Hyperparameters = Hyperparameters; + type Layer = Layer; + + fn start_session(&self, params: InferenceSessionParameters) -> InferenceSession { + InferenceSession::new( + params, + self.hparams.n_ctx, + self.hparams.n_layer, + self.hparams.n_embd, + self.hparams.n_vocab, + ) + } + + fn evaluate( + &self, + session: &mut InferenceSession, + params: &llama_rs::InferenceParameters, + input_tokens: &[llama_rs::TokenId], + output_request: &mut llama_rs::EvaluateOutputRequest, + ) { + let n = input_tokens.len(); + let n_past = session.n_past; + let n_threads = params.n_threads; + + let memk_elsize = session.memory_k.element_size(); + let memv_elsize = session.memory_v.element_size(); + + let Hyperparameters { + n_vocab, + n_ctx, + n_embd, + n_mult: _, + n_head, + n_layer, + n_rot, + f16_: _, + } = self.hparams; + + // 
For the first run, we need to guess a maximum buffer size so we can measure + // the actual memory consumption of the temporary ggml context. + // + // These numbers are from `llama.cpp`, and could potentially be more efficient. + let mut buf_size = { + let buf_size_mb = if n_layer >= 80 { + 1536 + } else if n_layer >= 60 { + 1280 + } else { + 1024 + }; + buf_size_mb * 1024 * 1024 + }; + if session.mem_per_token > 0 && session.mem_per_token * n > buf_size { + // add 10% to account for ggml object overhead + buf_size = (1.1f64 * session.mem_per_token as f64 * n as f64) as usize; + }; + let ctx0 = ggml::Context::init(buf_size); + + let mut gf = ggml::ComputationGraph::new(n_threads); + + let embd = ctx0.new_tensor_1d(ggml::Type::I32, n); + unsafe { embd.write_data(bytemuck::cast_slice(input_tokens)) }; + + let mut input_layer = ctx0.op_get_rows(&self.tok_embeddings, &embd); + + for il in 0..n_layer { + let input_self_attention = input_layer.share(); + let mut current: ggml::Tensor; + + ctx0.use_scratch(Some(&mut session.scratch[0])); + + // norm + { + current = ctx0.op_rms_norm(&input_layer); + + // cur = attention_norm * cur + current = ctx0.op_mul( + &ctx0.op_repeat(&self.layers[il].attention_norm, ¤t), + ¤t, + ); + } + + // self-attention + { + // compute Q and K and RoPE them + let q_current = ctx0.op_rope( + &ctx0.op_reshape_3d( + &ctx0.op_mul_mat(&self.layers[il].wq, ¤t), + n_embd / n_head, + n_head, + n, + ), + n_past, + n_rot, + 0, + ); + let k_current = ctx0.op_rope( + &ctx0.op_reshape_3d( + &ctx0.op_mul_mat(&self.layers[il].wk, ¤t), + n_embd / n_head, + n_head, + n, + ), + n_past, + n_rot, + 0, + ); + + // store key and value to memory + { + // compute the transposed [N, n_embd] V matrix + let v_current = ctx0.op_transpose(&ctx0.op_reshape_2d( + &ctx0.op_mul_mat(&self.layers[il].wv, ¤t), + n_embd, + n, + )); + + let k = ctx0.op_view_1d( + &session.memory_k, + n * n_embd, + (memk_elsize * n_embd) * (il * n_ctx + n_past), + ); + + let v = ctx0.op_view_2d( + &session.memory_v, + n, + n_embd, + n_ctx * memv_elsize, + (il * n_ctx) * memv_elsize * n_embd + n_past * memv_elsize, + ); + + // important: storing RoPE-ed version of K in the KV cache! 
+ gf.build_forward_expand(&ctx0.op_cpy(&k_current, &k)); + gf.build_forward_expand(&ctx0.op_cpy(&v_current, &v)); + } + + let q = ctx0.op_permute(&q_current, 0, 2, 1, 3); + + let k = ctx0.op_permute( + &ctx0.op_reshape_3d( + &ctx0.op_view_1d( + &session.memory_k, + (n_past + n) * n_embd, + il * n_ctx * memk_elsize * n_embd, + ), + n_embd / n_head, + n_head, + n_past + n, + ), + 0, + 2, + 1, + 3, + ); + + // K * Q + let k_q = ctx0.op_mul_mat(&k, &q); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + let k_q_scaled = ctx0.op_scale( + &k_q, + &ctx0.new_f32(1.0 / f32::sqrt(n_embd as f32 / n_head as f32)), + ); + + // KQ_masked = mask_past(KQ_scaled) + let k_q_masked = ctx0.op_diag_mask_inf(&k_q_scaled, n_past); + + // KQ = soft_max(KQ_masked) + let k_q_soft_max = ctx0.op_soft_max(&k_q_masked); + + // split cached V into n_head heads + let v = ctx0.op_view_3d( + &session.memory_v, + n_past + n, + n_embd / n_head, + n_head, + n_ctx * memv_elsize, + n_ctx * memv_elsize * n_embd / n_head, + il * n_ctx * memv_elsize * n_embd, + ); + + let k_q_v = ctx0.op_mul_mat(&v, &k_q_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + let k_q_v_merged = ctx0.op_permute(&k_q_v, 0, 2, 1, 3); + + // cur = KQV_merged.contiguous().view(n_embd, N) + current = ctx0.op_cpy( + &k_q_v_merged, + &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, n), + ); + + // projection (no bias) + current = ctx0.op_mul_mat(&self.layers[il].wo, ¤t); + } + + ctx0.use_scratch(Some(&mut session.scratch[1])); + + let input_feed_forward = ctx0.op_add(¤t, &input_self_attention); + + // feed-forward network + { + // norm + { + current = ctx0.op_rms_norm(&input_feed_forward); + + // cur = ffn_norm*cur + current = ctx0.op_mul( + &ctx0.op_repeat(&self.layers[il].ffn_norm, ¤t), + ¤t, + ); + } + + let tmp = ctx0.op_mul_mat(&self.layers[il].w3, ¤t); + + current = ctx0.op_mul_mat(&self.layers[il].w1, ¤t); + + // SILU activation + current = ctx0.op_silu(¤t); + + current = ctx0.op_mul(¤t, &tmp); + + current = ctx0.op_mul_mat(&self.layers[il].w2, ¤t); + } + + current = ctx0.op_add(¤t, &input_feed_forward); + + // input for next layer + input_layer = current; + } + + ctx0.use_scratch(Some(&mut session.scratch[0])); + + // Used at the end to optionally extract the embeddings. + let embeddings_tensor; + + // norm + { + input_layer = ctx0.op_rms_norm(&input_layer); + + // inpL = norm*inpL + input_layer = ctx0.op_mul(&ctx0.op_repeat(&self.norm, &input_layer), &input_layer); + embeddings_tensor = input_layer.share(); + } + + // lm_head + { + input_layer = ctx0.op_mul_mat(&self.output, &input_layer); + } + + ctx0.use_scratch(None); + + // logits -> probs + // inpL = ctx0.op_soft_max(&inpL); + + // run the computation + gf.build_forward_expand(&input_layer); + ctx0.graph_compute(&mut gf); + + // return result for just the last token + // SAFETY: yolo + assert_eq!(session.last_logits.len(), n_vocab); + unsafe { + input_layer.read_data( + n_vocab * (n - 1) * std::mem::size_of::(), + bytemuck::cast_slice_mut(&mut session.last_logits), + ) + }; + + // Extract logits + if let Some(all_logits) = &mut output_request.all_logits { + all_logits.resize(n_vocab * n, 0.0); + // SAFETY: Tensor data can be read (properly aligned, initialized, + // data will not be mutated or otherwise aliased during the copy), + // and we're not reading past the end of the tensor data. 
+ assert_eq!(input_layer.nelements(), n_vocab * n); + unsafe { + input_layer.read_data(0, bytemuck::cast_slice_mut(all_logits)); + } + } + + // Extract embeddings + if let Some(embeddings) = &mut output_request.embeddings { + embeddings.resize(n_embd * n, 0.0); + // SAFETY: Same rationale as for the "Extract logits" section applies. + assert_eq!(embeddings_tensor.nelements(), n_embd * n); + unsafe { + embeddings_tensor.read_data(0, bytemuck::cast_slice_mut(embeddings)); + } + } + + // Adjust the required memory per token if we didn't know that already + if session.mem_per_token == 0 { + session.mem_per_token = ctx0.used_mem() / n; + } + + // Adjust n_past to new length. + session.n_past += input_tokens.len(); + } + + /// Returns the vocabulary used by this model. + fn vocabulary(&self) -> &Vocabulary { + &self.vocabulary + } + + fn n_ctx(&self) -> usize { + self.hparams.n_ctx + } +} + +impl Llama { + /// Load the model from `path` with `n_context_tokens` context tokens. + /// + /// The status of the loading process will be reported through `load_progress_callback`. + pub fn load( + path: impl AsRef, + n_context_tokens: usize, + load_progress_callback: impl FnMut(LoadProgress), + ) -> Result { + load(path, n_context_tokens, load_progress_callback) + } + + pub(crate) fn new( + context: ggml::Context, + hparams: Hyperparameters, + vocabulary: Vocabulary, + n_ff: usize, + wtype: ggml::Type, + ) -> Llama { + let n_embd = hparams.n_embd; + let n_layer = hparams.n_layer; + let n_vocab = hparams.n_vocab; + + let mut tensors = HashMap::new(); + + let tok_embeddings = context.new_tensor_2d(wtype, n_embd, n_vocab); + let norm = context.new_tensor_1d(ggml::Type::F32, n_embd); + let output = context.new_tensor_2d(wtype, n_embd, n_vocab); + + tensors.insert("tok_embeddings.weight".to_owned(), tok_embeddings.share()); + tensors.insert("norm.weight".to_owned(), norm.share()); + tensors.insert("output.weight".to_owned(), output.share()); + + let mut layers = Vec::new(); + for i in 0..n_layer { + let layer = Layer { + attention_norm: context.new_tensor_1d(ggml::Type::F32, n_embd), + wq: context.new_tensor_2d(wtype, n_embd, n_embd), + wk: context.new_tensor_2d(wtype, n_embd, n_embd), + wv: context.new_tensor_2d(wtype, n_embd, n_embd), + wo: context.new_tensor_2d(wtype, n_embd, n_embd), + ffn_norm: context.new_tensor_1d(ggml::Type::F32, n_embd), + w1: context.new_tensor_2d(wtype, n_embd, n_ff), + w2: context.new_tensor_2d(wtype, n_ff, n_embd), + w3: context.new_tensor_2d(wtype, n_embd, n_ff), + }; + + tensors.insert( + format!("layers.{i}.attention_norm.weight"), + layer.attention_norm.share(), + ); + + tensors.insert(format!("layers.{i}.attention.wq.weight"), layer.wq.share()); + tensors.insert(format!("layers.{i}.attention.wk.weight"), layer.wk.share()); + tensors.insert(format!("layers.{i}.attention.wv.weight"), layer.wv.share()); + tensors.insert(format!("layers.{i}.attention.wo.weight"), layer.wo.share()); + + tensors.insert( + format!("layers.{i}.ffn_norm.weight"), + layer.ffn_norm.share(), + ); + + tensors.insert( + format!("layers.{i}.feed_forward.w1.weight"), + layer.w1.share(), + ); + tensors.insert( + format!("layers.{i}.feed_forward.w2.weight"), + layer.w2.share(), + ); + tensors.insert( + format!("layers.{i}.feed_forward.w3.weight"), + layer.w3.share(), + ); + + layers.push(layer); + } + + Llama { + hparams, + vocabulary, + tok_embeddings, + norm, + output, + layers, + tensors, + _context: context, + } + } + + pub(crate) fn tensors(&self) -> &HashMap { + &self.tensors + } +} + +/// The 
hyperparameters of the model. +#[derive(Debug, Default, PartialEq, Eq, PartialOrd, Ord, Deserialize)] +pub struct Hyperparameters { + /// n_vocab + pub n_vocab: usize, + /// n_ctx + pub n_ctx: usize, + /// n_embd + pub n_embd: usize, + /// n_mult + pub n_mult: usize, + /// n_head + pub n_head: usize, + /// n_layer + pub n_layer: usize, + /// n_rot + pub n_rot: usize, + /// f16_ + pub f16_: u32, +} + +pub struct Layer { + attention_norm: ggml::Tensor, + + wq: ggml::Tensor, + wk: ggml::Tensor, + wv: ggml::Tensor, + wo: ggml::Tensor, + + // normalization + ffn_norm: ggml::Tensor, + + // ff + w1: ggml::Tensor, + w2: ggml::Tensor, + w3: ggml::Tensor, +} From e0713a109376a9545aa8d44e2ce0569ddec74e80 Mon Sep 17 00:00:00 2001 From: Dan Forbes Date: Sat, 15 Apr 2023 14:21:07 -0700 Subject: [PATCH 02/35] Bloom model Modified code by @hhamud --- Cargo.lock | 35 +- Cargo.toml | 3 + bloom-cli/Cargo.toml | 21 + bloom-cli/src/cli_args.rs | 333 +++++++++++++++ bloom-cli/src/main.rs | 196 +++++++++ bloom/Cargo.toml | 12 + bloom/src/ggml_loader.rs | 406 +++++++++++++++++++ bloom/src/lib.rs | 517 ++++++++++++++++++++++++ ggml-sys/ggml/ggml.c | 203 +++++++++- ggml-sys/ggml/ggml.h | 9 + ggml-sys/src/lib.rs | 8 + ggml/Cargo.toml | 6 +- ggml/src/lib.rs | 23 ++ ggml/src/loader.rs | 261 ++++++++++++ llama-cli/Cargo.toml | 5 +- llama-cli/src/cli_args.rs | 55 +-- llama-cli/src/main.rs | 3 +- llama-rs/Cargo.toml | 7 +- llama-rs/src/lib.rs | 5 +- {llama-cli => llama-rs}/src/snapshot.rs | 12 +- llama/Cargo.toml | 4 +- llama/src/convert.rs | 4 +- llama/src/ggml_loader.rs | 271 +------------ llama/src/lib.rs | 7 +- 24 files changed, 2069 insertions(+), 337 deletions(-) create mode 100644 bloom-cli/Cargo.toml create mode 100644 bloom-cli/src/cli_args.rs create mode 100644 bloom-cli/src/main.rs create mode 100644 bloom/Cargo.toml create mode 100644 bloom/src/ggml_loader.rs create mode 100644 bloom/src/lib.rs create mode 100644 ggml/src/loader.rs rename {llama-cli => llama-rs}/src/snapshot.rs (89%) diff --git a/Cargo.lock b/Cargo.lock index 475a847d..06175af9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -60,6 +60,32 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bloom" +version = "0.1.0" +dependencies = [ + "bytemuck", + "ggml", + "llama-rs", +] + +[[package]] +name = "bloom-cli" +version = "0.1.0" +dependencies = [ + "bloom", + "clap", + "env_logger", + "ggml", + "llama-rs", + "log", + "num_cpus", + "once_cell", + "rand", + "rustyline", + "spinners", +] + [[package]] name = "bytemuck" version = "1.13.1" @@ -324,6 +350,8 @@ name = "ggml" version = "0.1.0" dependencies = [ "ggml-sys", + "log", + "thiserror", ] [[package]] @@ -467,16 +495,15 @@ dependencies = [ "rust_tokenizers", "serde", "serde_json", - "thiserror", ] [[package]] name = "llama-cli" version = "0.1.0" dependencies = [ - "bincode", "clap", "env_logger", + "ggml", "llama", "llama-rs", "log", @@ -485,20 +512,22 @@ dependencies = [ "rand", "rustyline", "spinners", - "zstd", ] [[package]] name = "llama-rs" version = "0.1.0" dependencies = [ + "bincode", "bytemuck", "ggml", + "log", "partial_sort", "rand", "serde", "serde_bytes", "thiserror", + "zstd", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 2621b08f..bef02276 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,7 @@ [workspace] members = [ + "bloom", + "bloom-cli", "ggml-sys", "ggml", "llama", @@ -14,5 +16,6 @@ version = "0.1.0" [workspace.dependencies] bytemuck = 
"1.13.1" +log = "0.4" rand = "0.8.5" serde = { version = "1.0", features = ["derive"] } diff --git a/bloom-cli/Cargo.toml b/bloom-cli/Cargo.toml new file mode 100644 index 00000000..1d3de63d --- /dev/null +++ b/bloom-cli/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "bloom-cli" +version = { workspace = true } +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +bloom = { path = "../bloom" } +ggml = { path = "../ggml" } +llama-rs = { path = "../llama-rs" } + +log = { workspace = true } +rand = { workspace = true } + +clap = { version = "4.1.8", features = ["derive"] } +env_logger = "0.10.0" +num_cpus = "1.15.0" +once_cell = "1.17.1" +rustyline = "11.0.0" +spinners = "4.1.0" diff --git a/bloom-cli/src/cli_args.rs b/bloom-cli/src/cli_args.rs new file mode 100644 index 00000000..91f03fe2 --- /dev/null +++ b/bloom-cli/src/cli_args.rs @@ -0,0 +1,333 @@ +use std::path::PathBuf; + +use clap::{Parser, ValueEnum}; +use rand::SeedableRng; + +use ggml::loader::load_progress; + +use llama_rs::{ + InferenceParameters, InferenceSessionParameters, ModelKVMemoryType, TokenBias, EOT_TOKEN_ID, +}; + +#[derive(Parser, Debug)] +#[command(author, version, about, long_about = None)] +pub enum Args { + #[command()] + /// Use a model to infer the next tokens in a sequence, and exit + Infer(Box), + + #[command()] + /// Dumps the prompt to console and exits, first as a comma-separated list of token IDs + /// and then as a list of comma-separated string keys and token ID values. + DumpTokens(Box), + + #[command()] + /// Use a model to interactively prompt it multiple times, while + /// resetting the context between invocations + Repl(Box), + + #[command()] + /// Use a model to interactively generate tokens, and chat with it + /// + /// Note that most, if not all, existing models are not trained for this + /// and do not support a long enough context window to be able to + /// have an extended conversation. + ChatExperimental(Box), +} + +#[derive(Parser, Debug)] +pub struct Infer { + #[command(flatten)] + pub model_load: ModelLoad, + + #[command(flatten)] + pub prompt_file: PromptFile, + + #[command(flatten)] + pub generate: Generate, + + /// The prompt to feed the generator. + /// + /// If used with `--prompt-file`/`-f`, the prompt from the file will be used + /// and `{{PROMPT}}` will be replaced with the value of `--prompt`/`-p`. + #[arg(long, short = 'p', default_value = None)] + pub prompt: Option, + + /// Saves an inference session at the given path. The same session can then be + /// loaded from disk using `--load-session`. + /// + /// Use this with `-n 0` to save just the prompt + #[arg(long, default_value = None)] + pub save_session: Option, + + /// Loads an inference session from the given path if present, and then saves + /// the result to the same path after inference is completed. + /// + /// Equivalent to `--load-session` and `--save-session` with the same path, + /// but will not error if the path does not exist + #[arg(long, default_value = None)] + pub persist_session: Option, +} + +#[derive(Parser, Debug)] +pub struct DumpTokens { + #[command(flatten)] + pub model_load: ModelLoad, + + #[command(flatten)] + pub prompt_file: PromptFile, + + /// The prompt to feed the generator. + /// + /// If used with `--prompt-file`/`-f`, the prompt from the file will be used + /// and `{{PROMPT}}` will be replaced with the value of `--prompt`/`-p`. 
+ #[arg(long, short = 'p', default_value = None)] + pub prompt: Option, +} + +#[derive(Parser, Debug)] +pub struct Repl { + #[command(flatten)] + pub model_load: ModelLoad, + + #[command(flatten)] + pub prompt_file: PromptFile, + + #[command(flatten)] + pub generate: Generate, +} + +#[derive(Parser, Debug)] +pub struct Generate { + /// Sets the number of threads to use + #[arg(long, short = 't')] + pub num_threads: Option, + + /// Sets how many tokens to predict + #[arg(long, short = 'n')] + pub num_predict: Option, + + /// How many tokens from the prompt at a time to feed the network. Does not + /// affect generation. + #[arg(long, default_value_t = 8)] + pub batch_size: usize, + + /// Size of the 'last N' buffer that is used for the `repeat_penalty` + /// option. In tokens. + #[arg(long, default_value_t = 64)] + pub repeat_last_n: usize, + + /// The penalty for repeating tokens. Higher values make the generation less + /// likely to get into a loop, but may harm results when repetitive outputs + /// are desired. + #[arg(long, default_value_t = 1.30)] + pub repeat_penalty: f32, + + /// Temperature + #[arg(long, default_value_t = 0.80)] + pub temperature: f32, + + /// Top-K: The top K words by score are kept during sampling. + #[arg(long, default_value_t = 40)] + pub top_k: usize, + + /// Top-p: The cumulative probability after which no more words are kept + /// for sampling. + #[arg(long, default_value_t = 0.95)] + pub top_p: f32, + + /// Loads a saved inference session from the given path, previously saved using + /// `--save-session` + #[arg(long, default_value = None)] + pub load_session: Option, + + /// Specifies the seed to use during sampling. Note that, depending on + /// hardware, the same seed may lead to different results on two separate + /// machines. + #[arg(long, default_value = None)] + pub seed: Option, + + /// Use 16-bit floats for model memory key and value. Ignored when restoring + /// from the cache. + #[arg(long, default_value_t = false)] + pub float16: bool, + + /// A comma separated list of token biases. The list should be in the format + /// "TID=BIAS,TID=BIAS" where TID is an integer token ID and BIAS is a + /// floating point number. + /// For example, "1=-1.0,2=-1.0" sets the bias for token IDs 1 + /// (start of document) and 2 (end of document) to -1.0 which effectively + /// disables the model from generating responses containing those token IDs. + #[arg(long, default_value = None, value_parser = parse_bias)] + pub token_bias: Option, + + /// Prevent the end of stream (EOS/EOD) token from being generated. This will allow the + /// model to generate text until it runs out of context space. Note: The --token-bias + /// option will override this if specified. 
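The token-bias format documented above is handled by the parse_bias helper further down, which simply defers to TokenBias's FromStr implementation in llama-rs (not part of this patch). Purely as an illustration of the documented "TID=BIAS,TID=BIAS" shape, a standalone parser might look like the sketch below; the function name and the u32 token-id type are placeholders, not the real API.

// Illustrative only: the real parsing lives in llama-rs's TokenBias.
fn parse_bias_pairs(s: &str) -> Result<Vec<(u32, f32)>, String> {
    s.split(',')
        .map(|pair| {
            let (tid, bias) = pair
                .split_once('=')
                .ok_or_else(|| format!("expected TID=BIAS, got `{pair}`"))?;
            let tid = tid.trim().parse::<u32>().map_err(|e| e.to_string())?;
            let bias = bias.trim().parse::<f32>().map_err(|e| e.to_string())?;
            Ok((tid, bias))
        })
        .collect()
}

// parse_bias_pairs("1=-1.0,2=-1.0") == Ok(vec![(1, -1.0), (2, -1.0)])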
+ #[arg(long, default_value_t = false)] + pub ignore_eos: bool, +} +impl Generate { + #[cfg(all(target_os = "macos", target_arch = "aarch64"))] + pub fn autodetect_num_threads(&self) -> usize { + std::process::Command::new("sysctl") + .arg("-n") + .arg("hw.perflevel0.physicalcpu") + .output() + .ok() + .and_then(|output| String::from_utf8(output.stdout).ok()?.trim().parse().ok()) + .unwrap_or(num_cpus::get_physical()) + } + + #[cfg(not(all(target_os = "macos", target_arch = "aarch64")))] + pub fn autodetect_num_threads(&self) -> usize { + num_cpus::get_physical() + } + + pub fn num_threads(&self) -> usize { + self.num_threads + .unwrap_or_else(|| self.autodetect_num_threads()) + } + + pub fn inference_session_parameters(&self) -> InferenceSessionParameters { + let mem_typ = if self.float16 { + ModelKVMemoryType::Float16 + } else { + ModelKVMemoryType::Float32 + }; + InferenceSessionParameters { + memory_k_type: mem_typ, + memory_v_type: mem_typ, + repetition_penalty_last_n: self.repeat_last_n, + } + } + + pub fn rng(&self) -> rand::rngs::StdRng { + if let Some(seed) = self.seed { + rand::rngs::StdRng::seed_from_u64(seed) + } else { + rand::rngs::StdRng::from_entropy() + } + } + + pub fn inference_parameters(&self, session_loaded: bool) -> InferenceParameters { + InferenceParameters { + n_threads: self.num_threads(), + n_batch: self.batch_size, + top_k: self.top_k, + top_p: self.top_p, + repeat_penalty: self.repeat_penalty, + temperature: self.temperature, + bias_tokens: self.token_bias.clone().unwrap_or_else(|| { + if self.ignore_eos { + TokenBias::new(vec![(EOT_TOKEN_ID, -1.0)]) + } else { + TokenBias::default() + } + }), + play_back_previous_tokens: session_loaded, + } + } +} +fn parse_bias(s: &str) -> Result { + s.parse() +} + +#[derive(Parser, Debug)] +pub struct ModelLoad { + /// Where to load the model path from + #[arg(long, short = 'm')] + pub model_path: String, + + /// Sets the size of the context (in tokens). Allows feeding longer prompts. + /// Note that this affects memory. + /// + /// LLaMA models are trained with a context size of 2048 tokens. If you + /// want to use a larger context size, you will need to retrain the model, + /// or use a model that was trained with a larger context size. + /// + /// Alternate methods to extend the context, including + /// [context clearing](https://github.com/rustformers/llama-rs/issues/77) are + /// being investigated, but are not yet implemented. Additionally, these + /// will likely not perform as well as a model with a larger context size. + #[arg(long, default_value_t = 2048)] + pub num_ctx_tokens: usize, +} +impl ModelLoad { + pub fn load(&self) -> bloom::Bloom { + let model = bloom::Bloom::load(&self.model_path, self.num_ctx_tokens, load_progress) + .expect("Could not load model"); + + log::info!("Model fully loaded!"); + + model + } +} + +#[derive(Parser, Debug)] +pub struct PromptFile { + /// A file to read the prompt from. + #[arg(long, short = 'f', default_value = None)] + pub prompt_file: Option, +} +impl PromptFile { + pub fn contents(&self) -> Option { + match &self.prompt_file { + Some(path) => { + match std::fs::read_to_string(path) { + Ok(mut prompt) => { + // Strip off the last character if it's exactly newline. Also strip off a single + // carriage return if it's there. Since String must be valid UTF-8 it should be + // guaranteed that looking at the string as bytes here is safe: UTF-8 non-ASCII + // bytes will always the high bit set. 
+ if matches!(prompt.as_bytes().last(), Some(b'\n')) { + prompt.pop(); + } + if matches!(prompt.as_bytes().last(), Some(b'\r')) { + prompt.pop(); + } + Some(prompt) + } + Err(err) => { + log::error!("Could not read prompt file at {path}. Error {err}"); + std::process::exit(1); + } + } + } + _ => None, + } + } +} + +#[derive(Parser, Debug)] +pub struct Convert { + /// Path to model directory + #[arg(long, short = 'd')] + pub directory: PathBuf, + + /// File type to convert to + #[arg(long, short = 't', value_enum, default_value_t = ElementType::Q4_0)] + pub element_type: ElementType, +} + +#[derive(Parser, Debug, ValueEnum, Clone, Copy)] +pub enum ElementType { + /// Quantized 4-bit (type 0). + Q4_0, + /// Quantized 4-bit (type 1); used by GPTQ. + Q4_1, + /// Float 16-bit. + F16, + /// Float 32-bit. + F32, +} +impl From for llama_rs::ElementType { + fn from(model_type: ElementType) -> Self { + match model_type { + ElementType::Q4_0 => llama_rs::ElementType::Q4_0, + ElementType::Q4_1 => llama_rs::ElementType::Q4_1, + ElementType::F16 => llama_rs::ElementType::F16, + ElementType::F32 => llama_rs::ElementType::F32, + } + } +} diff --git a/bloom-cli/src/main.rs b/bloom-cli/src/main.rs new file mode 100644 index 00000000..026984ff --- /dev/null +++ b/bloom-cli/src/main.rs @@ -0,0 +1,196 @@ +use std::{convert::Infallible, io::Write}; + +use clap::Parser; +use cli_args::Args; +use rustyline::error::ReadlineError; + +use llama_rs::{snapshot, InferenceError, Model}; + +mod cli_args; + +fn main() { + env_logger::builder() + .filter_level(log::LevelFilter::Info) + .parse_default_env() + .init(); + + let cli_args = Args::parse(); + match cli_args { + Args::Infer(args) => infer(&args), + Args::DumpTokens(args) => dump_tokens(&args), + Args::Repl(args) => interactive(&args, false), + Args::ChatExperimental(args) => interactive(&args, true), + } +} + +fn infer(args: &cli_args::Infer) { + let prompt = load_prompt_file_with_prompt(&args.prompt_file, args.prompt.as_deref()); + let inference_session_params = args.generate.inference_session_parameters(); + let model = args.model_load.load(); + let (mut session, session_loaded) = snapshot::read_or_create_session( + &model, + args.persist_session.as_deref(), + args.generate.load_session.as_deref(), + inference_session_params, + ); + let inference_params = args.generate.inference_parameters(session_loaded); + + let mut rng = args.generate.rng(); + let res = session.inference_with_prompt::( + &model, + &inference_params, + &prompt, + args.generate.num_predict, + &mut rng, + |t| { + print!("{t}"); + std::io::stdout().flush().unwrap(); + + Ok(()) + }, + ); + println!(); + + match res { + Ok(_) => (), + Err(llama_rs::InferenceError::ContextFull) => { + log::warn!("Context window full, stopping inference.") + } + Err(llama_rs::InferenceError::TokenizationFailed) => { + log::error!("Failed to tokenize initial prompt."); + } + Err(llama_rs::InferenceError::UserCallback(_)) + | Err(llama_rs::InferenceError::EndOfText) => unreachable!("cannot fail"), + } + + if let Some(session_path) = args.save_session.as_ref().or(args.persist_session.as_ref()) { + // Write the memory to the cache file + snapshot::write_session(session, session_path); + } +} + +fn dump_tokens(args: &cli_args::DumpTokens) { + let prompt = load_prompt_file_with_prompt(&args.prompt_file, args.prompt.as_deref()); + let model = args.model_load.load(); + let toks = match model.vocabulary().tokenize(&prompt, false) { + Ok(toks) => toks, + Err(e) => { + log::error!("Could not tokenize prompt: {e}"); + 
std::process::exit(1); + } + }; + log::info!("=== Dumping prompt tokens:"); + log::info!( + "{}", + toks.iter() + .map(|(_, tid)| tid.to_string()) + .collect::>() + .join(", ") + ); + log::info!( + "{}", + toks.iter() + .map(|(s, tid)| format!("{s:?}:{tid}")) + .collect::>() + .join(", ") + ); +} + +fn interactive( + args: &cli_args::Repl, + // If set to false, the session will be cloned after each inference + // to ensure that previous state is not carried over. + chat_mode: bool, +) { + let prompt_file = args.prompt_file.contents(); + let inference_session_params = args.generate.inference_session_parameters(); + let model = args.model_load.load(); + let (mut session, session_loaded) = snapshot::read_or_create_session( + &model, + None, + args.generate.load_session.as_deref(), + inference_session_params, + ); + let inference_params = args.generate.inference_parameters(session_loaded); + + let mut rng = args.generate.rng(); + let mut rl = rustyline::DefaultEditor::new().unwrap(); + loop { + let readline = rl.readline(">> "); + match readline { + Ok(line) => { + let session_backup = if chat_mode { + None + } else { + Some(session.clone()) + }; + + let prompt = prompt_file + .as_deref() + .map(|pf| process_prompt(pf, &line)) + .unwrap_or(line); + + let mut sp = spinners::Spinner::new(spinners::Spinners::Dots2, "".to_string()); + if let Err(InferenceError::ContextFull) = session.feed_prompt::( + &model, + &inference_params, + &prompt, + |_| Ok(()), + ) { + log::error!("Prompt exceeds context window length.") + }; + sp.stop(); + + let res = session.inference_with_prompt::( + &model, + &inference_params, + "", + args.generate.num_predict, + &mut rng, + |tk| { + print!("{tk}"); + std::io::stdout().flush().unwrap(); + Ok(()) + }, + ); + println!(); + + if let Err(InferenceError::ContextFull) = res { + log::error!("Reply exceeds context window length"); + } + + if let Some(session_backup) = session_backup { + session = session_backup; + } + } + Err(ReadlineError::Eof) | Err(ReadlineError::Interrupted) => { + break; + } + Err(err) => { + log::error!("{err}"); + } + } + } +} + +fn load_prompt_file_with_prompt( + prompt_file: &cli_args::PromptFile, + prompt: Option<&str>, +) -> String { + if let Some(prompt_file) = prompt_file.contents() { + if let Some(prompt) = prompt { + process_prompt(&prompt_file, prompt) + } else { + prompt_file + } + } else if let Some(prompt) = prompt { + prompt.to_owned() + } else { + log::error!("No prompt or prompt file was provided. 
See --help"); + std::process::exit(1); + } +} + +fn process_prompt(raw_prompt: &str, prompt: &str) -> String { + raw_prompt.replace("{{PROMPT}}", prompt) +} diff --git a/bloom/Cargo.toml b/bloom/Cargo.toml new file mode 100644 index 00000000..5f2a108e --- /dev/null +++ b/bloom/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "bloom" +version = { workspace = true } +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +ggml = { path = "../ggml" } +llama-rs = { path = "../llama-rs" } + +bytemuck = { workspace = true } diff --git a/bloom/src/ggml_loader.rs b/bloom/src/ggml_loader.rs new file mode 100644 index 00000000..1671db35 --- /dev/null +++ b/bloom/src/ggml_loader.rs @@ -0,0 +1,406 @@ +use std::{ + collections::HashMap, + io::{BufRead, Read, Seek, SeekFrom}, + path::Path, +}; + +use ggml::loader::{ + find_all_model_files, read_bytes_with_len, read_f32, read_i32, read_string, read_u32, + LoadError, LoadProgress, +}; +use llama_rs::{mulf, TokenId, Vocabulary}; + +use crate::{Bloom, Hyperparameters}; + +/// Load a model from disk +pub fn load( + path: impl AsRef, + n_ctx: usize, + mut load_progress_callback: impl FnMut(LoadProgress), +) -> Result { + use std::fs::File; + use std::io::BufReader; + + let main_path = path.as_ref(); + + let mut reader = + BufReader::new( + File::open(main_path).map_err(|e| LoadError::OpenFileFailed { + source: e, + path: main_path.to_owned(), + })?, + ); + + // Verify magic + let is_legacy_model: bool = match read_u32(&mut reader)? { + ggml::FILE_MAGIC => false, + ggml::FILE_MAGIC_UNVERSIONED => true, + _ => { + return Err(LoadError::InvalidMagic { + path: main_path.to_owned(), + }) + } + }; + + // Load format version + if !is_legacy_model { + #[allow(unused_variables)] + let version: u32 = match read_u32(&mut reader)? { + ggml::FORMAT_VERSION => ggml::FORMAT_VERSION, + version => return Err(LoadError::InvalidFormatVersion { value: version }), + }; + } + + // ================= + // Load hyper params + // ================= + + // NOTE: Field order matters! Data is laid out in the file exactly + // in this order. 
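Two things are worth noting before the field-by-field read just below: the hyperparameters are not self-describing, so they must be read in exactly the order they were written, and this loader, unlike the LLaMA one, reads no n_rot field, while n_ctx comes from the caller rather than from the file. The feed-forward width n_ff is then derived by rounding 4 * n_embd up to the next multiple of n_mult; a minimal sketch of that rounding:

// The rounding used for n_ff below: round 4 * n_embd up to a multiple of n_mult.
fn round_up_to_multiple(x: usize, m: usize) -> usize {
    ((x + m - 1) / m) * m
}

// For example, round_up_to_multiple(4 * 1000, 256) == 4096, while an exact
// multiple such as 4 * 1024 = 4096 is left unchanged.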
+ let hparams = Hyperparameters { + n_vocab: read_i32(&mut reader)?.try_into()?, + n_ctx, + n_embd: read_i32(&mut reader)?.try_into()?, + n_mult: read_i32(&mut reader)?.try_into()?, + n_head: read_i32(&mut reader)?.try_into()?, + n_layer: read_i32(&mut reader)?.try_into()?, + f16_: read_i32(&mut reader)?.try_into()?, + }; + + let n_ff = ((4 * hparams.n_embd + hparams.n_mult - 1) / hparams.n_mult) * hparams.n_mult; + + load_progress_callback(LoadProgress::HyperparametersLoaded(&hparams)); + + // =============== + // Load vocabulary + // =============== + let vocabulary = { + let mut id_to_token = vec![]; + let mut id_to_token_score = vec![]; + let mut token_to_id = HashMap::new(); + let mut max_token_length = 0; + + for i in 0..hparams.n_vocab { + let len = read_i32(&mut reader)?; + let token = read_bytes_with_len(&mut reader, len as usize)?; + max_token_length = max_token_length.max(token.len()); + id_to_token.push(token.clone()); + token_to_id.insert(token, TokenId::try_from(i)?); + + // Token score, currently unused + if !is_legacy_model { + if let Ok(score) = read_f32(&mut reader) { + id_to_token_score.push(score); + } + } else { + // Legacy model, set empty score + id_to_token_score.push(0.); + } + } + + Vocabulary { + id_to_token, + id_to_token_score, + token_to_id, + max_token_length, + } + }; + + // for the big tensors, we have the option to store the data in 16-bit + // floats or quantized in order to save memory and also to speed up the + // computation + let wtype = match hparams.f16_ { + 0 => ggml::Type::F32, + 1 => ggml::Type::F16, + 2 => ggml::Type::Q4_0, + 3 => ggml::Type::Q4_1, + invalid => return Err(LoadError::HyperparametersF16Invalid { ftype: invalid }), + }; + + let n_embd = hparams.n_embd; + let n_layer = hparams.n_layer; + let n_vocab = hparams.n_vocab; + + let ctx_size = { + let mut ctx_size: usize = 0; + + ctx_size += mulf!(n_embd, n_vocab, ggml::type_sizef(wtype)); // tok_embeddings + + ctx_size += mulf!(n_embd, ggml::type_sizef(ggml::Type::F32)); // norm + ctx_size += mulf!(n_embd, ggml::type_sizef(ggml::Type::F32)); // norm_b + + ctx_size += mulf!(n_embd, ggml::type_sizef(ggml::Type::F32)); // output_norm + ctx_size += mulf!(n_embd, ggml::type_sizef(ggml::Type::F32)); // output_norm_b + + ctx_size += mulf!(n_embd, n_vocab, ggml::type_sizef(wtype)); // output + + ctx_size += mulf!(n_layer, n_embd, ggml::type_sizef(ggml::Type::F32)); // attention_norm + ctx_size += mulf!(n_layer, n_embd, ggml::type_sizef(ggml::Type::F32)); // attention_norm_b + + ctx_size += mulf!(n_layer, 3, n_embd, n_embd, ggml::type_sizef(wtype)); //query_key_value + ctx_size += mulf!(n_layer, 3, n_embd, ggml::type_sizef(ggml::Type::F32)); //query_key_value_b + + ctx_size += mulf!(n_layer, n_embd, n_embd, ggml::type_sizef(wtype)); // wo + ctx_size += mulf!(n_layer, n_embd, ggml::type_sizef(ggml::Type::F32)); // wo_b + + ctx_size += mulf!(n_layer, n_embd, ggml::type_sizef(ggml::Type::F32)); // ffn_norm + ctx_size += mulf!(n_layer, n_embd, ggml::type_sizef(ggml::Type::F32)); // ffn_norm_b + + ctx_size += mulf!(n_layer, n_ff, n_embd, ggml::type_sizef(wtype)); // w1 + ctx_size += mulf!(n_layer, n_ff, ggml::type_sizef(ggml::Type::F32)); // w1_b + + ctx_size += mulf!(n_layer, n_ff, n_embd, ggml::type_sizef(wtype)); // w2 + ctx_size += mulf!(n_layer, n_ff, ggml::type_sizef(ggml::Type::F32)); // w2_b + + ctx_size += (5 + 10 * n_layer) * 256; // object overhead + + load_progress_callback(LoadProgress::ContextSize { bytes: ctx_size }); + + ctx_size + }; + + // Initialize the context + let context = 
ggml::Context::init(ctx_size as usize); + + let model = Bloom::new(context, hparams, vocabulary, n_ff, wtype); + + // Close the file, but keep its offset. That way we know how to skip the + // metadata when loading the parts. + let file_offset = reader.stream_position()?; + drop(reader); + + let paths = find_all_model_files(main_path)?; + + let n_parts = paths.len(); + + for (i, part_path) in paths.into_iter().enumerate() { + let part_id = i; + + load_progress_callback(LoadProgress::PartLoading { + file: &part_path, + current_part: i + 1, + total_parts: n_parts, + }); + + let mut part_reader = BufReader::new(File::open(&part_path)?); + + // Skip metadata + part_reader.seek(SeekFrom::Start(file_offset))?; + + let mut total_size = 0; + let mut n_tensors = 0; + + // Load weights + loop { + // NOTE: Implementation from #![feature(buf_read_has_data_left)] + let is_eof = part_reader.fill_buf().map(|b| b.is_empty())?; + + if is_eof { + break; + } + + let n_dims = usize::try_from(read_i32(&mut part_reader)?)?; + let length = read_i32(&mut part_reader)?; + let ftype = read_u32(&mut part_reader)?; + + let mut nelements: usize = 1; + let mut ne = [1i64, 1i64]; + + #[allow(clippy::needless_range_loop)] + for i in 0..n_dims { + ne[i] = read_i32(&mut part_reader)? as i64; + nelements *= usize::try_from(ne[i])?; + } + + let tensor_name = read_string(&mut part_reader, length as usize)?; + + let Some(tensor) = model.tensors.get(&tensor_name) + else { + return Err(LoadError::UnknownTensor { tensor_name, path: part_path.to_path_buf() }); + }; + + // split_type = 0: split by columns + // split_type = 1: split by rows + // + // split_type = 0: + // regex: + // - tok_embeddings.* + // - layers.*.attention.wo.weight + // - layers.*.feed_forward.w2.weight + + // split_type = 1: + // regex: + // - output.* + // - layers.*.attention.wq.weight + // - layers.*.attention.wk.weight + // - layers.*.attention.wv.weight + // - layers.*.feed_forward.w1.weight + // - layers.*.feed_forward.w3.weight + #[allow(clippy::if_same_then_else)] + let split_type = if tensor_name.contains("tok_embeddings") { + 0 + } else if tensor_name.contains("layers") { + if tensor_name.contains("attention.wo.weight") { + 0 + } else if tensor_name.contains("feed_forward.w2.weight") { + 0 + } else { + 1 + } + } else if tensor_name.contains("output") { + 1 + } else { + 0 + }; + + if n_dims == 1 { + if tensor.nelements() != nelements { + return Err(LoadError::TensorWrongSize { + tensor_name, + path: part_path.to_path_buf(), + }); + } + } else if tensor.nelements() / n_parts != nelements { + return Err(LoadError::TensorWrongSize { + tensor_name, + path: part_path.to_path_buf(), + }); + } + + if n_dims == 1 { + if tensor.get_ne()[0] != ne[0] || tensor.get_ne()[1] != ne[1] { + return Err(LoadError::TensorWrongSize { + tensor_name, + path: part_path.to_path_buf(), + }); + } + } else if split_type == 0 { + if tensor.get_ne()[0] / i64::try_from(n_parts)? != ne[0] + || tensor.get_ne()[1] != ne[1] + { + return Err(LoadError::TensorWrongSize { + tensor_name, + path: part_path.to_path_buf(), + }); + } + } else if tensor.get_ne()[0] != ne[0] + || tensor.get_ne()[1] / i64::try_from(n_parts)? 
!= ne[1] + { + return Err(LoadError::TensorWrongSize { + tensor_name, + path: part_path.to_path_buf(), + }); + } + + let bpe = match ftype { + 0 => ggml::type_size(ggml::Type::F32), + 1 => ggml::type_size(ggml::Type::F16), + 2 => { + assert_eq!(ne[0] % 64, 0); + ggml::type_size(ggml::Type::Q4_0) + } + 3 => { + assert_eq!(ne[0] % 64, 0); + ggml::type_size(ggml::Type::Q4_1) + } + _ => { + return Err(LoadError::InvalidFtype { + ftype, + path: part_path.to_path_buf(), + tensor_name, + }) + } + }; + + if n_dims == 1 || n_parts == 1 { + if (nelements as usize * bpe) / ggml::blck_size(tensor.get_type()) as usize + != tensor.nbytes() + { + return Err(LoadError::TensorWrongSize { + tensor_name, + path: part_path.to_path_buf(), + }); + } + + if part_id == 0 { + // SAFETY: yolo, same as original code + let slice = unsafe { + let data = tensor.data(); + std::slice::from_raw_parts_mut(data as *mut u8, tensor.nbytes()) + }; + part_reader.read_exact(slice)?; + } else { + part_reader.seek(SeekFrom::Current(tensor.nbytes() as i64))?; + } + + total_size += tensor.nbytes(); + } else { + if (nelements as usize * bpe) / ggml::blck_size(tensor.get_type()) as usize + != tensor.nbytes() / n_parts + { + return Err(LoadError::TensorWrongSize { + tensor_name, + path: part_path.to_path_buf(), + }); + } + + if split_type == 0 { + let np0 = ne[0]; + let row_size = (usize::try_from(tensor.get_ne()[0])? + / ggml::blck_size(tensor.get_type())) + * ggml::type_size(tensor.get_type()); + + assert_eq!(row_size, tensor.get_nb()[1]); + + for i1 in 0..ne[1] { + let offset_row = i1 as usize * row_size; + let offset = offset_row + + ((part_id * np0 as usize) + / ggml::blck_size(tensor.get_type()) as usize) + * ggml::type_size(tensor.get_type()); + // SAFETY: yolo, same as original code + unsafe { + let ptr = tensor.data().add(offset); + let slice = + std::slice::from_raw_parts_mut(ptr as *mut u8, row_size / n_parts); + part_reader.read_exact(slice)?; + } + } + } else { + let np1 = ne[1]; + + let row_size = (usize::try_from(tensor.get_ne()[0])? + / ggml::blck_size(tensor.get_type())) + * ggml::type_size(tensor.get_type()); + + for i1 in 0..ne[1] { + let offset_row = (i1 as usize + part_id * np1 as usize) * row_size; + // SAFETY: yolo, same as original code + unsafe { + let ptr = tensor.data().add(offset_row); + let slice = std::slice::from_raw_parts_mut(ptr as *mut u8, row_size); + part_reader.read_exact(slice)?; + } + } + } + + total_size += tensor.nbytes() / n_parts; + } + + n_tensors += 1; + load_progress_callback(LoadProgress::PartTensorLoaded { + file: &part_path, + current_tensor: n_tensors.try_into()?, + tensor_count: model.tensors.len(), + }); + } + + load_progress_callback(LoadProgress::PartLoaded { + file: &part_path, + byte_size: total_size, + tensor_count: n_tensors.try_into()?, + }); + } + + Ok(model) +} diff --git a/bloom/src/lib.rs b/bloom/src/lib.rs new file mode 100644 index 00000000..aa050fef --- /dev/null +++ b/bloom/src/lib.rs @@ -0,0 +1,517 @@ +use std::collections::HashMap; + +use ggml::loader::{LoadError, LoadProgress}; +use llama_rs::{InferenceSession, Model, Vocabulary}; + +mod ggml_loader; + +pub use ggml_loader::load; + +/// The weights for the BLOOM model. All the mutable state is split into a +/// separate struct `InferenceSession`. 
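Bloom below implements llama_rs::Model, the same trait the Llama type implements, so helpers such as snapshot::read_or_create_session can stay generic over the backend. A small sketch of code written purely against the trait surface shown in this patch (start_session, vocabulary, n_ctx):

use llama_rs::{InferenceSession, InferenceSessionParameters, Model};

// Accepts any backend (Bloom here, Llama in the sibling crate) behind `&impl Model`.
fn start_and_describe(
    model: &impl Model,
    params: InferenceSessionParameters,
) -> InferenceSession {
    log::info!(
        "context window: {} tokens, vocabulary: {} entries",
        model.n_ctx(),
        model.vocabulary().id_to_token.len()
    );
    model.start_session(params)
}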
+pub struct Bloom { + hparams: Hyperparameters, + vocabulary: Vocabulary, + tok_embeddings: ggml::Tensor, + norm: ggml::Tensor, + norm_b: ggml::Tensor, + output_norm: ggml::Tensor, + output_norm_b: ggml::Tensor, + output: ggml::Tensor, + layers: Vec, + tensors: HashMap, + // Must be kept alive for the model + _context: ggml::Context, +} + +impl Model for Bloom { + type Model = Bloom; + type Hyperparameters = Hyperparameters; + type Layer = Layer; + + fn start_session(&self, params: llama_rs::InferenceSessionParameters) -> InferenceSession { + InferenceSession::new( + params, + self.hparams.n_ctx, + self.hparams.n_layer, + self.hparams.n_embd, + self.hparams.n_vocab, + ) + } + + fn evaluate( + &self, + session: &mut llama_rs::InferenceSession, + params: &llama_rs::InferenceParameters, + input_tokens: &[llama_rs::TokenId], + output_request: &mut llama_rs::EvaluateOutputRequest, + ) { + let n = input_tokens.len(); + let n_past = session.n_past; + let n_threads = params.n_threads; + + let Hyperparameters { + n_vocab, + n_ctx, + n_embd, + n_mult: _, + n_head, + n_layer, + f16_: _, + } = self.hparams; + + // For the first run, we need to guess a maximum buffer size so we can measure + // the actual memory consumption of the temporary ggml context. + let mut buf_size = 1024 * 1024 * 1024; + if session.mem_per_token > 0 && session.mem_per_token * n > buf_size { + // add 10% to account for ggml object overhead + buf_size = (1.1f64 * session.mem_per_token as f64 * n as f64) as usize; + }; + let ctx0 = ggml::Context::init(buf_size); + + // TODO: REMAKE THIS AFTER CHECKING GGML GRAPH + let mut gf = ggml::ComputationGraph::new(n_threads); + + let embd = ctx0.new_tensor_1d(ggml::Type::I32, n); + unsafe { embd.write_data(bytemuck::cast_slice(input_tokens)) }; + + let mut input_layer = ctx0.op_get_rows(&self.tok_embeddings, &embd); + + //TODO: word embeddings norm, + { + input_layer = ctx0.op_norm(&input_layer); + input_layer = ctx0.op_mul(&ctx0.op_repeat(&self.norm, &input_layer), &input_layer); + input_layer = ctx0.op_add(&ctx0.op_repeat(&self.norm_b, &input_layer), &input_layer); + } + + for il in 0..n_layer as usize { + let input_self_attention = input_layer.share(); + let mut current: ggml::Tensor; + + // norm + { + current = ctx0.op_norm(&input_layer); + + // cur = attention_norm * cur + current = ctx0.op_mul( + &ctx0.op_repeat(&self.layers[il].attention_norm, ¤t), + ¤t, + ); + current = ctx0.op_add( + &ctx0.op_repeat(&self.layers[il].attention_norm_b, ¤t), + ¤t, + ); + } + + //attention + { + current = ctx0.op_mul_mat(&self.layers[il].query_key_value, ¤t); + current = ctx0.op_add( + &ctx0.op_repeat(&self.layers[il].query_key_value_b, ¤t), + ¤t, + ); + } + + // self-attention + { + let nb = current.get_nb()[1]; + let q_current = ctx0.op_view_2d( + ¤t, n_embd, n, nb, + //0 * std::mem::size_of::() * n_embd as usize, + 0, + ); + let k_current = + ctx0.op_view_2d(¤t, n_embd, n, nb, std::mem::size_of::() * n_embd); + let v_current = ctx0.op_view_2d( + ¤t, + n_embd, + n, + nb, + 2 * std::mem::size_of::() * n_embd, + ); + + // store key and value to memory + if n >= 1 { + let k = ctx0.op_view_1d( + &session.memory_k, + n * n_embd, + (session.memory_k.element_size() * n_embd as usize) + * (il * n_ctx as usize + n_past as usize), + ); + + let v = ctx0.op_view_1d( + &session.memory_v, + n * n_embd, + (session.memory_v.element_size() * n_embd as usize) + * (il * n_ctx as usize + n_past as usize), + ); + + gf.build_forward_expand(&ctx0.op_cpy(&k_current, &k)); + gf.build_forward_expand(&ctx0.op_cpy(&v_current, 
&v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + let q = ctx0.op_permute( + &ctx0.op_cpy( + &q_current, + &ctx0.new_tensor_3d(ggml::Type::F32, n_embd / n_head, n_head, n), + ), + 0, + 2, + 1, + 3, + ); + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + let k = ctx0.op_permute( + &ctx0.op_reshape_3d( + &ctx0.op_view_1d( + &session.memory_k, + (n_past + n) * n_embd, + il * n_ctx * session.memory_k.element_size() * n_embd, + ), + n_embd / n_head, + n_head, + n_past + n, + ), + 0, + 2, + 1, + 3, + ); + + // K * Q + let k_q = ctx0.op_mul_mat(&k, &q); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + let k_q_scaled = ctx0.op_scale( + &k_q, + &ctx0.new_f32(1.0 / f32::sqrt(n_embd as f32 / n_head as f32)), + ); + + //alibi + // KQ_scaled_alibi = KQ_scaled + alibi_bias + // TODO: op_alibi function + let k_q_scaled_alibi = ctx0.op_alibi(&k_q_scaled, n_past, n_head); + + // KQ_masked = mask_past(KQ_scaled) + let k_q_masked = ctx0.op_diag_mask_inf(&k_q_scaled_alibi, n_past); + + // KQ = soft_max(KQ_masked) + let k_q_soft_max = ctx0.op_soft_max(&k_q_masked); + + let memv_elsize = session.memory_v.element_size(); + + // split cached V into n_head heads + let v = ctx0.op_view_3d( + &session.memory_v, + n_past + n, + n_embd / n_head, + n_head, + n_ctx * memv_elsize, + n_ctx * memv_elsize * n_embd / n_head, + il * n_ctx * memv_elsize * n_embd, + ); + + // KQV = transpose(V) * KQ_soft_max + let k_q_v = ctx0.op_mul_mat(&v, &k_q_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + let k_q_v_merged = ctx0.op_permute(&k_q_v, 0, 2, 1, 3); + + // cur = KQV_merged.contiguous().view(n_embd, N) + current = ctx0.op_cpy( + &k_q_v_merged, + &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, n), + ); + + // projection + current = ctx0.op_mul_mat(&self.layers[il].wo, ¤t); + current = ctx0.op_add(&ctx0.op_repeat(&self.layers[il].wo_b, ¤t), ¤t); + } + + let input_feed_forward = ctx0.op_add(¤t, &input_self_attention); + + // feed-forward network + { + // norm + { + current = ctx0.op_norm(&input_feed_forward); + + // cur = ffn_norm*cur + ffn_norm_b + current = ctx0.op_mul( + &ctx0.op_repeat(&self.layers[il].ffn_norm, ¤t), + ¤t, + ); + + current = ctx0.op_add( + &ctx0.op_repeat(&self.layers[il].ffn_norm_b, ¤t), + ¤t, + ); + } + + current = ctx0.op_mul_mat(&self.layers[il].w1, ¤t); + + current = ctx0.op_add(&ctx0.op_repeat(&self.layers[il].w1_b, ¤t), ¤t); + + // SILU activation + + current = ctx0.op_gelu(¤t); + + current = ctx0.op_mul_mat(&self.layers[il].w2, ¤t); + + current = ctx0.op_add(&ctx0.op_repeat(&self.layers[il].w2_b, ¤t), ¤t); + } + + current = ctx0.op_add(¤t, &input_feed_forward); + + // input for next layer + input_layer = current; + } + + // Used at the end to optionally extract the embeddings. 
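A note on the cache addressing used in the attention block above: the op_view_1d offsets treat memory_k and memory_v as flat [n_layer x n_ctx x n_embd] buffers, so the n freshly computed keys and values for layer il land at positions n_past..n_past + n of that layer's slice. (The `SILU activation` comment in the feed-forward block above is a leftover from the LLaMA code path; the call is ggml's GELU, which is what BLOOM actually uses.) A scalar sketch of the element offset:

// Element offset of position `pos` in layer `il` within the flat K/V cache,
// matching the byte offsets passed to op_view_1d above,
// element_size * n_embd * (il * n_ctx + pos).
fn kv_offset_elems(il: usize, pos: usize, n_ctx: usize, n_embd: usize) -> usize {
    (il * n_ctx + pos) * n_embd
}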
+ let embeddings_tensor; + + // norm + { + input_layer = ctx0.op_norm(&input_layer); + + // inpL = norm*inpL + input_layer = ctx0.op_mul( + &ctx0.op_repeat(&self.output_norm, &input_layer), + &input_layer, + ); + + input_layer = ctx0.op_add( + &ctx0.op_repeat(&self.output_norm_b, &input_layer), + &input_layer, + ); + + embeddings_tensor = input_layer.share(); //TODO: CHECK if this is still necessary, (not in BLOOM C implementation) + } + + // lm_head + { + input_layer = ctx0.op_mul_mat(&self.output, &input_layer); + } + + // logits -> probs + // inpL = ctx0.op_soft_max(&inpL); + + // run the computation + gf.build_forward_expand(&input_layer); + ctx0.graph_compute(&mut gf); + + // return result for just the last token + // SAFETY: yolo + assert_eq!(session.last_logits.len(), n_vocab as usize); + unsafe { + input_layer.read_data( + n_vocab as usize * (n - 1) * std::mem::size_of::(), + bytemuck::cast_slice_mut(&mut session.last_logits), + ) + }; + + // Extract logits + if let Some(all_logits) = &mut output_request.all_logits { + all_logits.resize(n_vocab as usize * n, 0.0); + // SAFETY: Tensor data can be read (properly aligned, initialized, + // data will not be mutated or otherwise aliased during the copy), + // and we're not reading past the end of the tensor data. + assert_eq!(input_layer.nelements(), n_vocab * n); + unsafe { + input_layer.read_data(0, bytemuck::cast_slice_mut(all_logits)); + } + } + + // Extract embeddings + if let Some(embeddings) = &mut output_request.embeddings { + embeddings.resize(n_embd as usize * n, 0.0); + // SAFETY: Same rationale as for the "Extract logits" section applies. + assert_eq!(embeddings_tensor.nelements(), n_embd * n); + unsafe { + embeddings_tensor.read_data(0, bytemuck::cast_slice_mut(embeddings)); + } + } + + // Adjust the required memory per token if we didn't know that already + if session.mem_per_token == 0 { + session.mem_per_token = ctx0.used_mem() / n; + } + + // Adjust n_past to new length. + session.n_past += input_tokens.len(); + } + + /// Returns the vocabulary used by this model. + fn vocabulary(&self) -> &Vocabulary { + &self.vocabulary + } + + fn n_ctx(&self) -> usize { + self.hparams.n_ctx + } +} + +impl Bloom { + /// Load the model from `path` with `n_context_tokens` context tokens. + /// + /// The status of the loading process will be reported through `load_progress_callback`. 
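The all_logits and embeddings hooks serviced at the end of evaluate() above are not exercised by either CLI in this patch; a sketch of how a caller could use them to pull per-token embeddings out of the model. Field names follow the code above, while the assumption that EvaluateOutputRequest implements Default is this sketch's, not the patch's.

use llama_rs::{
    EvaluateOutputRequest, InferenceParameters, InferenceSessionParameters, Model, TokenId,
};

fn embed(
    model: &impl Model,
    session_params: InferenceSessionParameters,
    params: &InferenceParameters,
    text: &str,
) -> Option<Vec<f32>> {
    let mut session = model.start_session(session_params);
    let tokens: Vec<TokenId> = model
        .vocabulary()
        .tokenize(text, false)
        .ok()?
        .into_iter()
        .map(|(_, id)| id)
        .collect();

    let mut request = EvaluateOutputRequest::default(); // assuming a Default impl
    request.embeddings = Some(Vec::new());

    // Fills request.embeddings with n_embd floats per input token.
    model.evaluate(&mut session, params, &tokens, &mut request);
    request.embeddings
}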
+ pub fn load( + path: impl AsRef, + n_context_tokens: usize, + load_progress_callback: impl FnMut(LoadProgress), + ) -> Result { + load(path, n_context_tokens, load_progress_callback) + } + + pub(crate) fn new( + context: ggml::Context, + hparams: Hyperparameters, + vocabulary: Vocabulary, + n_ff: usize, + wtype: ggml::Type, + ) -> Bloom { + let n_embd = hparams.n_embd; + let n_layer = hparams.n_layer; + let n_vocab = hparams.n_vocab; + + let mut tensors = HashMap::new(); + + let tok_embeddings = context.new_tensor_2d(wtype, n_embd, n_vocab); + + let norm = context.new_tensor_1d(ggml::Type::F32, n_embd); + let norm_b = context.new_tensor_1d(ggml::Type::F32, n_embd); + + let output_norm = context.new_tensor_1d(ggml::Type::F32, n_embd); + let output_norm_b = context.new_tensor_1d(ggml::Type::F32, n_embd); + + let output = context.new_tensor_2d(wtype, n_embd, n_vocab); + + tensors.insert("tok_embeddings.weight".to_owned(), tok_embeddings.share()); + + tensors.insert("norm.weight".to_owned(), norm.share()); + tensors.insert("norm.bias".to_owned(), norm_b.share()); + + tensors.insert("output_norm.weight".to_owned(), output_norm.share()); + tensors.insert("output_norm.bias".to_owned(), output_norm_b.share()); + + tensors.insert("output.weight".to_owned(), output.share()); + + let mut layers = Vec::new(); + for i in 0..n_layer { + let layer = Layer { + attention_norm: context.new_tensor_1d(ggml::Type::F32, n_embd), + attention_norm_b: context.new_tensor_1d(ggml::Type::F32, n_embd), + + query_key_value: context.new_tensor_2d(wtype, n_embd, 3 * n_embd), + query_key_value_b: context.new_tensor_1d(ggml::Type::F32, 3 * n_embd), + + wo: context.new_tensor_2d(wtype, n_embd, n_embd), + wo_b: context.new_tensor_1d(ggml::Type::F32, n_embd), + + ffn_norm: context.new_tensor_1d(ggml::Type::F32, n_embd), + ffn_norm_b: context.new_tensor_1d(ggml::Type::F32, n_embd), + + w1: context.new_tensor_2d(wtype, n_embd, n_ff), + w1_b: context.new_tensor_1d(ggml::Type::F32, n_ff), + w2: context.new_tensor_2d(wtype, n_ff, n_embd), + w2_b: context.new_tensor_1d(ggml::Type::F32, n_embd), + }; + + tensors.insert( + format!("layers.{i}.attention_norm.weight"), + layer.attention_norm.share(), + ); + + tensors.insert( + format!("layers.{i}.attention_norm.bias"), + layer.attention_norm_b.share(), + ); + + tensors.insert( + format!("layers.{i}.attention.query_key_value.weight"), + layer.query_key_value.share(), + ); + tensors.insert( + format!("layers.{i}.attention.query_key_value.bias"), + layer.query_key_value_b.share(), + ); + + tensors.insert(format!("layers.{i}.attention.wo.weight"), layer.wo.share()); + tensors.insert(format!("layers.{i}.attention.wo.bias"), layer.wo_b.share()); + + tensors.insert( + format!("layers.{i}.ffn_norm.weight"), + layer.ffn_norm.share(), + ); + tensors.insert( + format!("layers.{i}.ffn_norm.bias"), + layer.ffn_norm_b.share(), + ); + + tensors.insert( + format!("layers.{i}.feed_forward.w1.weight"), + layer.w1.share(), + ); + tensors.insert( + format!("layers.{i}.feed_forward.w1.bias"), + layer.w1_b.share(), + ); + tensors.insert( + format!("layers.{i}.feed_forward.w2.weight"), + layer.w2.share(), + ); + tensors.insert( + format!("layers.{i}.feed_forward.w2.bias"), + layer.w2_b.share(), + ); + + layers.push(layer); + } + + Bloom { + hparams, + vocabulary, + tok_embeddings, + norm, + norm_b, + output_norm, + output_norm_b, + output, + layers, + tensors, + _context: context, + } + } +} + +// NOTE: Field order matters! Data is laid out in the file exactly +// in this order. 
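For reference, a sketch that enumerates the tensor names registered by Bloom::new above, i.e. the names the ggml_loader accepts before raising LoadError::UnknownTensor; it can serve as a checklist when converting a BLOOM checkpoint to this format.

fn expected_tensor_names(n_layer: usize) -> Vec<String> {
    // Global tensors, named exactly as inserted by the constructor above.
    let mut names: Vec<String> = [
        "tok_embeddings.weight",
        "norm.weight",
        "norm.bias",
        "output_norm.weight",
        "output_norm.bias",
        "output.weight",
    ]
    .iter()
    .map(|s| s.to_string())
    .collect();
    // Per-layer tensors.
    for i in 0..n_layer {
        for suffix in [
            "attention_norm.weight",
            "attention_norm.bias",
            "attention.query_key_value.weight",
            "attention.query_key_value.bias",
            "attention.wo.weight",
            "attention.wo.bias",
            "ffn_norm.weight",
            "ffn_norm.bias",
            "feed_forward.w1.weight",
            "feed_forward.w1.bias",
            "feed_forward.w2.weight",
            "feed_forward.w2.bias",
        ] {
            names.push(format!("layers.{i}.{suffix}"));
        }
    }
    names
}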
+#[derive(Debug, Default, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] +pub struct Hyperparameters { + pub n_vocab: usize, + pub n_ctx: usize, + pub n_embd: usize, + pub n_mult: usize, + pub n_head: usize, + pub n_layer: usize, + pub f16_: u32, +} + +pub struct Layer { + pub attention_norm: ggml::Tensor, + pub attention_norm_b: ggml::Tensor, + pub wo: ggml::Tensor, + pub wo_b: ggml::Tensor, + pub query_key_value: ggml::Tensor, + pub query_key_value_b: ggml::Tensor, + // normalization + pub ffn_norm: ggml::Tensor, + pub ffn_norm_b: ggml::Tensor, + // ff + pub w1: ggml::Tensor, + pub w1_b: ggml::Tensor, + pub w2: ggml::Tensor, + pub w2_b: ggml::Tensor, +} diff --git a/ggml-sys/ggml/ggml.c b/ggml-sys/ggml/ggml.c index 69974989..67057f01 100644 --- a/ggml-sys/ggml/ggml.c +++ b/ggml-sys/ggml/ggml.c @@ -3098,7 +3098,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "MAP_BINARY", }; -static_assert(GGML_OP_COUNT == 38, "GGML_OP_COUNT != 38"); +static_assert(GGML_OP_COUNT == 39, "GGML_OP_COUNT != 39"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -3146,7 +3146,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "f(x,y)", }; -static_assert(GGML_OP_COUNT == 38, "GGML_OP_COUNT != 38"); +static_assert(GGML_OP_COUNT == 39, "GGML_OP_COUNT != 39"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); @@ -5133,6 +5133,36 @@ struct ggml_tensor * ggml_rope( return result; } +// ggml_alibi +struct ggml_tensor * ggml_alibi( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_head) { + GGML_ASSERT(n_past >= 0); + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + // TODO: when implement backward, fix this: + //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = ggml_view_tensor(ctx, a); + + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3); + ((int32_t *) b->data)[0] = n_past; + ((int32_t *) b->data)[1] = n_head; + + result->op = GGML_OP_ALIBI; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + // ggml_conv_1d_1s struct ggml_tensor * ggml_conv_1d_1s( @@ -7757,6 +7787,163 @@ static void ggml_compute_forward_soft_max( } } +// ggml_compute_forward_alibi + +static void ggml_compute_forward_alibi_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(src1->type == GGML_TYPE_I32); + assert(ggml_nelements(src1) == 3); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n_past = ((int32_t *) src1->data)[0]; + const int n_head = ((int32_t *) src1->data)[1]; + const int mode = ((int32_t *) src1->data)[2]; + + const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 + const int ne1 = src0->ne[1]; // seq_len_without_past + const int ne2 = src0->ne[2]; // n_head -> this is k + const int ne3 = src0->ne[3]; // 1 -> bsz + + const int n = ggml_nrows(src0); + const int ne2_ne3 = n/ne1; // ne2*ne3 + + const int nb0 = src0->nb[0]; + const int nb1 = src0->nb[1]; + const int nb2 = src0->nb[2]; + const int nb3 = src0->nb[3]; + + + // printf("\nne0: %d, ne1: %d, ne2: %d, ne3: %d", ne0, ne1, ne2, ne3); + // printf("\nn_past = %d, ne2 = %d", n_past, ne2); + + assert(nb0 == sizeof(float)); + assert(ne1+n_past == ne0); + + // add alibi to src0 (KQ_scaled) + const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); + const float m0 = pow(2.0, -8.0 / n_heads_log2_floor); + const float m1 = pow(2.0, -4.0 / n_heads_log2_floor); + + for (int i = 0; i < ne0; i++) { + for (int j = 0; j < ne1; j++) { + for (int k = 0; k < ne2_ne3; k++) { + float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2); + float * dst_data = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2); + + // TODO: k*nb2 or k*nb3 + + float m_k; + if (k < n_heads_log2_floor) { + m_k = pow(m0, k + 1); + } else { + m_k = pow(m1, 2 * (k - n_heads_log2_floor) + 1); + } + //TODO: optimize + dst_data[0] = (j+1) * m_k + src[0]; + } + } + } + +} + + +static void ggml_compute_forward_alibi_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + assert(params->ith == 0); + assert(src1->type == GGML_TYPE_I32); + assert(ggml_nelements(src1) == 3); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int n_past = ((int32_t *) src1->data)[0]; + const int n_head = ((int32_t *) src1->data)[1]; + const int mode = ((int32_t *) src1->data)[2]; + + const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 + const int ne1 = src0->ne[1]; // seq_len_without_past + const int ne2 = src0->ne[2]; // n_head -> this is k + const int ne3 = src0->ne[3]; // 1 -> bsz + + const int n = ggml_nrows(src0); + const int ne2_ne3 = n/ne1; // ne2*ne3 + + const int nb0 = src0->nb[0]; + const int nb1 = src0->nb[1]; + const int nb2 = src0->nb[2]; + const int nb3 = src0->nb[3]; + + + // printf("\nne0: %d, ne1: %d, ne2: %d, ne3: %d", ne0, ne1, ne2, ne3); + // printf("\nn_past = %d, ne2 = %d", n_past, ne2); + + assert(nb0 == sizeof(float)); + assert(ne1+n_past == ne0); + + // add alibi to src0 (KQ_scaled) + const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); + const ggml_fp16_t m0 = pow(2.0, -8.0 / n_heads_log2_floor); + const ggml_fp16_t m1 = pow(2.0, -4.0 / n_heads_log2_floor); + + for (int i = 0; i < ne0; i++) { + for (int j = 
0; j < ne1; j++) { + for (int k = 0; k < ne2_ne3; k++) { + ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2); + ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2); + + // TODO: k*nb2 or k*nb3 + + ggml_fp16_t m_k; + if (k < n_heads_log2_floor) { + m_k = pow(m0, k + 1); + } else { + m_k = pow(m1, 2 * (k - n_heads_log2_floor) + 1); + } + //TODO: optimize + dst_data[0] = (j+1) * m_k + src[0]; + } + } + } + +} + +static void ggml_compute_forward_alibi( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_alibi_f16(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + ggml_compute_forward_alibi_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_COUNT: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_rope static void ggml_compute_forward_rope_f32( @@ -9369,6 +9556,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_rope(params, tensor->src0, tensor->src1, tensor); } break; + case GGML_OP_ALIBI: + { + ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor); + } break; case GGML_OP_CONV_1D_1S: { ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor); @@ -9571,6 +9762,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; + case GGML_OP_ALIBI: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_SILU: { GGML_ASSERT(false); // TODO: not implemented @@ -10088,6 +10283,10 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { node->n_tasks = n_threads; } break; + case GGML_OP_ALIBI: + { + node->n_tasks = 1; //TODO + } break; case GGML_OP_CONV_1D_1S: case GGML_OP_CONV_1D_2S: { diff --git a/ggml-sys/ggml/ggml.h b/ggml-sys/ggml/ggml.h index 241e96a1..a74464c5 100644 --- a/ggml-sys/ggml/ggml.h +++ b/ggml-sys/ggml/ggml.h @@ -248,6 +248,7 @@ enum ggml_op { GGML_OP_DIAG_MASK_INF, GGML_OP_SOFT_MAX, GGML_OP_ROPE, + GGML_OP_ALIBI, GGML_OP_CONV_1D_1S, GGML_OP_CONV_1D_2S, @@ -629,6 +630,14 @@ struct ggml_tensor * ggml_rope( int n_dims, int mode); +// alibi position embedding +// in-place, returns view(a) +struct ggml_tensor * ggml_alibi( + struct ggml_context * ctx, + struct ggml_tensor * a, + int n_past, + int n_head); + // padding = 1 // TODO: we don't support extra parameters for now // that's why we are hard-coding the stride, padding, and dilation diff --git a/ggml-sys/src/lib.rs b/ggml-sys/src/lib.rs index 00b9210e..ef469688 100644 --- a/ggml-sys/src/lib.rs +++ b/ggml-sys/src/lib.rs @@ -996,6 +996,14 @@ extern "C" { filename: *const ::std::os::raw::c_char, ); } +extern "C" { + pub fn ggml_alibi( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + n_past: ::std::os::raw::c_int, + n_head: ::std::os::raw::c_int, + ) -> *mut ggml_tensor; +} pub const ggml_opt_type_GGML_OPT_ADAM: ggml_opt_type = 0; pub const ggml_opt_type_GGML_OPT_LBFGS: ggml_opt_type = 1; pub type ggml_opt_type = ::std::os::raw::c_uint; diff --git a/ggml/Cargo.toml b/ggml/Cargo.toml index 3f1e43a8..18b508ec 100644 --- a/ggml/Cargo.toml +++ b/ggml/Cargo.toml @@ -4,4 +4,8 @@ version = { workspace = true } edition = "2021" [dependencies] 
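The new op implements Attention with Linear Biases (ALiBi), which BLOOM uses in place of rotary position embeddings: each attention head adds a bias that grows linearly with position, scaled by a per-head slope taken from a geometric sequence. A Rust transcription of the slope and bias rule from the C kernels above, for reference only; the real kernel operates on the KQ_scaled tensor in place.

// Per-head ALiBi slope, mirroring the C kernels above: with
// p = 2^floor(log2(n_head)), the first p heads use m0 = 2^(-8 / p) and the
// remaining heads interleave using m1 = 2^(-4 / p).
fn alibi_slope(head: usize, n_head: usize) -> f32 {
    let p = 1usize << (n_head as f32).log2().floor() as usize;
    let m0 = 2.0f32.powf(-8.0 / p as f32);
    let m1 = 2.0f32.powf(-4.0 / p as f32);
    if head < p {
        m0.powi(head as i32 + 1)
    } else {
        m1.powi(2 * (head - p) as i32 + 1)
    }
}

// Bias added to one already-scaled attention score; `pos` mirrors the j index
// in the C loops, which add (j + 1) * m_k to each element.
fn alibi_bias(score: f32, pos: usize, head: usize, n_head: usize) -> f32 {
    score + (pos as f32 + 1.0) * alibi_slope(head, n_head)
}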
-ggml-sys = { path = "../ggml-sys" } \ No newline at end of file +ggml-sys = { path = "../ggml-sys" } + +log = { workspace = true } + +thiserror = "1.0" diff --git a/ggml/src/lib.rs b/ggml/src/lib.rs index 7625e5b1..8d44367d 100644 --- a/ggml/src/lib.rs +++ b/ggml/src/lib.rs @@ -14,6 +14,9 @@ use std::{ sync::{Arc, Weak}, }; +/// GGML loading utilities +pub mod loader; + /// Magic constant for `ggml` files (versioned). pub const FILE_MAGIC: u32 = 0x67676d66; /// Magic constant for `ggml` files (unversioned). @@ -462,6 +465,26 @@ impl Context { ); } } + + /// TODO: something something + pub fn op_alibi(&self, a: &Tensor, n_past: usize, n_head: usize) -> Tensor { + let tensor = unsafe { + ggml_sys::ggml_alibi( + self.ptr.as_ptr(), + a.ptr.as_ptr(), + usize_to_i32(n_past), + usize_to_i32(n_head), + ) + }; + + self.new_tensor_raw(tensor) + } + + /// Gaussian Error Linear Units + pub fn op_gelu(&self, a: &Tensor) -> Tensor { + let tensor = unsafe { ggml_sys::ggml_gelu(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + self.new_tensor_raw(tensor) + } } impl Drop for Context { diff --git a/ggml/src/loader.rs b/ggml/src/loader.rs new file mode 100644 index 00000000..d371ce5f --- /dev/null +++ b/ggml/src/loader.rs @@ -0,0 +1,261 @@ +use std::{ + io::BufRead, + path::{Path, PathBuf}, fmt::Debug, +}; + +use thiserror::Error; + +/// Each variant represents a step within the process of loading the model. +/// These can be used to report progress to the user. +#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug)] +pub enum LoadProgress<'a, H> { + /// The hyperparameters have been loaded from the model. + HyperparametersLoaded(&'a H), + /// The context has been created. + ContextSize { + /// The size of the context. + bytes: usize, + }, + /// A part of the model is being loaded. + PartLoading { + /// The path to the model part. + file: &'a Path, + /// The current part (0-indexed). + current_part: usize, + /// The number of total parts. + total_parts: usize, + }, + /// A tensor from the current part has been loaded. + PartTensorLoaded { + /// The path to the model part. + file: &'a Path, + /// The current tensor (0-indexed). + current_tensor: usize, + /// The number of total tensors. + tensor_count: usize, + }, + /// A model part has finished fully loading. + PartLoaded { + /// The path to the model part. + file: &'a Path, + /// The number of bytes in the part. + byte_size: usize, + /// The number of tensors in the part. + tensor_count: usize, + }, +} + +#[derive(Error, Debug)] +/// Errors encountered during the loading process. +pub enum LoadError { + #[error("could not open file {path:?}")] + /// A file failed to open. + OpenFileFailed { + /// The original error. + source: std::io::Error, + /// The path that failed. + path: PathBuf, + }, + #[error("no parent path for {path:?}")] + /// There is no parent path for a given path. + NoParentPath { + /// The path without a parent. + path: PathBuf, + }, + #[error("unable to read exactly {bytes} bytes")] + /// Reading exactly `bytes` from a file failed. + ReadExactFailed { + /// The original error. + source: std::io::Error, + /// The number of bytes that were attempted to be read. + bytes: usize, + }, + #[error("non-specific I/O error")] + /// A non-specific IO error. + IO(#[from] std::io::Error), + #[error("could not convert bytes to a UTF-8 string")] + /// One of the strings encountered was not valid UTF-8. 
+ InvalidUtf8(#[from] std::string::FromUtf8Error), + #[error("invalid integer conversion")] + /// One of the integers encountered could not be converted to a more appropriate type. + InvalidIntegerConversion(#[from] std::num::TryFromIntError), + #[error("invalid magic number for {path:?}")] + /// An invalid magic number was encountered during the loading process. + InvalidMagic { + /// The path that failed. + path: PathBuf, + }, + #[error("invalid file format version {value}")] + /// The version of the format is not supported by this version of `llama-rs`. + InvalidFormatVersion { + /// The version that was encountered. + value: u32, + }, + #[error("invalid value {ftype} for `f16` in hyperparameters")] + /// The `f16` hyperparameter had an invalid value. + HyperparametersF16Invalid { + /// The format type that was encountered. + ftype: u32, + }, + #[error("unknown tensor `{tensor_name}` in {path:?}")] + /// The tensor `tensor_name` was encountered during the loading of `path`, but was not seen during + /// the model prelude. + UnknownTensor { + /// The name of the tensor. + tensor_name: String, + /// The path that failed. + path: PathBuf, + }, + #[error("the tensor `{tensor_name}` has the wrong size in {path:?}")] + /// The tensor `tensor_name` did not match its expected size. + TensorWrongSize { + /// The name of the tensor. + tensor_name: String, + /// The path that failed. + path: PathBuf, + }, + /// The tensor `tensor_name` did not have the expected format type. + #[error("invalid ftype {ftype} for tensor `{tensor_name}` in {path:?}")] + InvalidFtype { + /// The name of the tensor. + tensor_name: String, + /// The format type that was encountered. + ftype: u32, + /// The path that failed. + path: PathBuf, + }, +} + +/// Default load progress callback function +pub fn load_progress(progress: LoadProgress) { + match progress { + LoadProgress::HyperparametersLoaded(hparams) => { + log::debug!("Loaded hyperparameters {hparams:#?}") + } + LoadProgress::ContextSize { bytes } => log::info!( + "ggml ctx size = {:.2} MB\n", + bytes as f64 / (1024.0 * 1024.0) + ), + LoadProgress::PartLoading { + file, + current_part, + total_parts, + } => { + let current_part = current_part + 1; + log::info!( + "Loading model part {}/{} from '{}'\n", + current_part, + total_parts, + file.to_string_lossy(), + ) + } + LoadProgress::PartTensorLoaded { + current_tensor, + tensor_count, + .. 
+ } => { + let current_tensor = current_tensor + 1; + if current_tensor % 8 == 0 { + log::info!("Loaded tensor {current_tensor}/{tensor_count}"); + } + } + LoadProgress::PartLoaded { + file, + byte_size, + tensor_count, + } => { + log::info!("Loading of '{}' complete", file.to_string_lossy()); + log::info!( + "Model size = {:.2} MB / num tensors = {}", + byte_size as f64 / 1024.0 / 1024.0, + tensor_count + ); + } + } +} + +/// Read bytes +pub fn read_bytes(reader: &mut impl BufRead) -> Result<[u8; N], LoadError> { + let mut bytes = [0u8; N]; + reader + .read_exact(&mut bytes) + .map_err(|e| LoadError::ReadExactFailed { + source: e, + bytes: N, + })?; + Ok(bytes) +} + +/// Ready bytes with length +pub fn read_bytes_with_len(reader: &mut impl BufRead, len: usize) -> Result, LoadError> { + let mut bytes = vec![0u8; len]; + reader + .read_exact(&mut bytes) + .map_err(|e| LoadError::ReadExactFailed { + source: e, + bytes: len, + })?; + Ok(bytes) +} + +/// Read an i32 +pub fn read_i32(reader: &mut impl BufRead) -> Result { + Ok(i32::from_le_bytes(read_bytes::<4>(reader)?)) +} + +/// Read a u32 +pub fn read_u32(reader: &mut impl BufRead) -> Result { + Ok(u32::from_le_bytes(read_bytes::<4>(reader)?)) +} + +/// Read an f32 +pub fn read_f32(reader: &mut impl BufRead) -> Result { + Ok(f32::from_le_bytes(read_bytes::<4>(reader)?)) +} + +/// Helper function. Reads a string from the buffer and returns it. +pub fn read_string(reader: &mut impl BufRead, len: usize) -> Result { + Ok(String::from_utf8(read_bytes_with_len(reader, len)?)?) +} + +/// Find all model files +pub fn find_all_model_files(main_path: &Path) -> Result, LoadError> { + Ok(collect_related_paths( + main_path, + std::fs::read_dir(main_path.parent().ok_or_else(|| LoadError::NoParentPath { + path: main_path.to_owned(), + })?)? 
+ .filter_map(Result::ok) + .map(|de| de.path()), + )) +} + +fn collect_related_paths( + main_path: &Path, + directory_paths: impl Iterator, +) -> Vec { + let main_filename = main_path.file_name().and_then(|p| p.to_str()); + + let mut paths: Vec = directory_paths + .filter(|p| { + p.file_name() + .and_then(|p| p.to_str()) + .zip(main_filename) + .map(|(part_filename, main_filename)| { + match part_filename.strip_prefix(main_filename) { + Some(suffix) => { + suffix.is_empty() + || (suffix + .strip_prefix('.') + .map(|s| s.parse::().is_ok()) + .unwrap_or(false)) + } + None => false, + } + }) + .unwrap_or(false) + }) + .collect(); + paths.sort(); + paths +} diff --git a/llama-cli/Cargo.toml b/llama-cli/Cargo.toml index c5d93022..28bfd223 100644 --- a/llama-cli/Cargo.toml +++ b/llama-cli/Cargo.toml @@ -6,17 +6,16 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +ggml = { path = "../ggml" } llama = { path = "../llama", features = ["convert"] } llama-rs = { path = "../llama-rs" } +log = { workspace = true } rand = { workspace = true } -bincode = "1.3.3" clap = { version = "4.1.8", features = ["derive"] } env_logger = "0.10.0" -log = "0.4" num_cpus = "1.15.0" once_cell = "1.17.1" rustyline = "11.0.0" spinners = "4.1.0" -zstd = { version = "0.12", default-features = false } \ No newline at end of file diff --git a/llama-cli/src/cli_args.rs b/llama-cli/src/cli_args.rs index 9c21c07e..bc2dcb6d 100644 --- a/llama-cli/src/cli_args.rs +++ b/llama-cli/src/cli_args.rs @@ -1,10 +1,13 @@ use std::path::PathBuf; use clap::{Parser, ValueEnum}; +use rand::SeedableRng; + +use ggml::loader::load_progress; + use llama_rs::{ InferenceParameters, InferenceSessionParameters, ModelKVMemoryType, TokenBias, EOT_TOKEN_ID, }; -use rand::SeedableRng; #[derive(Parser, Debug)] #[command(author, version, about, long_about = None)] @@ -261,54 +264,8 @@ pub struct ModelLoad { } impl ModelLoad { pub fn load(&self) -> llama::Llama { - let model = llama::Llama::load(&self.model_path, self.num_ctx_tokens, |progress| { - use llama::LoadProgress; - match progress { - LoadProgress::HyperparametersLoaded(hparams) => { - log::debug!("Loaded hyperparameters {hparams:#?}") - } - LoadProgress::ContextSize { bytes } => log::info!( - "ggml ctx size = {:.2} MB\n", - bytes as f64 / (1024.0 * 1024.0) - ), - LoadProgress::PartLoading { - file, - current_part, - total_parts, - } => { - let current_part = current_part + 1; - log::info!( - "Loading model part {}/{} from '{}'\n", - current_part, - total_parts, - file.to_string_lossy(), - ) - } - LoadProgress::PartTensorLoaded { - current_tensor, - tensor_count, - .. 
- } => { - let current_tensor = current_tensor + 1; - if current_tensor % 8 == 0 { - log::info!("Loaded tensor {current_tensor}/{tensor_count}"); - } - } - LoadProgress::PartLoaded { - file, - byte_size, - tensor_count, - } => { - log::info!("Loading of '{}' complete", file.to_string_lossy()); - log::info!( - "Model size = {:.2} MB / num tensors = {}", - byte_size as f64 / 1024.0 / 1024.0, - tensor_count - ); - } - } - }) - .expect("Could not load model"); + let model = llama::Llama::load(&self.model_path, self.num_ctx_tokens, load_progress) + .expect("Could not load model"); log::info!("Model fully loaded!"); diff --git a/llama-cli/src/main.rs b/llama-cli/src/main.rs index 3bddc674..781948b2 100644 --- a/llama-cli/src/main.rs +++ b/llama-cli/src/main.rs @@ -5,10 +5,9 @@ use cli_args::Args; use rustyline::error::ReadlineError; use llama::convert::convert_pth_to_ggml; -use llama_rs::{InferenceError, Model}; +use llama_rs::{InferenceError, Model, snapshot}; mod cli_args; -mod snapshot; fn main() { env_logger::builder() diff --git a/llama-rs/Cargo.toml b/llama-rs/Cargo.toml index 26f863b1..2fb3289a 100644 --- a/llama-rs/Cargo.toml +++ b/llama-rs/Cargo.toml @@ -13,6 +13,9 @@ bytemuck = { workspace = true } rand = { workspace = true } serde = { workspace = true } -thiserror = "1.0" +bincode = "1.3.3" +log = "0.4" partial_sort = "0.2.0" -serde_bytes = "0.11" \ No newline at end of file +thiserror = "1.0" +serde_bytes = "0.11" +zstd = { version = "0.12", default-features = false } diff --git a/llama-rs/src/lib.rs b/llama-rs/src/lib.rs index b7888184..1c0ea6dd 100644 --- a/llama-rs/src/lib.rs +++ b/llama-rs/src/lib.rs @@ -3,11 +3,14 @@ use thiserror::Error; -mod inference_session; /// Large language model pub mod model; /// Utilities pub mod util; +/// Snapshots +pub mod snapshot; + +mod inference_session; mod vocabulary; pub use ggml::Type as ElementType; diff --git a/llama-cli/src/snapshot.rs b/llama-rs/src/snapshot.rs similarity index 89% rename from llama-cli/src/snapshot.rs rename to llama-rs/src/snapshot.rs index 0dbe7de2..0bd6903a 100644 --- a/llama-cli/src/snapshot.rs +++ b/llama-rs/src/snapshot.rs @@ -1,10 +1,12 @@ -use llama_rs::{InferenceSession, InferenceSessionParameters, Model}; use std::{ error::Error, fs::File, io::{BufReader, BufWriter}, path::Path, }; + +use crate::{InferenceSession, InferenceSessionParameters, Model}; + use zstd::{ stream::{read::Decoder, write::Encoder}, zstd_safe::CompressionLevel, @@ -12,13 +14,14 @@ use zstd::{ const SNAPSHOT_COMPRESSION_LEVEL: CompressionLevel = 1; +/// Read or create a session pub fn read_or_create_session( - model: &llama::Llama, + model: &impl Model, persist_session: Option<&Path>, load_session: Option<&Path>, inference_session_params: InferenceSessionParameters, ) -> (InferenceSession, bool) { - fn load(model: &llama::Llama, path: &Path) -> InferenceSession { + fn load(model: &impl Model, path: &Path) -> InferenceSession { let file = unwrap_or_exit(File::open(path), || format!("Could not open file {path:?}")); let decoder = unwrap_or_exit(Decoder::new(BufReader::new(file)), || { format!("Could not create decoder for {path:?}") @@ -40,7 +43,8 @@ pub fn read_or_create_session( } } -pub fn write_session(mut session: llama_rs::InferenceSession, path: &Path) { +/// Write the session +pub fn write_session(mut session: InferenceSession, path: &Path) { // SAFETY: the session is consumed here, so nothing else can access it. 
let snapshot = unsafe { session.get_snapshot() }; let file = unwrap_or_exit(File::create(path), || { diff --git a/llama/Cargo.toml b/llama/Cargo.toml index 6fbf1740..9f933f5a 100644 --- a/llama/Cargo.toml +++ b/llama/Cargo.toml @@ -6,14 +6,12 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -bytemuck = { workspace = true } ggml = { path = "../ggml" } llama-rs = { path = "../llama-rs" } +bytemuck = { workspace = true } serde = { workspace = true } -thiserror = "1.0" - # Used for the `convert` feature serde_json = { version = "1.0", optional = true } protobuf = { version = "= 2.14.0", optional = true } diff --git a/llama/src/convert.rs b/llama/src/convert.rs index 679fcb42..e9cac557 100644 --- a/llama/src/convert.rs +++ b/llama/src/convert.rs @@ -16,7 +16,9 @@ use std::{ vec, }; -use crate::{ggml_loader::find_all_model_files, Hyperparameters, Vocabulary}; +use ggml::loader::find_all_model_files; + +use crate::{Hyperparameters, Vocabulary}; /// Converts a `pth` file to a `ggml` file. pub fn convert_pth_to_ggml(model_directory: &Path, element_type: ggml::Type) { diff --git a/llama/src/ggml_loader.rs b/llama/src/ggml_loader.rs index 903e2f56..c38c3401 100644 --- a/llama/src/ggml_loader.rs +++ b/llama/src/ggml_loader.rs @@ -1,11 +1,14 @@ use std::{ - collections::HashMap, - io::{BufRead, Read, Seek, SeekFrom}, - path::{Path, PathBuf}, + collections::HashMap, + io::{BufRead, Read, Seek, SeekFrom}, + path::Path, }; +use ggml::loader::{ + find_all_model_files, read_bytes_with_len, read_f32, read_i32, read_string, read_u32, + LoadError, LoadProgress, +}; use llama_rs::{mulf, TokenId, Vocabulary}; -use thiserror::Error; use crate::{Hyperparameters, Llama}; @@ -13,7 +16,7 @@ use crate::{Hyperparameters, Llama}; pub fn load( path: impl AsRef, n_context_tokens: usize, - mut load_progress_callback: impl FnMut(LoadProgress), + mut load_progress_callback: impl FnMut(LoadProgress), ) -> Result { use std::fs::File; use std::io::BufReader; @@ -390,261 +393,3 @@ pub fn load( Ok(model) } - -/// Each variant represents a step within the process of loading the model. -/// These can be used to report progress to the user. -#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug)] -pub enum LoadProgress<'a> { - /// The hyperparameters have been loaded from the model. - HyperparametersLoaded(&'a Hyperparameters), - /// The context has been created. - ContextSize { - /// The size of the context. - bytes: usize, - }, - /// A part of the model is being loaded. - PartLoading { - /// The path to the model part. - file: &'a Path, - /// The current part (0-indexed). - current_part: usize, - /// The number of total parts. - total_parts: usize, - }, - /// A tensor from the current part has been loaded. - PartTensorLoaded { - /// The path to the model part. - file: &'a Path, - /// The current tensor (0-indexed). - current_tensor: usize, - /// The number of total tensors. - tensor_count: usize, - }, - /// A model part has finished fully loading. - PartLoaded { - /// The path to the model part. - file: &'a Path, - /// The number of bytes in the part. - byte_size: usize, - /// The number of tensors in the part. - tensor_count: usize, - }, -} - -#[derive(Error, Debug)] -/// Errors encountered during the loading process. -pub enum LoadError { - #[error("could not open file {path:?}")] - /// A file failed to open. - OpenFileFailed { - /// The original error. - source: std::io::Error, - /// The path that failed. 
- path: PathBuf, - }, - #[error("no parent path for {path:?}")] - /// There is no parent path for a given path. - NoParentPath { - /// The path without a parent. - path: PathBuf, - }, - #[error("unable to read exactly {bytes} bytes")] - /// Reading exactly `bytes` from a file failed. - ReadExactFailed { - /// The original error. - source: std::io::Error, - /// The number of bytes that were attempted to be read. - bytes: usize, - }, - #[error("non-specific I/O error")] - /// A non-specific IO error. - IO(#[from] std::io::Error), - #[error("could not convert bytes to a UTF-8 string")] - /// One of the strings encountered was not valid UTF-8. - InvalidUtf8(#[from] std::string::FromUtf8Error), - #[error("invalid integer conversion")] - /// One of the integers encountered could not be converted to a more appropriate type. - InvalidIntegerConversion(#[from] std::num::TryFromIntError), - #[error("invalid magic number for {path:?}")] - /// An invalid magic number was encountered during the loading process. - InvalidMagic { - /// The path that failed. - path: PathBuf, - }, - #[error("invalid file format version {value}")] - /// The version of the format is not supported by this version of `llama-rs`. - InvalidFormatVersion { - /// The version that was encountered. - value: u32, - }, - #[error("invalid value {ftype} for `f16` in hyperparameters")] - /// The `f16` hyperparameter had an invalid value. - HyperparametersF16Invalid { - /// The format type that was encountered. - ftype: u32, - }, - #[error("unknown tensor `{tensor_name}` in {path:?}")] - /// The tensor `tensor_name` was encountered during the loading of `path`, but was not seen during - /// the model prelude. - UnknownTensor { - /// The name of the tensor. - tensor_name: String, - /// The path that failed. - path: PathBuf, - }, - #[error("the tensor `{tensor_name}` has the wrong size in {path:?}")] - /// The tensor `tensor_name` did not match its expected size. - TensorWrongSize { - /// The name of the tensor. - tensor_name: String, - /// The path that failed. - path: PathBuf, - }, - /// The tensor `tensor_name` did not have the expected format type. - #[error("invalid ftype {ftype} for tensor `{tensor_name}` in {path:?}")] - InvalidFtype { - /// The name of the tensor. - tensor_name: String, - /// The format type that was encountered. - ftype: u32, - /// The path that failed. - path: PathBuf, - }, -} - -/// Read bytes -pub fn read_bytes(reader: &mut impl BufRead) -> Result<[u8; N], LoadError> { - let mut bytes = [0u8; N]; - reader - .read_exact(&mut bytes) - .map_err(|e| LoadError::ReadExactFailed { - source: e, - bytes: N, - })?; - Ok(bytes) -} - -/// Ready bytes with length -pub fn read_bytes_with_len(reader: &mut impl BufRead, len: usize) -> Result, LoadError> { - let mut bytes = vec![0u8; len]; - reader - .read_exact(&mut bytes) - .map_err(|e| LoadError::ReadExactFailed { - source: e, - bytes: len, - })?; - Ok(bytes) -} - -/// Read an i32 -pub fn read_i32(reader: &mut impl BufRead) -> Result { - Ok(i32::from_le_bytes(read_bytes::<4>(reader)?)) -} - -/// Read a u32 -pub fn read_u32(reader: &mut impl BufRead) -> Result { - Ok(u32::from_le_bytes(read_bytes::<4>(reader)?)) -} - -/// Read an f32 -pub fn read_f32(reader: &mut impl BufRead) -> Result { - Ok(f32::from_le_bytes(read_bytes::<4>(reader)?)) -} - -/// Helper function. Reads a string from the buffer and returns it. -pub fn read_string(reader: &mut impl BufRead, len: usize) -> Result { - Ok(String::from_utf8(read_bytes_with_len(reader, len)?)?) 
-} - - - -pub fn find_all_model_files(main_path: &Path) -> Result, LoadError> { - Ok(collect_related_paths( - main_path, - std::fs::read_dir(main_path.parent().ok_or_else(|| LoadError::NoParentPath { - path: main_path.to_owned(), - })?)? - .filter_map(Result::ok) - .map(|de| de.path()), - )) -} - -fn collect_related_paths( - main_path: &Path, - directory_paths: impl Iterator, -) -> Vec { - let main_filename = main_path.file_name().and_then(|p| p.to_str()); - - let mut paths: Vec = directory_paths - .filter(|p| { - p.file_name() - .and_then(|p| p.to_str()) - .zip(main_filename) - .map(|(part_filename, main_filename)| { - match part_filename.strip_prefix(main_filename) { - Some(suffix) => { - suffix.is_empty() - || (suffix - .strip_prefix('.') - .map(|s| s.parse::().is_ok()) - .unwrap_or(false)) - } - None => false, - } - }) - .unwrap_or(false) - }) - .collect(); - paths.sort(); - paths -} - -#[cfg(test)] -mod tests { - use super::*; - - use llama_rs::util::TokenUtf8Buffer; - - #[test] - fn test_collect_related_paths() { - let main_path = PathBuf::from("/models/llama.bin"); - let directory_paths = [ - "/models/llama.bin", - "/models/llama.bin.1", - "/models/llama.bin.2", - "/models/llama.bin.tmp", - ] - .map(PathBuf::from); - let expected_paths = [ - "/models/llama.bin", - "/models/llama.bin.1", - "/models/llama.bin.2", - ] - .map(PathBuf::from); - - let output_paths = collect_related_paths(&main_path, directory_paths.into_iter()); - assert_eq!(expected_paths.as_slice(), output_paths); - } - - #[test] - fn test_valid_utf8() { - let mut buffer = TokenUtf8Buffer::new(); - assert_eq!(buffer.push(b"hello").as_deref(), Some("hello")); - assert_eq!(buffer.push(&[0xE2, 0x82, 0xAC]).as_deref(), Some("€")); - } - - #[test] - fn test_partial_utf8() { - let mut buffer = TokenUtf8Buffer::new(); - assert_eq!(buffer.push(&[0xE2, 0x82]).as_deref(), None); - assert_eq!(buffer.push(&[0xAC]).as_deref(), Some("€")); - } - - #[test] - fn test_invalid_prelude_for_valid_utf8() { - let mut buffer = TokenUtf8Buffer::new(); - assert_eq!(buffer.push(&[0xD8]).as_deref(), None); - assert_eq!(buffer.push(&[0xE2, 0x82]).as_deref(), None); - assert_eq!(buffer.push(&[0xAC]).as_deref(), Some("€")); - } -} diff --git a/llama/src/lib.rs b/llama/src/lib.rs index bdb79ce6..41eb47ef 100644 --- a/llama/src/lib.rs +++ b/llama/src/lib.rs @@ -2,14 +2,15 @@ use std::collections::HashMap; use serde::Deserialize; -use llama_rs::{InferenceSession, InferenceSessionParameters, model::Model, Vocabulary}; +use ggml::loader::{LoadError, LoadProgress}; +use llama_rs::{model::Model, InferenceSession, InferenceSessionParameters, Vocabulary}; #[cfg(feature = "convert")] pub mod convert; mod ggml_loader; -pub use ggml_loader::{load, LoadError, LoadProgress}; +pub use ggml_loader::load; pub struct Llama { pub(crate) hparams: Hyperparameters, @@ -348,7 +349,7 @@ impl Llama { pub fn load( path: impl AsRef, n_context_tokens: usize, - load_progress_callback: impl FnMut(LoadProgress), + load_progress_callback: impl FnMut(LoadProgress), ) -> Result { load(path, n_context_tokens, load_progress_callback) } From 6bfda758453b0f01dfbee8c817c489dbeb6a9cdf Mon Sep 17 00:00:00 2001 From: Dan Forbes Date: Sun, 16 Apr 2023 07:50:39 -0700 Subject: [PATCH 03/35] cargo fmt --- ggml/src/loader.rs | 3 ++- llama-cli/src/main.rs | 2 +- llama-rs/src/lib.rs | 4 ++-- llama-rs/src/model.rs | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/ggml/src/loader.rs b/ggml/src/loader.rs index d371ce5f..7851fadc 100644 --- a/ggml/src/loader.rs +++ 
b/ggml/src/loader.rs @@ -1,6 +1,7 @@ use std::{ + fmt::Debug, io::BufRead, - path::{Path, PathBuf}, fmt::Debug, + path::{Path, PathBuf}, }; use thiserror::Error; diff --git a/llama-cli/src/main.rs b/llama-cli/src/main.rs index 781948b2..df0a3f60 100644 --- a/llama-cli/src/main.rs +++ b/llama-cli/src/main.rs @@ -5,7 +5,7 @@ use cli_args::Args; use rustyline::error::ReadlineError; use llama::convert::convert_pth_to_ggml; -use llama_rs::{InferenceError, Model, snapshot}; +use llama_rs::{snapshot, InferenceError, Model}; mod cli_args; diff --git a/llama-rs/src/lib.rs b/llama-rs/src/lib.rs index 1c0ea6dd..04f4c8e1 100644 --- a/llama-rs/src/lib.rs +++ b/llama-rs/src/lib.rs @@ -5,10 +5,10 @@ use thiserror::Error; /// Large language model pub mod model; -/// Utilities -pub mod util; /// Snapshots pub mod snapshot; +/// Utilities +pub mod util; mod inference_session; mod vocabulary; diff --git a/llama-rs/src/model.rs b/llama-rs/src/model.rs index 91a502a1..9991e7a5 100644 --- a/llama-rs/src/model.rs +++ b/llama-rs/src/model.rs @@ -14,7 +14,7 @@ pub trait Model { /// Starts a new `InferenceSession` for this model. fn start_session(&self, params: InferenceSessionParameters) -> InferenceSession; - + /// Evaluates the transformer. /// /// The provided `output_request` struct lets you specify which additional From 73f59c3e7766ae5bfdc6818bfaed2483e639f03e Mon Sep 17 00:00:00 2001 From: Dan Forbes Date: Sun, 16 Apr 2023 08:06:26 -0700 Subject: [PATCH 04/35] Rename llama-rs to llm-base --- Cargo.lock | 10 +++++----- Cargo.toml | 2 +- bloom-cli/Cargo.toml | 2 +- bloom-cli/src/cli_args.rs | 12 ++++++------ bloom-cli/src/main.rs | 11 ++++++----- bloom/Cargo.toml | 2 +- bloom/src/ggml_loader.rs | 2 +- bloom/src/lib.rs | 15 +++++++++------ llama-cli/Cargo.toml | 2 +- llama-cli/src/cli_args.rs | 12 ++++++------ llama-cli/src/main.rs | 11 ++++++----- llama/Cargo.toml | 2 +- llama/src/ggml_loader.rs | 2 +- llama/src/lib.rs | 11 +++++++---- {llama-rs => llm-base}/Cargo.toml | 2 +- {llama-rs => llm-base}/src/inference_session.rs | 0 {llama-rs => llm-base}/src/lib.rs | 0 {llama-rs => llm-base}/src/model.rs | 0 {llama-rs => llm-base}/src/snapshot.rs | 0 {llama-rs => llm-base}/src/util.rs | 0 {llama-rs => llm-base}/src/vocabulary.rs | 0 21 files changed, 53 insertions(+), 45 deletions(-) rename {llama-rs => llm-base}/Cargo.toml (96%) rename {llama-rs => llm-base}/src/inference_session.rs (100%) rename {llama-rs => llm-base}/src/lib.rs (100%) rename {llama-rs => llm-base}/src/model.rs (100%) rename {llama-rs => llm-base}/src/snapshot.rs (100%) rename {llama-rs => llm-base}/src/util.rs (100%) rename {llama-rs => llm-base}/src/vocabulary.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index 06175af9..641bc672 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -66,7 +66,7 @@ version = "0.1.0" dependencies = [ "bytemuck", "ggml", - "llama-rs", + "llm-base", ] [[package]] @@ -77,7 +77,7 @@ dependencies = [ "clap", "env_logger", "ggml", - "llama-rs", + "llm-base", "log", "num_cpus", "once_cell", @@ -490,7 +490,7 @@ version = "0.1.0" dependencies = [ "bytemuck", "ggml", - "llama-rs", + "llm-base", "protobuf", "rust_tokenizers", "serde", @@ -505,7 +505,7 @@ dependencies = [ "env_logger", "ggml", "llama", - "llama-rs", + "llm-base", "log", "num_cpus", "once_cell", @@ -515,7 +515,7 @@ dependencies = [ ] [[package]] -name = "llama-rs" +name = "llm-base" version = "0.1.0" dependencies = [ "bincode", diff --git a/Cargo.toml b/Cargo.toml index bef02276..2d12def6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,8 +5,8 @@ members = [ "ggml-sys", 
"ggml", "llama", - "llama-rs", "llama-cli", + "llm-base", "generate-ggml-bindings" ] resolver = "2" diff --git a/bloom-cli/Cargo.toml b/bloom-cli/Cargo.toml index 1d3de63d..7cfc3108 100644 --- a/bloom-cli/Cargo.toml +++ b/bloom-cli/Cargo.toml @@ -8,7 +8,7 @@ edition = "2021" [dependencies] bloom = { path = "../bloom" } ggml = { path = "../ggml" } -llama-rs = { path = "../llama-rs" } +llm-base = { path = "../llm-base" } log = { workspace = true } rand = { workspace = true } diff --git a/bloom-cli/src/cli_args.rs b/bloom-cli/src/cli_args.rs index 91f03fe2..c0e479a4 100644 --- a/bloom-cli/src/cli_args.rs +++ b/bloom-cli/src/cli_args.rs @@ -5,7 +5,7 @@ use rand::SeedableRng; use ggml::loader::load_progress; -use llama_rs::{ +use llm_base::{ InferenceParameters, InferenceSessionParameters, ModelKVMemoryType, TokenBias, EOT_TOKEN_ID, }; @@ -321,13 +321,13 @@ pub enum ElementType { /// Float 32-bit. F32, } -impl From for llama_rs::ElementType { +impl From for llm_base::ElementType { fn from(model_type: ElementType) -> Self { match model_type { - ElementType::Q4_0 => llama_rs::ElementType::Q4_0, - ElementType::Q4_1 => llama_rs::ElementType::Q4_1, - ElementType::F16 => llama_rs::ElementType::F16, - ElementType::F32 => llama_rs::ElementType::F32, + ElementType::Q4_0 => llm_base::ElementType::Q4_0, + ElementType::Q4_1 => llm_base::ElementType::Q4_1, + ElementType::F16 => llm_base::ElementType::F16, + ElementType::F32 => llm_base::ElementType::F32, } } } diff --git a/bloom-cli/src/main.rs b/bloom-cli/src/main.rs index 026984ff..f91207ff 100644 --- a/bloom-cli/src/main.rs +++ b/bloom-cli/src/main.rs @@ -4,7 +4,7 @@ use clap::Parser; use cli_args::Args; use rustyline::error::ReadlineError; -use llama_rs::{snapshot, InferenceError, Model}; +use llm_base::{snapshot, InferenceError, Model}; mod cli_args; @@ -53,14 +53,15 @@ fn infer(args: &cli_args::Infer) { match res { Ok(_) => (), - Err(llama_rs::InferenceError::ContextFull) => { + Err(InferenceError::ContextFull) => { log::warn!("Context window full, stopping inference.") } - Err(llama_rs::InferenceError::TokenizationFailed) => { + Err(InferenceError::TokenizationFailed) => { log::error!("Failed to tokenize initial prompt."); } - Err(llama_rs::InferenceError::UserCallback(_)) - | Err(llama_rs::InferenceError::EndOfText) => unreachable!("cannot fail"), + Err(InferenceError::UserCallback(_)) | Err(InferenceError::EndOfText) => { + unreachable!("cannot fail") + } } if let Some(session_path) = args.save_session.as_ref().or(args.persist_session.as_ref()) { diff --git a/bloom/Cargo.toml b/bloom/Cargo.toml index 5f2a108e..1cf85379 100644 --- a/bloom/Cargo.toml +++ b/bloom/Cargo.toml @@ -7,6 +7,6 @@ edition = "2021" [dependencies] ggml = { path = "../ggml" } -llama-rs = { path = "../llama-rs" } +llm-base = { path = "../llm-base" } bytemuck = { workspace = true } diff --git a/bloom/src/ggml_loader.rs b/bloom/src/ggml_loader.rs index 1671db35..99b064a5 100644 --- a/bloom/src/ggml_loader.rs +++ b/bloom/src/ggml_loader.rs @@ -8,7 +8,7 @@ use ggml::loader::{ find_all_model_files, read_bytes_with_len, read_f32, read_i32, read_string, read_u32, LoadError, LoadProgress, }; -use llama_rs::{mulf, TokenId, Vocabulary}; +use llm_base::{mulf, TokenId, Vocabulary}; use crate::{Bloom, Hyperparameters}; diff --git a/bloom/src/lib.rs b/bloom/src/lib.rs index aa050fef..56753964 100644 --- a/bloom/src/lib.rs +++ b/bloom/src/lib.rs @@ -1,7 +1,10 @@ use std::collections::HashMap; use ggml::loader::{LoadError, LoadProgress}; -use llama_rs::{InferenceSession, Model, Vocabulary}; +use 
llm_base::{ + EvaluateOutputRequest, InferenceParameters, InferenceSession, InferenceSessionParameters, + Model, TokenId, Vocabulary, +}; mod ggml_loader; @@ -29,7 +32,7 @@ impl Model for Bloom { type Hyperparameters = Hyperparameters; type Layer = Layer; - fn start_session(&self, params: llama_rs::InferenceSessionParameters) -> InferenceSession { + fn start_session(&self, params: InferenceSessionParameters) -> InferenceSession { InferenceSession::new( params, self.hparams.n_ctx, @@ -41,10 +44,10 @@ impl Model for Bloom { fn evaluate( &self, - session: &mut llama_rs::InferenceSession, - params: &llama_rs::InferenceParameters, - input_tokens: &[llama_rs::TokenId], - output_request: &mut llama_rs::EvaluateOutputRequest, + session: &mut InferenceSession, + params: &InferenceParameters, + input_tokens: &[TokenId], + output_request: &mut EvaluateOutputRequest, ) { let n = input_tokens.len(); let n_past = session.n_past; diff --git a/llama-cli/Cargo.toml b/llama-cli/Cargo.toml index 28bfd223..2f900708 100644 --- a/llama-cli/Cargo.toml +++ b/llama-cli/Cargo.toml @@ -8,7 +8,7 @@ edition = "2021" [dependencies] ggml = { path = "../ggml" } llama = { path = "../llama", features = ["convert"] } -llama-rs = { path = "../llama-rs" } +llm-base = { path = "../llm-base" } log = { workspace = true } rand = { workspace = true } diff --git a/llama-cli/src/cli_args.rs b/llama-cli/src/cli_args.rs index bc2dcb6d..a3040ccc 100644 --- a/llama-cli/src/cli_args.rs +++ b/llama-cli/src/cli_args.rs @@ -5,7 +5,7 @@ use rand::SeedableRng; use ggml::loader::load_progress; -use llama_rs::{ +use llm_base::{ InferenceParameters, InferenceSessionParameters, ModelKVMemoryType, TokenBias, EOT_TOKEN_ID, }; @@ -330,13 +330,13 @@ pub enum ElementType { /// Float 32-bit. F32, } -impl From for llama_rs::ElementType { +impl From for llm_base::ElementType { fn from(model_type: ElementType) -> Self { match model_type { - ElementType::Q4_0 => llama_rs::ElementType::Q4_0, - ElementType::Q4_1 => llama_rs::ElementType::Q4_1, - ElementType::F16 => llama_rs::ElementType::F16, - ElementType::F32 => llama_rs::ElementType::F32, + ElementType::Q4_0 => llm_base::ElementType::Q4_0, + ElementType::Q4_1 => llm_base::ElementType::Q4_1, + ElementType::F16 => llm_base::ElementType::F16, + ElementType::F32 => llm_base::ElementType::F32, } } } diff --git a/llama-cli/src/main.rs b/llama-cli/src/main.rs index df0a3f60..add904fe 100644 --- a/llama-cli/src/main.rs +++ b/llama-cli/src/main.rs @@ -5,7 +5,7 @@ use cli_args::Args; use rustyline::error::ReadlineError; use llama::convert::convert_pth_to_ggml; -use llama_rs::{snapshot, InferenceError, Model}; +use llm_base::{snapshot, InferenceError, Model}; mod cli_args; @@ -55,14 +55,15 @@ fn infer(args: &cli_args::Infer) { match res { Ok(_) => (), - Err(llama_rs::InferenceError::ContextFull) => { + Err(InferenceError::ContextFull) => { log::warn!("Context window full, stopping inference.") } - Err(llama_rs::InferenceError::TokenizationFailed) => { + Err(InferenceError::TokenizationFailed) => { log::error!("Failed to tokenize initial prompt."); } - Err(llama_rs::InferenceError::UserCallback(_)) - | Err(llama_rs::InferenceError::EndOfText) => unreachable!("cannot fail"), + Err(InferenceError::UserCallback(_)) | Err(InferenceError::EndOfText) => { + unreachable!("cannot fail") + } } if let Some(session_path) = args.save_session.as_ref().or(args.persist_session.as_ref()) { diff --git a/llama/Cargo.toml b/llama/Cargo.toml index 9f933f5a..3eef6510 100644 --- a/llama/Cargo.toml +++ b/llama/Cargo.toml @@ -7,7 +7,7 @@ 
edition = "2021" [dependencies] ggml = { path = "../ggml" } -llama-rs = { path = "../llama-rs" } +llm-base = { path = "../llm-base" } bytemuck = { workspace = true } serde = { workspace = true } diff --git a/llama/src/ggml_loader.rs b/llama/src/ggml_loader.rs index c38c3401..5723540f 100644 --- a/llama/src/ggml_loader.rs +++ b/llama/src/ggml_loader.rs @@ -8,7 +8,7 @@ use ggml::loader::{ find_all_model_files, read_bytes_with_len, read_f32, read_i32, read_string, read_u32, LoadError, LoadProgress, }; -use llama_rs::{mulf, TokenId, Vocabulary}; +use llm_base::{mulf, TokenId, Vocabulary}; use crate::{Hyperparameters, Llama}; diff --git a/llama/src/lib.rs b/llama/src/lib.rs index 41eb47ef..96c55250 100644 --- a/llama/src/lib.rs +++ b/llama/src/lib.rs @@ -3,7 +3,10 @@ use std::collections::HashMap; use serde::Deserialize; use ggml::loader::{LoadError, LoadProgress}; -use llama_rs::{model::Model, InferenceSession, InferenceSessionParameters, Vocabulary}; +use llm_base::{ + EvaluateOutputRequest, InferenceParameters, InferenceSession, InferenceSessionParameters, + Model, TokenId, Vocabulary, +}; #[cfg(feature = "convert")] pub mod convert; @@ -48,9 +51,9 @@ impl Model for Llama { fn evaluate( &self, session: &mut InferenceSession, - params: &llama_rs::InferenceParameters, - input_tokens: &[llama_rs::TokenId], - output_request: &mut llama_rs::EvaluateOutputRequest, + params: &InferenceParameters, + input_tokens: &[TokenId], + output_request: &mut EvaluateOutputRequest, ) { let n = input_tokens.len(); let n_past = session.n_past; diff --git a/llama-rs/Cargo.toml b/llm-base/Cargo.toml similarity index 96% rename from llama-rs/Cargo.toml rename to llm-base/Cargo.toml index 2fb3289a..d1dd70a4 100644 --- a/llama-rs/Cargo.toml +++ b/llm-base/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "llama-rs" +name = "llm-base" version = { workspace = true } edition = "2021" rust-version = "1.65" diff --git a/llama-rs/src/inference_session.rs b/llm-base/src/inference_session.rs similarity index 100% rename from llama-rs/src/inference_session.rs rename to llm-base/src/inference_session.rs diff --git a/llama-rs/src/lib.rs b/llm-base/src/lib.rs similarity index 100% rename from llama-rs/src/lib.rs rename to llm-base/src/lib.rs diff --git a/llama-rs/src/model.rs b/llm-base/src/model.rs similarity index 100% rename from llama-rs/src/model.rs rename to llm-base/src/model.rs diff --git a/llama-rs/src/snapshot.rs b/llm-base/src/snapshot.rs similarity index 100% rename from llama-rs/src/snapshot.rs rename to llm-base/src/snapshot.rs diff --git a/llama-rs/src/util.rs b/llm-base/src/util.rs similarity index 100% rename from llama-rs/src/util.rs rename to llm-base/src/util.rs diff --git a/llama-rs/src/vocabulary.rs b/llm-base/src/vocabulary.rs similarity index 100% rename from llama-rs/src/vocabulary.rs rename to llm-base/src/vocabulary.rs From e670c258d64fff546e9bf5cb2dad0ad0fb7acf39 Mon Sep 17 00:00:00 2001 From: Dan Forbes Date: Sun, 16 Apr 2023 08:07:52 -0700 Subject: [PATCH 05/35] Clippy --- bloom/src/ggml_loader.rs | 8 ++++---- bloom/src/lib.rs | 18 +++++++++--------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/bloom/src/ggml_loader.rs b/bloom/src/ggml_loader.rs index 99b064a5..7c8ff1cf 100644 --- a/bloom/src/ggml_loader.rs +++ b/bloom/src/ggml_loader.rs @@ -160,7 +160,7 @@ pub fn load( }; // Initialize the context - let context = ggml::Context::init(ctx_size as usize); + let context = ggml::Context::init(ctx_size); let model = Bloom::new(context, hparams, vocabulary, n_ff, wtype); @@ -313,7 +313,7 @@ 
pub fn load( }; if n_dims == 1 || n_parts == 1 { - if (nelements as usize * bpe) / ggml::blck_size(tensor.get_type()) as usize + if (nelements * bpe) / ggml::blck_size(tensor.get_type()) != tensor.nbytes() { return Err(LoadError::TensorWrongSize { @@ -335,7 +335,7 @@ pub fn load( total_size += tensor.nbytes(); } else { - if (nelements as usize * bpe) / ggml::blck_size(tensor.get_type()) as usize + if (nelements * bpe) / ggml::blck_size(tensor.get_type()) != tensor.nbytes() / n_parts { return Err(LoadError::TensorWrongSize { @@ -356,7 +356,7 @@ pub fn load( let offset_row = i1 as usize * row_size; let offset = offset_row + ((part_id * np0 as usize) - / ggml::blck_size(tensor.get_type()) as usize) + / ggml::blck_size(tensor.get_type())) * ggml::type_size(tensor.get_type()); // SAFETY: yolo, same as original code unsafe { diff --git a/bloom/src/lib.rs b/bloom/src/lib.rs index 56753964..a5150a30 100644 --- a/bloom/src/lib.rs +++ b/bloom/src/lib.rs @@ -87,7 +87,7 @@ impl Model for Bloom { input_layer = ctx0.op_add(&ctx0.op_repeat(&self.norm_b, &input_layer), &input_layer); } - for il in 0..n_layer as usize { + for il in 0..n_layer { let input_self_attention = input_layer.share(); let mut current: ggml::Tensor; @@ -138,15 +138,15 @@ impl Model for Bloom { let k = ctx0.op_view_1d( &session.memory_k, n * n_embd, - (session.memory_k.element_size() * n_embd as usize) - * (il * n_ctx as usize + n_past as usize), + (session.memory_k.element_size() * n_embd) + * (il * n_ctx + n_past), ); let v = ctx0.op_view_1d( &session.memory_v, n * n_embd, - (session.memory_v.element_size() * n_embd as usize) - * (il * n_ctx as usize + n_past as usize), + (session.memory_v.element_size() * n_embd) + * (il * n_ctx + n_past), ); gf.build_forward_expand(&ctx0.op_cpy(&k_current, &k)); @@ -307,17 +307,17 @@ impl Model for Bloom { // return result for just the last token // SAFETY: yolo - assert_eq!(session.last_logits.len(), n_vocab as usize); + assert_eq!(session.last_logits.len(), { n_vocab }); unsafe { input_layer.read_data( - n_vocab as usize * (n - 1) * std::mem::size_of::(), + n_vocab * (n - 1) * std::mem::size_of::(), bytemuck::cast_slice_mut(&mut session.last_logits), ) }; // Extract logits if let Some(all_logits) = &mut output_request.all_logits { - all_logits.resize(n_vocab as usize * n, 0.0); + all_logits.resize(n_vocab * n, 0.0); // SAFETY: Tensor data can be read (properly aligned, initialized, // data will not be mutated or otherwise aliased during the copy), // and we're not reading past the end of the tensor data. @@ -329,7 +329,7 @@ impl Model for Bloom { // Extract embeddings if let Some(embeddings) = &mut output_request.embeddings { - embeddings.resize(n_embd as usize * n, 0.0); + embeddings.resize(n_embd * n, 0.0); // SAFETY: Same rationale as for the "Extract logits" section applies. 
assert_eq!(embeddings_tensor.nelements(), n_embd * n); unsafe { From c4b41761d1003a048db889143f9ebc396fee9e2b Mon Sep 17 00:00:00 2001 From: Dan Forbes Date: Sun, 16 Apr 2023 08:23:00 -0700 Subject: [PATCH 06/35] Remove redundant associated Model type from Model trait --- bloom/src/lib.rs | 1 - llama/src/lib.rs | 1 - llm-base/src/model.rs | 2 -- 3 files changed, 4 deletions(-) diff --git a/bloom/src/lib.rs b/bloom/src/lib.rs index a5150a30..d3293e36 100644 --- a/bloom/src/lib.rs +++ b/bloom/src/lib.rs @@ -28,7 +28,6 @@ pub struct Bloom { } impl Model for Bloom { - type Model = Bloom; type Hyperparameters = Hyperparameters; type Layer = Layer; diff --git a/llama/src/lib.rs b/llama/src/lib.rs index 96c55250..3b5e2633 100644 --- a/llama/src/lib.rs +++ b/llama/src/lib.rs @@ -34,7 +34,6 @@ pub struct Llama { } impl Model for Llama { - type Model = Llama; type Hyperparameters = Hyperparameters; type Layer = Layer; diff --git a/llm-base/src/model.rs b/llm-base/src/model.rs index 9991e7a5..94a74124 100644 --- a/llm-base/src/model.rs +++ b/llm-base/src/model.rs @@ -5,8 +5,6 @@ use crate::{ /// A large language model. pub trait Model { - /// The model type. - type Model; /// Hyperparameters for the model type Hyperparameters; /// Layer for the model From 1cf305f94ea37bbb997ec729b256697af1cbfbc9 Mon Sep 17 00:00:00 2001 From: Dan Forbes Date: Sun, 16 Apr 2023 08:24:37 -0700 Subject: [PATCH 07/35] Remove associated Layer type from Model trait --- bloom/src/lib.rs | 1 - llama/src/lib.rs | 1 - llm-base/src/model.rs | 2 -- 3 files changed, 4 deletions(-) diff --git a/bloom/src/lib.rs b/bloom/src/lib.rs index d3293e36..6d06c3b0 100644 --- a/bloom/src/lib.rs +++ b/bloom/src/lib.rs @@ -29,7 +29,6 @@ pub struct Bloom { impl Model for Bloom { type Hyperparameters = Hyperparameters; - type Layer = Layer; fn start_session(&self, params: InferenceSessionParameters) -> InferenceSession { InferenceSession::new( diff --git a/llama/src/lib.rs b/llama/src/lib.rs index 3b5e2633..5824eb53 100644 --- a/llama/src/lib.rs +++ b/llama/src/lib.rs @@ -35,7 +35,6 @@ pub struct Llama { impl Model for Llama { type Hyperparameters = Hyperparameters; - type Layer = Layer; fn start_session(&self, params: InferenceSessionParameters) -> InferenceSession { InferenceSession::new( diff --git a/llm-base/src/model.rs b/llm-base/src/model.rs index 94a74124..ea3c7be1 100644 --- a/llm-base/src/model.rs +++ b/llm-base/src/model.rs @@ -7,8 +7,6 @@ use crate::{ pub trait Model { /// Hyperparameters for the model type Hyperparameters; - /// Layer for the model - type Layer; /// Starts a new `InferenceSession` for this model. 
fn start_session(&self, params: InferenceSessionParameters) -> InferenceSession; From 0d4dde9866452d2528d7ff93bdbc2075fc7eaa50 Mon Sep 17 00:00:00 2001 From: Dan Forbes Date: Sun, 16 Apr 2023 08:26:36 -0700 Subject: [PATCH 08/35] cargo fmt --- bloom/src/ggml_loader.rs | 7 ++----- bloom/src/lib.rs | 6 ++---- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/bloom/src/ggml_loader.rs b/bloom/src/ggml_loader.rs index 7c8ff1cf..d03015c7 100644 --- a/bloom/src/ggml_loader.rs +++ b/bloom/src/ggml_loader.rs @@ -313,9 +313,7 @@ pub fn load( }; if n_dims == 1 || n_parts == 1 { - if (nelements * bpe) / ggml::blck_size(tensor.get_type()) - != tensor.nbytes() - { + if (nelements * bpe) / ggml::blck_size(tensor.get_type()) != tensor.nbytes() { return Err(LoadError::TensorWrongSize { tensor_name, path: part_path.to_path_buf(), @@ -355,8 +353,7 @@ pub fn load( for i1 in 0..ne[1] { let offset_row = i1 as usize * row_size; let offset = offset_row - + ((part_id * np0 as usize) - / ggml::blck_size(tensor.get_type())) + + ((part_id * np0 as usize) / ggml::blck_size(tensor.get_type())) * ggml::type_size(tensor.get_type()); // SAFETY: yolo, same as original code unsafe { diff --git a/bloom/src/lib.rs b/bloom/src/lib.rs index 6d06c3b0..b44241ec 100644 --- a/bloom/src/lib.rs +++ b/bloom/src/lib.rs @@ -136,15 +136,13 @@ impl Model for Bloom { let k = ctx0.op_view_1d( &session.memory_k, n * n_embd, - (session.memory_k.element_size() * n_embd) - * (il * n_ctx + n_past), + (session.memory_k.element_size() * n_embd) * (il * n_ctx + n_past), ); let v = ctx0.op_view_1d( &session.memory_v, n * n_embd, - (session.memory_v.element_size() * n_embd) - * (il * n_ctx + n_past), + (session.memory_v.element_size() * n_embd) * (il * n_ctx + n_past), ); gf.build_forward_expand(&ctx0.op_cpy(&k_current, &k)); From 849c28d8623eab309ccf6bf5526aeb37c26d5a56 Mon Sep 17 00:00:00 2001 From: Dan Forbes Date: Sun, 16 Apr 2023 09:07:05 -0700 Subject: [PATCH 09/35] Docs --- README.md | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index f89b9e80..b1ab9d04 100644 --- a/README.md +++ b/README.md @@ -30,10 +30,11 @@ performance as the original code. Make sure you have a Rust 1.65.0 or above and C toolchain[^1] set up. -`llama-rs` is a Rust library, while `llama-cli` is a CLI application that wraps -`llama-rs` and offers basic inference capabilities. +`llm-base`, `bloom`, and `llama` are Rust libraries, while `bloom-cli` and +`llama-cli` are a CLI applications that wrap `bloom` and `llama`, respectively, +and offer basic inference capabilities. -The following instructions explain how to build `llama-cli`. +The following instructions explain how to build the CLI applications. **NOTE**: For best results, make sure to build and run in release mode. Debug builds are going to be very slow. @@ -43,33 +44,34 @@ Debug builds are going to be very slow. Run ```shell -cargo install --git https://github.com/rustformers/llama-rs llama-cli +cargo install --git https://github.com/rustformers/llama-rs bloom-cli llama-cli ``` -to install `llama-cli` to your Cargo `bin` directory, which `rustup` is likely to -have added to your `PATH`. +to install `bloom-cli` and `llama-cli` to your Cargo `bin` directory, which +`rustup` is likely to have added to your `PATH`. -It can then be run through `llama-cli`. +The CLI applications can then be run through `bloom-cli` and `llama-cli`, respectively. 
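For example, once installed, a model can be run non-interactively with the `infer` subcommand (a minimal usage sketch: the `-m`/`-p` flags correspond to the `ModelLoad` and `Infer` argument structs in each CLI's `cli_args.rs`, while the model paths shown are placeholders for wherever your converted GGML weights live):

```shell
# Illustrative paths — point -m at your own converted GGML model file.
llama-cli infer -m ./models/7B/ggml-model.bin -p "The best kind of wine is "
bloom-cli infer -m ./models/bloom/ggml-model.bin -p "The best kind of wine is "
```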
### Building from repository Clone the repository, and then build it through ```shell -cargo build --release --bin llama-cli +cargo build --release ``` -The resulting binary will be at `target/release/llama-cli[.exe]`. +The resulting binaries will be at `target/release/bloom-cli[.exe]` and +`target/release/llama-cli[.exe]`, respectively. -It can also be run directly through Cargo, using +They can also be run directly through Cargo, using ```shell -cargo run --release --bin llama-cli -- +cargo run --release --bin {bloom,llama}-cli -- ``` This is useful for development. -### Getting the weights +### Getting LLaMA weights In order to run the inference code in `llama-rs`, a copy of the model's weights are required. Currently, the only legal source to get the weights is [this @@ -103,6 +105,14 @@ python3 scripts/convert-pth-to-ggml.py /path/to/your/models/7B/ 1 > (versioned) ggml formats, but not the mmap-ready version that was [recently > merged](https://github.com/ggerganov/llama.cpp/pull/613). +### BLOOM + +The open-source [BLOOM](https://bigscience.huggingface.co/blog/bloom) model is +also supported. +[More information](https://huggingface.co/docs/transformers/model_doc/bloom) +about BLOOM is available on HuggingFace, as are some +[quantized models](https://huggingface.co/models?search=bloom%20ggml). + _Support for other open source models is currently planned. For models where weights can be legally distributed, this section will be updated with scripts to make the install process as user-friendly as possible. Due to the model's legal From 54ad89056550b6077da455a2a84ca1c8ae9ecd32 Mon Sep 17 00:00:00 2001 From: Dan Forbes Date: Sun, 16 Apr 2023 10:22:52 -0700 Subject: [PATCH 10/35] Tests and examples --- Cargo.lock | 3 ++ bloom/Cargo.toml | 3 ++ bloom/examples/bloom_inference.rs | 34 +++++++++++++++++++++ ggml/Cargo.toml | 3 ++ ggml/src/loader.rs | 50 +++++++++++++++++++++++++++++++ llama/Cargo.toml | 3 ++ llama/examples/llama_inference.rs | 34 +++++++++++++++++++++ 7 files changed, 130 insertions(+) create mode 100644 bloom/examples/bloom_inference.rs create mode 100644 llama/examples/llama_inference.rs diff --git a/Cargo.lock b/Cargo.lock index 641bc672..c2f03d6b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -67,6 +67,7 @@ dependencies = [ "bytemuck", "ggml", "llm-base", + "rand", ] [[package]] @@ -350,6 +351,7 @@ name = "ggml" version = "0.1.0" dependencies = [ "ggml-sys", + "llm-base", "log", "thiserror", ] @@ -492,6 +494,7 @@ dependencies = [ "ggml", "llm-base", "protobuf", + "rand", "rust_tokenizers", "serde", "serde_json", diff --git a/bloom/Cargo.toml b/bloom/Cargo.toml index 1cf85379..2dd9b0a9 100644 --- a/bloom/Cargo.toml +++ b/bloom/Cargo.toml @@ -10,3 +10,6 @@ ggml = { path = "../ggml" } llm-base = { path = "../llm-base" } bytemuck = { workspace = true } + +[dev-dependencies] +rand = { workspace = true } diff --git a/bloom/examples/bloom_inference.rs b/bloom/examples/bloom_inference.rs new file mode 100644 index 00000000..9109346a --- /dev/null +++ b/bloom/examples/bloom_inference.rs @@ -0,0 +1,34 @@ +use std::{convert::Infallible, env::args, io::Write}; + +use ggml::loader::{load_progress, LoadError}; +use llm_base::snapshot; + +extern crate bloom; + +fn main() -> Result<(), LoadError> { + let args: Vec = args().collect(); + let bloom = bloom::Bloom::load(&args[1], 32, load_progress)?; + let (mut session, _) = snapshot::read_or_create_session( + &bloom, + Default::default(), + Default::default(), + Default::default(), + ); + + let _ = session.inference_with_prompt::( + &bloom, + 
&Default::default(), + "The best kind of wine is ", + Some(32), + &mut rand::thread_rng(), + |t| { + print!("{t}"); + std::io::stdout().flush().unwrap(); + + Ok(()) + }, + ); + + println!(); + Ok(()) +} diff --git a/ggml/Cargo.toml b/ggml/Cargo.toml index 18b508ec..93c3c1c4 100644 --- a/ggml/Cargo.toml +++ b/ggml/Cargo.toml @@ -9,3 +9,6 @@ ggml-sys = { path = "../ggml-sys" } log = { workspace = true } thiserror = "1.0" + +[dev-dependencies] +llm-base = { path = "../llm-base" } diff --git a/ggml/src/loader.rs b/ggml/src/loader.rs index 7851fadc..21dd8973 100644 --- a/ggml/src/loader.rs +++ b/ggml/src/loader.rs @@ -260,3 +260,53 @@ fn collect_related_paths( paths.sort(); paths } + +#[cfg(test)] +mod tests { + use llm_base::TokenUtf8Buffer; + + use super::*; + + #[test] + fn test_collect_related_paths() { + let main_path = PathBuf::from("/models/llama.bin"); + let directory_paths = [ + "/models/llama.bin", + "/models/llama.bin.1", + "/models/llama.bin.2", + "/models/llama.bin.tmp", + ] + .map(PathBuf::from); + let expected_paths = [ + "/models/llama.bin", + "/models/llama.bin.1", + "/models/llama.bin.2", + ] + .map(PathBuf::from); + + let output_paths = collect_related_paths(&main_path, directory_paths.into_iter()); + assert_eq!(expected_paths.as_slice(), output_paths); + } + + #[test] + fn test_valid_utf8() { + let mut buffer = TokenUtf8Buffer::new(); + assert_eq!(buffer.push(b"hello").as_deref(), Some("hello")); + assert_eq!(buffer.push(&[0xE2, 0x82, 0xAC]).as_deref(), Some("€")); + } + + #[test] + fn test_partial_utf8() { + let mut buffer = TokenUtf8Buffer::new(); + assert_eq!(buffer.push(&[0xE2, 0x82]).as_deref(), None); + assert_eq!(buffer.push(&[0xAC]).as_deref(), Some("€")); + } + + #[test] + fn test_invalid_prelude_for_valid_utf8() { + let mut buffer = TokenUtf8Buffer::new(); + assert_eq!(buffer.push(&[0xD8]).as_deref(), None); + assert_eq!(buffer.push(&[0xE2, 0x82]).as_deref(), None); + assert_eq!(buffer.push(&[0xAC]).as_deref(), Some("€")); + } +} diff --git a/llama/Cargo.toml b/llama/Cargo.toml index 3eef6510..da26fa1b 100644 --- a/llama/Cargo.toml +++ b/llama/Cargo.toml @@ -19,3 +19,6 @@ rust_tokenizers = { version = "3.1.2", optional = true } [features] convert = ["dep:serde_json", "dep:protobuf", "dep:rust_tokenizers"] + +[dev-dependencies] +rand = { workspace = true } diff --git a/llama/examples/llama_inference.rs b/llama/examples/llama_inference.rs new file mode 100644 index 00000000..b1945a9a --- /dev/null +++ b/llama/examples/llama_inference.rs @@ -0,0 +1,34 @@ +use std::{convert::Infallible, env::args, io::Write}; + +use ggml::loader::{load_progress, LoadError}; +use llm_base::snapshot; + +extern crate llama; + +fn main() -> Result<(), LoadError> { + let args: Vec = args().collect(); + let bloom = llama::Llama::load(&args[1], 32, load_progress)?; + let (mut session, _) = snapshot::read_or_create_session( + &bloom, + Default::default(), + Default::default(), + Default::default(), + ); + + let _ = session.inference_with_prompt::( + &bloom, + &Default::default(), + "The best kind of wine is ", + Some(32), + &mut rand::thread_rng(), + |t| { + print!("{t}"); + std::io::stdout().flush().unwrap(); + + Ok(()) + }, + ); + + println!(); + Ok(()) +} From 4ba7c1c8f9626218bc3422498d07e36fc3794bd9 Mon Sep 17 00:00:00 2001 From: Dan Forbes Date: Sun, 16 Apr 2023 13:19:03 -0700 Subject: [PATCH 11/35] Layers are private --- bloom/src/lib.rs | 2 +- llama/src/lib.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bloom/src/lib.rs b/bloom/src/lib.rs index 
b44241ec..1e43d158 100644 --- a/bloom/src/lib.rs +++ b/bloom/src/lib.rs @@ -498,7 +498,7 @@ pub struct Hyperparameters { pub f16_: u32, } -pub struct Layer { +struct Layer { pub attention_norm: ggml::Tensor, pub attention_norm_b: ggml::Tensor, pub wo: ggml::Tensor, diff --git a/llama/src/lib.rs b/llama/src/lib.rs index 5824eb53..67508bc1 100644 --- a/llama/src/lib.rs +++ b/llama/src/lib.rs @@ -459,7 +459,7 @@ pub struct Hyperparameters { pub f16_: u32, } -pub struct Layer { +struct Layer { attention_norm: ggml::Tensor, wq: ggml::Tensor, From 440bd69f3c8e685f4ef946d8e9b2af4fb7144e20 Mon Sep 17 00:00:00 2001 From: Philpax Date: Wed, 26 Apr 2023 00:47:11 +0200 Subject: [PATCH 12/35] Fix build --- ggml-format/src/loader.rs | 4 ++-- ggml-format/src/saver.rs | 2 +- ggml/src/lib.rs | 2 ++ llama/src/loader.rs | 8 ++++---- llm-base/src/loader_common.rs | 2 +- 5 files changed, 10 insertions(+), 8 deletions(-) diff --git a/ggml-format/src/loader.rs b/ggml-format/src/loader.rs index ffc99c9b..eaec757a 100644 --- a/ggml-format/src/loader.rs +++ b/ggml-format/src/loader.rs @@ -35,7 +35,7 @@ pub enum LoadError { /// The name of the tensor. tensor_name: String, /// The format type that was encountered. - ftype: i32, + ftype: u32, }, #[error("invariant broken: {0}")] /// An invariant was broken. @@ -180,7 +180,7 @@ fn load_weights( // load tensor header let n_dims: usize = read_i32(reader)?.try_into()?; let name_len = read_i32(reader)?; - let ftype = read_i32(reader)?; + let ftype = read_u32(reader)?; let mut n_elements: usize = 1; let mut dims = [1usize, 1]; diff --git a/ggml-format/src/saver.rs b/ggml-format/src/saver.rs index 565032a3..e098b51c 100644 --- a/ggml-format/src/saver.rs +++ b/ggml-format/src/saver.rs @@ -97,7 +97,7 @@ pub fn save_model( // Write tensor header util::write_i32(writer, n_dims.try_into()?)?; util::write_i32(writer, name.len().try_into()?)?; - util::write_i32(writer, element_type.into())?; + util::write_u32(writer, element_type.into())?; for &dim in &dims[0..n_dims] { util::write_i32(writer, dim.try_into()?)?; } diff --git a/ggml/src/lib.rs b/ggml/src/lib.rs index aa40153d..d84abc86 100644 --- a/ggml/src/lib.rs +++ b/ggml/src/lib.rs @@ -14,6 +14,8 @@ use std::{ sync::{Arc, Weak}, }; +pub use ggml_sys; + /// Magic constant for `ggml` files (versioned, ggmf). pub const FILE_MAGIC_GGMF: u32 = 0x67676d66; /// Magic constant for `ggml` files (versioned, ggjt). diff --git a/llama/src/loader.rs b/llama/src/loader.rs index 47bd7ebe..ca52af75 100644 --- a/llama/src/loader.rs +++ b/llama/src/loader.rs @@ -247,7 +247,7 @@ fn load_weights_ggmf_or_unversioned( let n_dims = usize::try_from(read_i32(&mut part_reader)?)?; let length = read_i32(&mut part_reader)?; - let ftype = read_i32(&mut part_reader)?; + let ftype = read_u32(&mut part_reader)?; let TensorHeaderGgmf { nelements, @@ -369,7 +369,7 @@ fn load_tensor_header_ggmf<'a>( tensors: &'a mut HashMap, path: &Path, n_parts: usize, - ftype: i32, + ftype: u32, ) -> Result, LoadError> { let mut nelements = 1; let mut ne = [1i64, 1i64]; @@ -453,7 +453,7 @@ fn load_tensor_header_ggmf<'a>( }) } -fn tensor_type_size(ftype: i32, ne: [i64; 2]) -> Option { +fn tensor_type_size(ftype: u32, ne: [i64; 2]) -> Option { let ftype = ggml::Type::try_from(ftype).ok()?; match ftype { ElementType::Q4_0 | ElementType::Q4_1 => { @@ -488,7 +488,7 @@ fn load_weights_ggjt( let n_dims = read_i32(reader)? 
as usize; let length = read_i32(reader)?; - let ftype = read_i32(reader)?; + let ftype = read_u32(reader)?; let mut nelements: usize = 1; let mut ne = [1i64, 1]; diff --git a/llm-base/src/loader_common.rs b/llm-base/src/loader_common.rs index 7c7feb55..81a38da9 100644 --- a/llm-base/src/loader_common.rs +++ b/llm-base/src/loader_common.rs @@ -192,7 +192,7 @@ pub enum LoadError { /// The name of the tensor. tensor_name: String, /// The format type that was encountered. - ftype: i32, + ftype: u32, /// The path that failed. path: PathBuf, }, From 565848451db955e01ed92ad55edf37e8651fbd27 Mon Sep 17 00:00:00 2001 From: Philpax Date: Wed, 26 Apr 2023 01:00:28 +0200 Subject: [PATCH 13/35] refactor: introduce llm(-cli) --- Cargo.lock | 50 ++-- Cargo.toml | 13 +- bloom-cli/Cargo.toml | 21 -- bloom-cli/src/cli_args.rs | 331 ------------------------- bloom-cli/src/main.rs | 197 --------------- generate-ggml-bindings/Cargo.toml | 2 +- ggml-format/Cargo.toml | 2 +- {llama-cli => llm-cli}/Cargo.toml | 10 +- {llama-cli => llm-cli}/src/cli_args.rs | 19 +- {llama-cli => llm-cli}/src/main.rs | 6 +- llm/Cargo.toml | 14 ++ llm/src/lib.rs | 11 + 12 files changed, 71 insertions(+), 605 deletions(-) delete mode 100644 bloom-cli/Cargo.toml delete mode 100644 bloom-cli/src/cli_args.rs delete mode 100644 bloom-cli/src/main.rs rename {llama-cli => llm-cli}/Cargo.toml (62%) rename {llama-cli => llm-cli}/src/cli_args.rs (97%) rename {llama-cli => llm-cli}/src/main.rs (98%) create mode 100644 llm/Cargo.toml create mode 100644 llm/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index b96955c2..eee9259f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -140,23 +140,6 @@ dependencies = [ "rand", ] -[[package]] -name = "bloom-cli" -version = "0.1.0" -dependencies = [ - "bloom", - "clap", - "env_logger", - "ggml", - "llm-base", - "log", - "num_cpus", - "once_cell", - "rand", - "rustyline", - "spinners", -] - [[package]] name = "bytemuck" version = "1.13.1" @@ -657,23 +640,12 @@ dependencies = [ ] [[package]] -name = "llama-cli" +name = "llm" version = "0.1.0" dependencies = [ - "bincode", - "clap", - "color-eyre", - "env_logger", - "ggml", + "bloom", "llama", "llm-base", - "log", - "num_cpus", - "once_cell", - "rand", - "rustyline", - "spinners", - "zstd", ] [[package]] @@ -693,6 +665,24 @@ dependencies = [ "zstd", ] +[[package]] +name = "llm-cli" +version = "0.1.0" +dependencies = [ + "bincode", + "clap", + "color-eyre", + "env_logger", + "llm", + "log", + "num_cpus", + "once_cell", + "rand", + "rustyline", + "spinners", + "zstd", +] + [[package]] name = "log" version = "0.4.17" diff --git a/Cargo.toml b/Cargo.toml index 62371cd4..98b57a1b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,13 +1,16 @@ [workspace] members = [ - "bloom", - "bloom-cli", + # Crates "ggml-sys", - "ggml", - "llama", "ggml-format", - "llama-cli", + "ggml", "llm-base", + "llama", + "bloom", + "llm", + "llm-cli", + + # Tools "generate-ggml-bindings" ] resolver = "2" diff --git a/bloom-cli/Cargo.toml b/bloom-cli/Cargo.toml deleted file mode 100644 index 7cfc3108..00000000 --- a/bloom-cli/Cargo.toml +++ /dev/null @@ -1,21 +0,0 @@ -[package] -name = "bloom-cli" -version = { workspace = true } -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -bloom = { path = "../bloom" } -ggml = { path = "../ggml" } -llm-base = { path = "../llm-base" } - -log = { workspace = true } -rand = { workspace = true } - -clap = { version = "4.1.8", features = ["derive"] } -env_logger = "0.10.0" 
-num_cpus = "1.15.0" -once_cell = "1.17.1" -rustyline = "11.0.0" -spinners = "4.1.0" diff --git a/bloom-cli/src/cli_args.rs b/bloom-cli/src/cli_args.rs deleted file mode 100644 index b166c2d1..00000000 --- a/bloom-cli/src/cli_args.rs +++ /dev/null @@ -1,331 +0,0 @@ -use std::path::PathBuf; - -use clap::{Parser, ValueEnum}; -use rand::SeedableRng; - -use llm_base::{ - InferenceParameters, InferenceSessionParameters, ModelKVMemoryType, TokenBias, EOT_TOKEN_ID, -}; - -#[derive(Parser, Debug)] -#[command(author, version, about, long_about = None)] -pub enum Args { - #[command()] - /// Use a model to infer the next tokens in a sequence, and exit - Infer(Box), - - #[command()] - /// Dumps the prompt to console and exits, first as a comma-separated list of token IDs - /// and then as a list of comma-separated string keys and token ID values. - DumpTokens(Box), - - #[command()] - /// Use a model to interactively prompt it multiple times, while - /// resetting the context between invocations - Repl(Box), - - #[command()] - /// Use a model to interactively generate tokens, and chat with it - /// - /// Note that most, if not all, existing models are not trained for this - /// and do not support a long enough context window to be able to - /// have an extended conversation. - ChatExperimental(Box), -} - -#[derive(Parser, Debug)] -pub struct Infer { - #[command(flatten)] - pub model_load: ModelLoad, - - #[command(flatten)] - pub prompt_file: PromptFile, - - #[command(flatten)] - pub generate: Generate, - - /// The prompt to feed the generator. - /// - /// If used with `--prompt-file`/`-f`, the prompt from the file will be used - /// and `{{PROMPT}}` will be replaced with the value of `--prompt`/`-p`. - #[arg(long, short = 'p', default_value = None)] - pub prompt: Option, - - /// Saves an inference session at the given path. The same session can then be - /// loaded from disk using `--load-session`. - /// - /// Use this with `-n 0` to save just the prompt - #[arg(long, default_value = None)] - pub save_session: Option, - - /// Loads an inference session from the given path if present, and then saves - /// the result to the same path after inference is completed. - /// - /// Equivalent to `--load-session` and `--save-session` with the same path, - /// but will not error if the path does not exist - #[arg(long, default_value = None)] - pub persist_session: Option, -} - -#[derive(Parser, Debug)] -pub struct DumpTokens { - #[command(flatten)] - pub model_load: ModelLoad, - - #[command(flatten)] - pub prompt_file: PromptFile, - - /// The prompt to feed the generator. - /// - /// If used with `--prompt-file`/`-f`, the prompt from the file will be used - /// and `{{PROMPT}}` will be replaced with the value of `--prompt`/`-p`. - #[arg(long, short = 'p', default_value = None)] - pub prompt: Option, -} - -#[derive(Parser, Debug)] -pub struct Repl { - #[command(flatten)] - pub model_load: ModelLoad, - - #[command(flatten)] - pub prompt_file: PromptFile, - - #[command(flatten)] - pub generate: Generate, -} - -#[derive(Parser, Debug)] -pub struct Generate { - /// Sets the number of threads to use - #[arg(long, short = 't')] - pub num_threads: Option, - - /// Sets how many tokens to predict - #[arg(long, short = 'n')] - pub num_predict: Option, - - /// How many tokens from the prompt at a time to feed the network. Does not - /// affect generation. - #[arg(long, default_value_t = 8)] - pub batch_size: usize, - - /// Size of the 'last N' buffer that is used for the `repeat_penalty` - /// option. In tokens. 
- #[arg(long, default_value_t = 64)] - pub repeat_last_n: usize, - - /// The penalty for repeating tokens. Higher values make the generation less - /// likely to get into a loop, but may harm results when repetitive outputs - /// are desired. - #[arg(long, default_value_t = 1.30)] - pub repeat_penalty: f32, - - /// Temperature - #[arg(long, default_value_t = 0.80)] - pub temperature: f32, - - /// Top-K: The top K words by score are kept during sampling. - #[arg(long, default_value_t = 40)] - pub top_k: usize, - - /// Top-p: The cumulative probability after which no more words are kept - /// for sampling. - #[arg(long, default_value_t = 0.95)] - pub top_p: f32, - - /// Loads a saved inference session from the given path, previously saved using - /// `--save-session` - #[arg(long, default_value = None)] - pub load_session: Option, - - /// Specifies the seed to use during sampling. Note that, depending on - /// hardware, the same seed may lead to different results on two separate - /// machines. - #[arg(long, default_value = None)] - pub seed: Option, - - /// Use 16-bit floats for model memory key and value. Ignored when restoring - /// from the cache. - #[arg(long, default_value_t = false)] - pub float16: bool, - - /// A comma separated list of token biases. The list should be in the format - /// "TID=BIAS,TID=BIAS" where TID is an integer token ID and BIAS is a - /// floating point number. - /// For example, "1=-1.0,2=-1.0" sets the bias for token IDs 1 - /// (start of document) and 2 (end of document) to -1.0 which effectively - /// disables the model from generating responses containing those token IDs. - #[arg(long, default_value = None, value_parser = parse_bias)] - pub token_bias: Option, - - /// Prevent the end of stream (EOS/EOD) token from being generated. This will allow the - /// model to generate text until it runs out of context space. Note: The --token-bias - /// option will override this if specified. 
- #[arg(long, default_value_t = false)] - pub ignore_eos: bool, -} -impl Generate { - #[cfg(all(target_os = "macos", target_arch = "aarch64"))] - pub fn autodetect_num_threads(&self) -> usize { - std::process::Command::new("sysctl") - .arg("-n") - .arg("hw.perflevel0.physicalcpu") - .output() - .ok() - .and_then(|output| String::from_utf8(output.stdout).ok()?.trim().parse().ok()) - .unwrap_or(num_cpus::get_physical()) - } - - #[cfg(not(all(target_os = "macos", target_arch = "aarch64")))] - pub fn autodetect_num_threads(&self) -> usize { - num_cpus::get_physical() - } - - pub fn num_threads(&self) -> usize { - self.num_threads - .unwrap_or_else(|| self.autodetect_num_threads()) - } - - pub fn inference_session_parameters(&self) -> InferenceSessionParameters { - let mem_typ = if self.float16 { - ModelKVMemoryType::Float16 - } else { - ModelKVMemoryType::Float32 - }; - InferenceSessionParameters { - memory_k_type: mem_typ, - memory_v_type: mem_typ, - repetition_penalty_last_n: self.repeat_last_n, - } - } - - pub fn rng(&self) -> rand::rngs::StdRng { - if let Some(seed) = self.seed { - rand::rngs::StdRng::seed_from_u64(seed) - } else { - rand::rngs::StdRng::from_entropy() - } - } - - pub fn inference_parameters(&self, session_loaded: bool) -> InferenceParameters { - InferenceParameters { - n_threads: self.num_threads(), - n_batch: self.batch_size, - top_k: self.top_k, - top_p: self.top_p, - repeat_penalty: self.repeat_penalty, - temperature: self.temperature, - bias_tokens: self.token_bias.clone().unwrap_or_else(|| { - if self.ignore_eos { - TokenBias::new(vec![(EOT_TOKEN_ID, -1.0)]) - } else { - TokenBias::default() - } - }), - play_back_previous_tokens: session_loaded, - } - } -} -fn parse_bias(s: &str) -> Result { - s.parse() -} - -#[derive(Parser, Debug)] -pub struct ModelLoad { - /// Where to load the model path from - #[arg(long, short = 'm')] - pub model_path: String, - - /// Sets the size of the context (in tokens). Allows feeding longer prompts. - /// Note that this affects memory. - /// - /// LLaMA models are trained with a context size of 2048 tokens. If you - /// want to use a larger context size, you will need to retrain the model, - /// or use a model that was trained with a larger context size. - /// - /// Alternate methods to extend the context, including - /// [context clearing](https://github.com/rustformers/llama-rs/issues/77) are - /// being investigated, but are not yet implemented. Additionally, these - /// will likely not perform as well as a model with a larger context size. - #[arg(long, default_value_t = 2048)] - pub num_ctx_tokens: usize, -} -impl ModelLoad { - pub fn load(&self) -> bloom::Bloom { - // let model = bloom::Bloom::load(&self.model_path, self.num_ctx_tokens, |_| {}) - // .expect("Could not load model"); - - log::info!("Model fully loaded!"); - - todo!() - } -} - -#[derive(Parser, Debug)] -pub struct PromptFile { - /// A file to read the prompt from. - #[arg(long, short = 'f', default_value = None)] - pub prompt_file: Option, -} -impl PromptFile { - pub fn contents(&self) -> Option { - match &self.prompt_file { - Some(path) => { - match std::fs::read_to_string(path) { - Ok(mut prompt) => { - // Strip off the last character if it's exactly newline. Also strip off a single - // carriage return if it's there. Since String must be valid UTF-8 it should be - // guaranteed that looking at the string as bytes here is safe: UTF-8 non-ASCII - // bytes will always the high bit set. 
- if matches!(prompt.as_bytes().last(), Some(b'\n')) { - prompt.pop(); - } - if matches!(prompt.as_bytes().last(), Some(b'\r')) { - prompt.pop(); - } - Some(prompt) - } - Err(err) => { - log::error!("Could not read prompt file at {path}. Error {err}"); - std::process::exit(1); - } - } - } - _ => None, - } - } -} - -#[derive(Parser, Debug)] -pub struct Convert { - /// Path to model directory - #[arg(long, short = 'd')] - pub directory: PathBuf, - - /// File type to convert to - #[arg(long, short = 't', value_enum, default_value_t = ElementType::Q4_0)] - pub element_type: ElementType, -} - -#[derive(Parser, Debug, ValueEnum, Clone, Copy)] -pub enum ElementType { - /// Quantized 4-bit (type 0). - Q4_0, - /// Quantized 4-bit (type 1); used by GPTQ. - Q4_1, - /// Float 16-bit. - F16, - /// Float 32-bit. - F32, -} -impl From for llm_base::ElementType { - fn from(model_type: ElementType) -> Self { - match model_type { - ElementType::Q4_0 => llm_base::ElementType::Q4_0, - ElementType::Q4_1 => llm_base::ElementType::Q4_1, - ElementType::F16 => llm_base::ElementType::F16, - ElementType::F32 => llm_base::ElementType::F32, - } - } -} diff --git a/bloom-cli/src/main.rs b/bloom-cli/src/main.rs deleted file mode 100644 index f91207ff..00000000 --- a/bloom-cli/src/main.rs +++ /dev/null @@ -1,197 +0,0 @@ -use std::{convert::Infallible, io::Write}; - -use clap::Parser; -use cli_args::Args; -use rustyline::error::ReadlineError; - -use llm_base::{snapshot, InferenceError, Model}; - -mod cli_args; - -fn main() { - env_logger::builder() - .filter_level(log::LevelFilter::Info) - .parse_default_env() - .init(); - - let cli_args = Args::parse(); - match cli_args { - Args::Infer(args) => infer(&args), - Args::DumpTokens(args) => dump_tokens(&args), - Args::Repl(args) => interactive(&args, false), - Args::ChatExperimental(args) => interactive(&args, true), - } -} - -fn infer(args: &cli_args::Infer) { - let prompt = load_prompt_file_with_prompt(&args.prompt_file, args.prompt.as_deref()); - let inference_session_params = args.generate.inference_session_parameters(); - let model = args.model_load.load(); - let (mut session, session_loaded) = snapshot::read_or_create_session( - &model, - args.persist_session.as_deref(), - args.generate.load_session.as_deref(), - inference_session_params, - ); - let inference_params = args.generate.inference_parameters(session_loaded); - - let mut rng = args.generate.rng(); - let res = session.inference_with_prompt::( - &model, - &inference_params, - &prompt, - args.generate.num_predict, - &mut rng, - |t| { - print!("{t}"); - std::io::stdout().flush().unwrap(); - - Ok(()) - }, - ); - println!(); - - match res { - Ok(_) => (), - Err(InferenceError::ContextFull) => { - log::warn!("Context window full, stopping inference.") - } - Err(InferenceError::TokenizationFailed) => { - log::error!("Failed to tokenize initial prompt."); - } - Err(InferenceError::UserCallback(_)) | Err(InferenceError::EndOfText) => { - unreachable!("cannot fail") - } - } - - if let Some(session_path) = args.save_session.as_ref().or(args.persist_session.as_ref()) { - // Write the memory to the cache file - snapshot::write_session(session, session_path); - } -} - -fn dump_tokens(args: &cli_args::DumpTokens) { - let prompt = load_prompt_file_with_prompt(&args.prompt_file, args.prompt.as_deref()); - let model = args.model_load.load(); - let toks = match model.vocabulary().tokenize(&prompt, false) { - Ok(toks) => toks, - Err(e) => { - log::error!("Could not tokenize prompt: {e}"); - std::process::exit(1); - } - }; - 
log::info!("=== Dumping prompt tokens:"); - log::info!( - "{}", - toks.iter() - .map(|(_, tid)| tid.to_string()) - .collect::>() - .join(", ") - ); - log::info!( - "{}", - toks.iter() - .map(|(s, tid)| format!("{s:?}:{tid}")) - .collect::>() - .join(", ") - ); -} - -fn interactive( - args: &cli_args::Repl, - // If set to false, the session will be cloned after each inference - // to ensure that previous state is not carried over. - chat_mode: bool, -) { - let prompt_file = args.prompt_file.contents(); - let inference_session_params = args.generate.inference_session_parameters(); - let model = args.model_load.load(); - let (mut session, session_loaded) = snapshot::read_or_create_session( - &model, - None, - args.generate.load_session.as_deref(), - inference_session_params, - ); - let inference_params = args.generate.inference_parameters(session_loaded); - - let mut rng = args.generate.rng(); - let mut rl = rustyline::DefaultEditor::new().unwrap(); - loop { - let readline = rl.readline(">> "); - match readline { - Ok(line) => { - let session_backup = if chat_mode { - None - } else { - Some(session.clone()) - }; - - let prompt = prompt_file - .as_deref() - .map(|pf| process_prompt(pf, &line)) - .unwrap_or(line); - - let mut sp = spinners::Spinner::new(spinners::Spinners::Dots2, "".to_string()); - if let Err(InferenceError::ContextFull) = session.feed_prompt::( - &model, - &inference_params, - &prompt, - |_| Ok(()), - ) { - log::error!("Prompt exceeds context window length.") - }; - sp.stop(); - - let res = session.inference_with_prompt::( - &model, - &inference_params, - "", - args.generate.num_predict, - &mut rng, - |tk| { - print!("{tk}"); - std::io::stdout().flush().unwrap(); - Ok(()) - }, - ); - println!(); - - if let Err(InferenceError::ContextFull) = res { - log::error!("Reply exceeds context window length"); - } - - if let Some(session_backup) = session_backup { - session = session_backup; - } - } - Err(ReadlineError::Eof) | Err(ReadlineError::Interrupted) => { - break; - } - Err(err) => { - log::error!("{err}"); - } - } - } -} - -fn load_prompt_file_with_prompt( - prompt_file: &cli_args::PromptFile, - prompt: Option<&str>, -) -> String { - if let Some(prompt_file) = prompt_file.contents() { - if let Some(prompt) = prompt { - process_prompt(&prompt_file, prompt) - } else { - prompt_file - } - } else if let Some(prompt) = prompt { - prompt.to_owned() - } else { - log::error!("No prompt or prompt file was provided. 
See --help"); - std::process::exit(1); - } -} - -fn process_prompt(raw_prompt: &str, prompt: &str) -> String { - raw_prompt.replace("{{PROMPT}}", prompt) -} diff --git a/generate-ggml-bindings/Cargo.toml b/generate-ggml-bindings/Cargo.toml index 6f75538e..fabb70e0 100644 --- a/generate-ggml-bindings/Cargo.toml +++ b/generate-ggml-bindings/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "generate-ggml-bindings" -version = "0.1.0" +version = { workspace = true } edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/ggml-format/Cargo.toml b/ggml-format/Cargo.toml index 91daca22..8627e559 100644 --- a/ggml-format/Cargo.toml +++ b/ggml-format/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ggml-format" -version = "0.1.0" +version = { workspace = true } edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/llama-cli/Cargo.toml b/llm-cli/Cargo.toml similarity index 62% rename from llama-cli/Cargo.toml rename to llm-cli/Cargo.toml index de511dae..a552c364 100644 --- a/llama-cli/Cargo.toml +++ b/llm-cli/Cargo.toml @@ -1,14 +1,14 @@ [package] edition = "2021" -name = "llama-cli" +name = "llm-cli" version = {workspace = true} -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[[bin]] +name = "llm" +path = "src/main.rs" [dependencies] -ggml = { path = "../ggml" } -llama = { path = "../llama", features = ["convert", "quantize"] } -llm-base = { path = "../llm-base" } +llm = { path = "../llm" } log = { workspace = true } rand = {workspace = true} diff --git a/llama-cli/src/cli_args.rs b/llm-cli/src/cli_args.rs similarity index 97% rename from llama-cli/src/cli_args.rs rename to llm-cli/src/cli_args.rs index 334e0037..bddb4be4 100644 --- a/llama-cli/src/cli_args.rs +++ b/llm-cli/src/cli_args.rs @@ -2,12 +2,11 @@ use std::path::PathBuf; use clap::{Parser, ValueEnum}; use color_eyre::eyre::{Result, WrapErr}; -use rand::SeedableRng; - -use llm_base::{ - InferenceParameters, InferenceSessionParameters, LoadProgress, ModelKVMemoryType, TokenBias, - EOT_TOKEN_ID, +use llm::{ + llama, InferenceParameters, InferenceSessionParameters, LoadProgress, ModelKVMemoryType, + TokenBias, EOT_TOKEN_ID, }; +use rand::SeedableRng; #[derive(Parser, Debug)] #[command(author, version, about, long_about = None)] @@ -389,13 +388,13 @@ pub enum FileType { /// Float 32-bit. 
F32, } -impl From for llm_base::FileType { +impl From for llm::FileType { fn from(t: FileType) -> Self { match t { - FileType::Q4_0 => llm_base::FileType::MostlyQ4_0, - FileType::Q4_1 => llm_base::FileType::MostlyQ4_1, - FileType::F16 => llm_base::FileType::MostlyF16, - FileType::F32 => llm_base::FileType::F32, + FileType::Q4_0 => llm::FileType::MostlyQ4_0, + FileType::Q4_1 => llm::FileType::MostlyQ4_1, + FileType::F16 => llm::FileType::MostlyF16, + FileType::F32 => llm::FileType::F32, } } } diff --git a/llama-cli/src/main.rs b/llm-cli/src/main.rs similarity index 98% rename from llama-cli/src/main.rs rename to llm-cli/src/main.rs index 2b085da1..84f716a5 100644 --- a/llama-cli/src/main.rs +++ b/llm-cli/src/main.rs @@ -3,11 +3,9 @@ use std::{convert::Infallible, io::Write}; use clap::Parser; use cli_args::Args; use color_eyre::eyre::{Context, Result}; -use llama::convert::convert_pth_to_ggml; +use llm::{llama::convert::convert_pth_to_ggml, snapshot, InferenceError, Model}; use rustyline::error::ReadlineError; -use llm_base::{snapshot, InferenceError, Model}; - mod cli_args; fn main() -> Result<()> { @@ -188,7 +186,7 @@ fn interactive( } fn quantize(args: &cli_args::Quantize) -> Result<()> { - use llama::quantize::{quantize, QuantizeProgress::*}; + use llm::llama::quantize::{quantize, QuantizeProgress::*}; quantize( &args.source, &args.destination, diff --git a/llm/Cargo.toml b/llm/Cargo.toml new file mode 100644 index 00000000..080de0ed --- /dev/null +++ b/llm/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "llm" +version = { workspace = true } +edition = "2021" + +[dependencies] +llm-base = { path = "../llm-base" } +llama = { path = "../llama", features = ["convert", "quantize"], optional = true } +bloom = { path = "../bloom", optional = true } + +[features] +default = ["llama", "bloom"] +llama = ["dep:llama"] +bloom = ["dep:bloom"] \ No newline at end of file diff --git a/llm/src/lib.rs b/llm/src/lib.rs new file mode 100644 index 00000000..3653e712 --- /dev/null +++ b/llm/src/lib.rs @@ -0,0 +1,11 @@ +pub use llm_base::{ + snapshot, FileType, InferenceError, InferenceParameters, InferenceSession, + InferenceSessionParameters, InferenceSnapshot, LoadError, LoadProgress, Model, + ModelKVMemoryType, SnapshotError, TokenBias, TokenId, TokenUtf8Buffer, Vocabulary, + EOT_TOKEN_ID, +}; + +#[cfg(feature = "bloom")] +pub use bloom; +#[cfg(feature = "llama")] +pub use llama; From bcf5627dafce0918db7870310cfa2c6758f168e6 Mon Sep 17 00:00:00 2001 From: Dan Forbes Date: Tue, 25 Apr 2023 17:19:05 -0700 Subject: [PATCH 14/35] Fix model name in LLaMA inference example --- llama/examples/llama_inference.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama/examples/llama_inference.rs b/llama/examples/llama_inference.rs index e2928bfe..b644f1f2 100644 --- a/llama/examples/llama_inference.rs +++ b/llama/examples/llama_inference.rs @@ -6,16 +6,16 @@ extern crate llama; fn main() -> Result<(), LoadError> { let args: Vec = args().collect(); - let bloom = llama::Llama::load(&args[1], true, 32, |_| {})?; + let llama = llama::Llama::load(&args[1], true, 32, |_| {})?; let (mut session, _) = snapshot::read_or_create_session( - &bloom, + &llama, Default::default(), Default::default(), Default::default(), ); let _ = session.inference_with_prompt::( - &bloom, + &llama, &Default::default(), "The best kind of wine is ", Some(32), From 5ac4b79522666a023b19c7abb00cd3a1dd78020b Mon Sep 17 00:00:00 2001 From: Philpax Date: Wed, 26 Apr 2023 03:02:10 +0200 Subject: [PATCH 15/35] feat: wire up both 
bloom/llama to CLI --- Cargo.lock | 3 +- bloom/src/lib.rs | 257 +++++++-------- ggml/src/loader.rs | 312 ------------------- llama/Cargo.toml | 6 +- llama/src/convert.rs | 1 - llama/src/lib.rs | 190 +++-------- llama/src/loader2.rs | 234 -------------- llama/src/{loader.rs => old_loader.rs} | 120 +++---- llama/src/quantize.rs | 27 +- llm-base/Cargo.toml | 1 + llm-base/src/inference_session.rs | 10 +- llm-base/src/lib.rs | 8 +- llm-base/src/{loader_common.rs => loader.rs} | 238 +++++++++++++- llm-base/src/model.rs | 77 ++++- llm-base/src/snapshot.rs | 6 +- llm-base/src/util.rs | 1 + llm-cli/src/cli_args.rs | 151 +++++---- llm-cli/src/main.rs | 14 +- llm/src/lib.rs | 10 +- 19 files changed, 677 insertions(+), 989 deletions(-) delete mode 100644 ggml/src/loader.rs delete mode 100644 llama/src/loader2.rs rename llama/src/{loader.rs => old_loader.rs} (85%) rename llm-base/src/{loader_common.rs => loader.rs} (54%) diff --git a/Cargo.lock b/Cargo.lock index eee9259f..9c1dce3a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -624,11 +624,9 @@ name = "llama" version = "0.1.0" dependencies = [ "bytemuck", - "ggml", "ggml-format", "half", "llm-base", - "memmap2", "partial_sort", "protobuf", "rand", @@ -657,6 +655,7 @@ dependencies = [ "ggml", "ggml-format", "log", + "memmap2", "partial_sort", "rand", "serde", diff --git a/bloom/src/lib.rs b/bloom/src/lib.rs index 9c1a46cc..01c53a10 100644 --- a/bloom/src/lib.rs +++ b/bloom/src/lib.rs @@ -1,15 +1,15 @@ -use std::collections::HashMap; - // use ggml_loader::{LoadError, LoadProgress}; use llm_base::{ - EvaluateOutputRequest, InferenceParameters, InferenceSession, InferenceSessionParameters, - Model, TokenId, Vocabulary, + util, EvaluateOutputRequest, FileType, InferenceParameters, InferenceSession, + InferenceSessionParameters, LoadError, Mmap, Model, TokenId, Vocabulary, }; /// The weights for the BLOOM model. All the mutable state is split into a /// separate struct `InferenceSession`. 
pub struct Bloom { - hparams: Hyperparameters, + hyperparameters: Hyperparameters, + n_context_tokens: usize, + vocabulary: Vocabulary, tok_embeddings: ggml::Tensor, norm: ggml::Tensor, @@ -18,21 +18,103 @@ pub struct Bloom { output_norm_b: ggml::Tensor, output: ggml::Tensor, layers: Vec, - _tensors: HashMap, + // Must be kept alive for the model _context: ggml::Context, + _mmap: Option, } impl Model for Bloom { type Hyperparameters = Hyperparameters; + fn new( + hyperparameters: Self::Hyperparameters, + n_context_tokens: usize, + vocabulary: Vocabulary, + tensor_loader: impl llm_base::TensorLoader, + ) -> Result { + let n_embd = hyperparameters.n_embd; + let n_layer = hyperparameters.n_layer; + let n_vocab = hyperparameters.n_vocab; + let n_mult = hyperparameters.n_mult; + let n_ff = ((4 * n_embd + n_mult - 1) / n_mult) * n_mult; + + let mut tl = tensor_loader; + + let tok_embeddings = tl.load("tok_embeddings.weight", &[n_embd, n_vocab])?; + + let norm = tl.load("norm.weight", &[n_embd])?; + let norm_b = tl.load("norm.bias", &[n_embd])?; + + let output_norm = tl.load("output_norm.weight", &[n_embd])?; + let output_norm_b = tl.load("output_norm.bias", &[n_embd])?; + + let output = tl.load("output.weight", &[n_embd, n_vocab])?; + + let mut layers = Vec::new(); + for i in 0..n_layer { + let layer = Layer { + attention_norm: tl.load(&format!("layers.{i}.attention_norm.weight"), &[n_embd])?, + attention_norm_b: tl.load(&format!("layers.{i}.attention_norm.bias"), &[n_embd])?, + + query_key_value: tl.load( + &format!("layers.{i}.attention.query_key_value.weight"), + &[n_embd, 3 * n_embd], + )?, + query_key_value_b: tl.load( + &format!("layers.{i}.attention.query_key_value.bias"), + &[3 * n_embd], + )?, + + wo: tl.load( + &format!("layers.{i}.attention.wo.weight"), + &[n_embd, n_embd], + )?, + wo_b: tl.load(&format!("layers.{i}.attention.wo.bias"), &[n_embd])?, + + ffn_norm: tl.load(&format!("layers.{i}.ffn_norm.weight"), &[n_embd])?, + ffn_norm_b: tl.load(&format!("layers.{i}.ffn_norm.bias"), &[n_embd])?, + + w1: tl.load( + &format!("layers.{i}.feed_forward.w1.weight"), + &[n_embd, n_ff], + )?, + w1_b: tl.load(&format!("layers.{i}.feed_forward.w1.bias"), &[n_ff])?, + w2: tl.load( + &format!("layers.{i}.feed_forward.w2.weight"), + &[n_ff, n_embd], + )?, + w2_b: tl.load(&format!("layers.{i}.feed_forward.w2.bias"), &[n_embd])?, + }; + + layers.push(layer); + } + + let (_context, _, _mmap) = tl.finish(); + + Ok(Bloom { + hyperparameters, + n_context_tokens, + vocabulary, + tok_embeddings, + norm, + norm_b, + output_norm, + output_norm_b, + output, + layers, + _context, + _mmap, + }) + } + fn start_session(&self, params: InferenceSessionParameters) -> InferenceSession { InferenceSession::new( params, - self.hparams.n_ctx, - self.hparams.n_layer, - self.hparams.n_embd, - self.hparams.n_vocab, + self.n_context_tokens, + self.hyperparameters.n_layer, + self.hyperparameters.n_embd, + self.hyperparameters.n_vocab, ) } @@ -49,13 +131,13 @@ impl Model for Bloom { let Hyperparameters { n_vocab, - n_ctx, n_embd, n_mult: _, n_head, n_layer, - f16_: _, - } = self.hparams; + file_type: _, + } = self.hyperparameters; + let n_ctx = self.n_context_tokens; // For the first run, we need to guess a maximum buffer size so we can measure // the actual memory consumption of the temporary ggml context. 
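(Illustrative sketch, not part of the patch: the `TensorLoader`-based constructor above generalizes to any architecture. Everything below — the `TinyModel` type, its tensor names and shapes — is an assumption for illustration, not code from this repository.)

use llm_base::{ggml, LoadError, Mmap, Model, TensorLoader, Vocabulary};

/// A hypothetical two-tensor model, used only to illustrate the loading flow.
struct TinyModel {
    hyperparameters: Hyperparameters, // assumed to expose n_embd / n_vocab
    n_context_tokens: usize,
    vocabulary: Vocabulary,
    tok_embeddings: ggml::Tensor,
    output: ggml::Tensor,
    // Must be kept alive for as long as the tensors are used.
    _context: ggml::Context,
    _mmap: Option<Mmap>,
}

impl Model for TinyModel {
    type Hyperparameters = Hyperparameters;

    fn new(
        hyperparameters: Self::Hyperparameters,
        n_context_tokens: usize,
        vocabulary: Vocabulary,
        tensor_loader: impl TensorLoader<LoadError>,
    ) -> Result<Self, LoadError> {
        let mut tl = tensor_loader;
        let (n_embd, n_vocab) = (hyperparameters.n_embd, hyperparameters.n_vocab);

        // Each tensor is requested by name with its expected shape; the loader
        // checks it against the file and either mmaps or reads the data.
        let tok_embeddings = tl.load("tok_embeddings.weight", &[n_embd, n_vocab])?;
        let output = tl.load("output.weight", &[n_embd, n_vocab])?;

        // `finish` hands back the ggml context (and mmap, if any).
        let (_context, _tensors, _mmap) = tl.finish();

        Ok(Self {
            hyperparameters,
            n_context_tokens,
            vocabulary,
            tok_embeddings,
            output,
            _context,
            _mmap,
        })
    }

    // start_session / evaluate / vocabulary / n_ctx omitted from this sketch.
}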
@@ -346,144 +428,39 @@ impl Model for Bloom { } fn n_ctx(&self) -> usize { - self.hparams.n_ctx - } -} - -impl Bloom { - #[allow(dead_code)] - pub(crate) fn new( - context: ggml::Context, - hparams: Hyperparameters, - vocabulary: Vocabulary, - n_ff: usize, - wtype: ggml::Type, - ) -> Bloom { - let n_embd = hparams.n_embd; - let n_layer = hparams.n_layer; - let n_vocab = hparams.n_vocab; - - let mut tensors = HashMap::new(); - - let tok_embeddings = context.new_tensor_2d(wtype, n_embd, n_vocab); - - let norm = context.new_tensor_1d(ggml::Type::F32, n_embd); - let norm_b = context.new_tensor_1d(ggml::Type::F32, n_embd); - - let output_norm = context.new_tensor_1d(ggml::Type::F32, n_embd); - let output_norm_b = context.new_tensor_1d(ggml::Type::F32, n_embd); - - let output = context.new_tensor_2d(wtype, n_embd, n_vocab); - - tensors.insert("tok_embeddings.weight".to_owned(), tok_embeddings.share()); - - tensors.insert("norm.weight".to_owned(), norm.share()); - tensors.insert("norm.bias".to_owned(), norm_b.share()); - - tensors.insert("output_norm.weight".to_owned(), output_norm.share()); - tensors.insert("output_norm.bias".to_owned(), output_norm_b.share()); - - tensors.insert("output.weight".to_owned(), output.share()); - - let mut layers = Vec::new(); - for i in 0..n_layer { - let layer = Layer { - attention_norm: context.new_tensor_1d(ggml::Type::F32, n_embd), - attention_norm_b: context.new_tensor_1d(ggml::Type::F32, n_embd), - - query_key_value: context.new_tensor_2d(wtype, n_embd, 3 * n_embd), - query_key_value_b: context.new_tensor_1d(ggml::Type::F32, 3 * n_embd), - - wo: context.new_tensor_2d(wtype, n_embd, n_embd), - wo_b: context.new_tensor_1d(ggml::Type::F32, n_embd), - - ffn_norm: context.new_tensor_1d(ggml::Type::F32, n_embd), - ffn_norm_b: context.new_tensor_1d(ggml::Type::F32, n_embd), - - w1: context.new_tensor_2d(wtype, n_embd, n_ff), - w1_b: context.new_tensor_1d(ggml::Type::F32, n_ff), - w2: context.new_tensor_2d(wtype, n_ff, n_embd), - w2_b: context.new_tensor_1d(ggml::Type::F32, n_embd), - }; - - tensors.insert( - format!("layers.{i}.attention_norm.weight"), - layer.attention_norm.share(), - ); - - tensors.insert( - format!("layers.{i}.attention_norm.bias"), - layer.attention_norm_b.share(), - ); - - tensors.insert( - format!("layers.{i}.attention.query_key_value.weight"), - layer.query_key_value.share(), - ); - tensors.insert( - format!("layers.{i}.attention.query_key_value.bias"), - layer.query_key_value_b.share(), - ); - - tensors.insert(format!("layers.{i}.attention.wo.weight"), layer.wo.share()); - tensors.insert(format!("layers.{i}.attention.wo.bias"), layer.wo_b.share()); - - tensors.insert( - format!("layers.{i}.ffn_norm.weight"), - layer.ffn_norm.share(), - ); - tensors.insert( - format!("layers.{i}.ffn_norm.bias"), - layer.ffn_norm_b.share(), - ); - - tensors.insert( - format!("layers.{i}.feed_forward.w1.weight"), - layer.w1.share(), - ); - tensors.insert( - format!("layers.{i}.feed_forward.w1.bias"), - layer.w1_b.share(), - ); - tensors.insert( - format!("layers.{i}.feed_forward.w2.weight"), - layer.w2.share(), - ); - tensors.insert( - format!("layers.{i}.feed_forward.w2.bias"), - layer.w2_b.share(), - ); - - layers.push(layer); - } - - Bloom { - hparams, - vocabulary, - tok_embeddings, - norm, - norm_b, - output_norm, - output_norm_b, - output, - layers, - _tensors: tensors, - _context: context, - } + self.n_context_tokens } } // NOTE: Field order matters! Data is laid out in the file exactly // in this order. 
-#[derive(Debug, Default, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] +#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)] pub struct Hyperparameters { pub n_vocab: usize, - pub n_ctx: usize, pub n_embd: usize, pub n_mult: usize, pub n_head: usize, pub n_layer: usize, - pub f16_: u32, + pub file_type: FileType, +} +impl llm_base::Hyperparameters for Hyperparameters { + fn read(reader: &mut dyn std::io::BufRead) -> Result { + Ok(Hyperparameters { + n_vocab: util::read_i32(reader)?.try_into()?, + n_embd: util::read_i32(reader)?.try_into()?, + n_mult: util::read_i32(reader)?.try_into()?, + n_head: util::read_i32(reader)?.try_into()?, + n_layer: util::read_i32(reader)?.try_into()?, + file_type: { + let ftype = util::read_i32(reader)?; + FileType::try_from(ftype).map_err(|_| LoadError::UnsupportedFileType(ftype))? + }, + }) + } + + fn n_vocabulary(&self) -> usize { + self.n_vocab + } } struct Layer { diff --git a/ggml/src/loader.rs b/ggml/src/loader.rs deleted file mode 100644 index 21dd8973..00000000 --- a/ggml/src/loader.rs +++ /dev/null @@ -1,312 +0,0 @@ -use std::{ - fmt::Debug, - io::BufRead, - path::{Path, PathBuf}, -}; - -use thiserror::Error; - -/// Each variant represents a step within the process of loading the model. -/// These can be used to report progress to the user. -#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug)] -pub enum LoadProgress<'a, H> { - /// The hyperparameters have been loaded from the model. - HyperparametersLoaded(&'a H), - /// The context has been created. - ContextSize { - /// The size of the context. - bytes: usize, - }, - /// A part of the model is being loaded. - PartLoading { - /// The path to the model part. - file: &'a Path, - /// The current part (0-indexed). - current_part: usize, - /// The number of total parts. - total_parts: usize, - }, - /// A tensor from the current part has been loaded. - PartTensorLoaded { - /// The path to the model part. - file: &'a Path, - /// The current tensor (0-indexed). - current_tensor: usize, - /// The number of total tensors. - tensor_count: usize, - }, - /// A model part has finished fully loading. - PartLoaded { - /// The path to the model part. - file: &'a Path, - /// The number of bytes in the part. - byte_size: usize, - /// The number of tensors in the part. - tensor_count: usize, - }, -} - -#[derive(Error, Debug)] -/// Errors encountered during the loading process. -pub enum LoadError { - #[error("could not open file {path:?}")] - /// A file failed to open. - OpenFileFailed { - /// The original error. - source: std::io::Error, - /// The path that failed. - path: PathBuf, - }, - #[error("no parent path for {path:?}")] - /// There is no parent path for a given path. - NoParentPath { - /// The path without a parent. - path: PathBuf, - }, - #[error("unable to read exactly {bytes} bytes")] - /// Reading exactly `bytes` from a file failed. - ReadExactFailed { - /// The original error. - source: std::io::Error, - /// The number of bytes that were attempted to be read. - bytes: usize, - }, - #[error("non-specific I/O error")] - /// A non-specific IO error. - IO(#[from] std::io::Error), - #[error("could not convert bytes to a UTF-8 string")] - /// One of the strings encountered was not valid UTF-8. - InvalidUtf8(#[from] std::string::FromUtf8Error), - #[error("invalid integer conversion")] - /// One of the integers encountered could not be converted to a more appropriate type. 
- InvalidIntegerConversion(#[from] std::num::TryFromIntError), - #[error("invalid magic number for {path:?}")] - /// An invalid magic number was encountered during the loading process. - InvalidMagic { - /// The path that failed. - path: PathBuf, - }, - #[error("invalid file format version {value}")] - /// The version of the format is not supported by this version of `llama-rs`. - InvalidFormatVersion { - /// The version that was encountered. - value: u32, - }, - #[error("invalid value {ftype} for `f16` in hyperparameters")] - /// The `f16` hyperparameter had an invalid value. - HyperparametersF16Invalid { - /// The format type that was encountered. - ftype: u32, - }, - #[error("unknown tensor `{tensor_name}` in {path:?}")] - /// The tensor `tensor_name` was encountered during the loading of `path`, but was not seen during - /// the model prelude. - UnknownTensor { - /// The name of the tensor. - tensor_name: String, - /// The path that failed. - path: PathBuf, - }, - #[error("the tensor `{tensor_name}` has the wrong size in {path:?}")] - /// The tensor `tensor_name` did not match its expected size. - TensorWrongSize { - /// The name of the tensor. - tensor_name: String, - /// The path that failed. - path: PathBuf, - }, - /// The tensor `tensor_name` did not have the expected format type. - #[error("invalid ftype {ftype} for tensor `{tensor_name}` in {path:?}")] - InvalidFtype { - /// The name of the tensor. - tensor_name: String, - /// The format type that was encountered. - ftype: u32, - /// The path that failed. - path: PathBuf, - }, -} - -/// Default load progress callback function -pub fn load_progress(progress: LoadProgress) { - match progress { - LoadProgress::HyperparametersLoaded(hparams) => { - log::debug!("Loaded hyperparameters {hparams:#?}") - } - LoadProgress::ContextSize { bytes } => log::info!( - "ggml ctx size = {:.2} MB\n", - bytes as f64 / (1024.0 * 1024.0) - ), - LoadProgress::PartLoading { - file, - current_part, - total_parts, - } => { - let current_part = current_part + 1; - log::info!( - "Loading model part {}/{} from '{}'\n", - current_part, - total_parts, - file.to_string_lossy(), - ) - } - LoadProgress::PartTensorLoaded { - current_tensor, - tensor_count, - .. 
-        } => {
-            let current_tensor = current_tensor + 1;
-            if current_tensor % 8 == 0 {
-                log::info!("Loaded tensor {current_tensor}/{tensor_count}");
-            }
-        }
-        LoadProgress::PartLoaded {
-            file,
-            byte_size,
-            tensor_count,
-        } => {
-            log::info!("Loading of '{}' complete", file.to_string_lossy());
-            log::info!(
-                "Model size = {:.2} MB / num tensors = {}",
-                byte_size as f64 / 1024.0 / 1024.0,
-                tensor_count
-            );
-        }
-    }
-}
-
-/// Read bytes
-pub fn read_bytes<const N: usize>(reader: &mut impl BufRead) -> Result<[u8; N], LoadError> {
-    let mut bytes = [0u8; N];
-    reader
-        .read_exact(&mut bytes)
-        .map_err(|e| LoadError::ReadExactFailed {
-            source: e,
-            bytes: N,
-        })?;
-    Ok(bytes)
-}
-
-/// Ready bytes with length
-pub fn read_bytes_with_len(reader: &mut impl BufRead, len: usize) -> Result<Vec<u8>, LoadError> {
-    let mut bytes = vec![0u8; len];
-    reader
-        .read_exact(&mut bytes)
-        .map_err(|e| LoadError::ReadExactFailed {
-            source: e,
-            bytes: len,
-        })?;
-    Ok(bytes)
-}
-
-/// Read an i32
-pub fn read_i32(reader: &mut impl BufRead) -> Result<i32, LoadError> {
-    Ok(i32::from_le_bytes(read_bytes::<4>(reader)?))
-}
-
-/// Read a u32
-pub fn read_u32(reader: &mut impl BufRead) -> Result<u32, LoadError> {
-    Ok(u32::from_le_bytes(read_bytes::<4>(reader)?))
-}
-
-/// Read an f32
-pub fn read_f32(reader: &mut impl BufRead) -> Result<f32, LoadError> {
-    Ok(f32::from_le_bytes(read_bytes::<4>(reader)?))
-}
-
-/// Helper function. Reads a string from the buffer and returns it.
-pub fn read_string(reader: &mut impl BufRead, len: usize) -> Result<String, LoadError> {
-    Ok(String::from_utf8(read_bytes_with_len(reader, len)?)?)
-}
-
-/// Find all model files
-pub fn find_all_model_files(main_path: &Path) -> Result<Vec<PathBuf>, LoadError> {
-    Ok(collect_related_paths(
-        main_path,
-        std::fs::read_dir(main_path.parent().ok_or_else(|| LoadError::NoParentPath {
-            path: main_path.to_owned(),
-        })?)?
-        .filter_map(Result::ok)
-        .map(|de| de.path()),
-    ))
-}
-
-fn collect_related_paths(
-    main_path: &Path,
-    directory_paths: impl Iterator<Item = PathBuf>,
-) -> Vec<PathBuf> {
-    let main_filename = main_path.file_name().and_then(|p| p.to_str());
-
-    let mut paths: Vec<PathBuf> = directory_paths
-        .filter(|p| {
-            p.file_name()
-                .and_then(|p| p.to_str())
-                .zip(main_filename)
-                .map(|(part_filename, main_filename)| {
-                    match part_filename.strip_prefix(main_filename) {
-                        Some(suffix) => {
-                            suffix.is_empty()
-                                || (suffix
-                                    .strip_prefix('.')
-                                    .map(|s| s.parse::<usize>().is_ok())
-                                    .unwrap_or(false))
-                        }
-                        None => false,
-                    }
-                })
-                .unwrap_or(false)
-        })
-        .collect();
-    paths.sort();
-    paths
-}
-
-#[cfg(test)]
-mod tests {
-    use llm_base::TokenUtf8Buffer;
-
-    use super::*;
-
-    #[test]
-    fn test_collect_related_paths() {
-        let main_path = PathBuf::from("/models/llama.bin");
-        let directory_paths = [
-            "/models/llama.bin",
-            "/models/llama.bin.1",
-            "/models/llama.bin.2",
-            "/models/llama.bin.tmp",
-        ]
-        .map(PathBuf::from);
-        let expected_paths = [
-            "/models/llama.bin",
-            "/models/llama.bin.1",
-            "/models/llama.bin.2",
-        ]
-        .map(PathBuf::from);
-
-        let output_paths = collect_related_paths(&main_path, directory_paths.into_iter());
-        assert_eq!(expected_paths.as_slice(), output_paths);
-    }
-
-    #[test]
-    fn test_valid_utf8() {
-        let mut buffer = TokenUtf8Buffer::new();
-        assert_eq!(buffer.push(b"hello").as_deref(), Some("hello"));
-        assert_eq!(buffer.push(&[0xE2, 0x82, 0xAC]).as_deref(), Some("€"));
-    }
-
-    #[test]
-    fn test_partial_utf8() {
-        let mut buffer = TokenUtf8Buffer::new();
-        assert_eq!(buffer.push(&[0xE2, 0x82]).as_deref(), None);
-        assert_eq!(buffer.push(&[0xAC]).as_deref(), Some("€"));
-    }
-
-    #[test]
-    fn test_invalid_prelude_for_valid_utf8() {
-        let mut buffer = TokenUtf8Buffer::new();
-        assert_eq!(buffer.push(&[0xD8]).as_deref(), None);
-        assert_eq!(buffer.push(&[0xE2, 0x82]).as_deref(), None);
-        assert_eq!(buffer.push(&[0xAC]).as_deref(), Some("€"));
-    }
-}
diff --git a/llama/Cargo.toml b/llama/Cargo.toml
index 1789223a..13632965 100644
--- a/llama/Cargo.toml
+++ b/llama/Cargo.toml
@@ -6,9 +6,7 @@ edition = "2021"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-ggml = { path = "../ggml" }
 llm-base = { path = "../llm-base" }
-ggml-format = { path = "../ggml-format" }
 
 bytemuck = { workspace = true }
 serde = { workspace = true }
@@ -17,7 +15,6 @@ rand = { workspace = true }
 partial_sort = "0.2.0"
 thiserror = "1.0"
 serde_bytes = "0.11"
-memmap2 = "0.5.10"
 
 # Used for the `convert` feature
 serde_json = { version = "1.0", optional = true }
@@ -26,10 +23,11 @@ rust_tokenizers = { version = "3.1.2", optional = true }
 
 # Used for the `quantize` feature
 half = { version = "2.2.1", optional = true }
+ggml-format = { path = "../ggml-format", optional = true }
 
 [features]
 convert = ["dep:serde_json", "dep:protobuf", "dep:rust_tokenizers"]
-quantize = ["dep:half"]
+quantize = ["dep:half", "dep:ggml-format"]
 
 [dev-dependencies]
 rand = { workspace = true }
diff --git a/llama/src/convert.rs b/llama/src/convert.rs
index de346e94..2acf23af 100644
--- a/llama/src/convert.rs
+++ b/llama/src/convert.rs
@@ -80,7 +80,6 @@ fn load_hyperparameters(path: &Path, file_type: FileType, vocab: &Vocabulary) ->
     let json: HyperParametersJson = serde_json::from_str(&json).expect("Unable to parse json");
     Hyperparameters {
         file_type,
-        n_ctx: 0,
         n_embd: json.dim,
         n_head: json.n_heads,
         n_layer: json.n_layers,
diff --git a/llama/src/lib.rs b/llama/src/lib.rs
index 3cccc23d..2a4ffe4b 100644
---
a/llama/src/lib.rs +++ b/llama/src/lib.rs @@ -1,23 +1,18 @@ -use std::{collections::HashMap, error::Error, path::Path}; +use std::{error::Error, path::Path}; use llm_base::{ - EvaluateOutputRequest, FileType, InferenceParameters, InferenceSession, - InferenceSessionParameters, LoadError, LoadProgress, Model, + util, EvaluateOutputRequest, FileType, InferenceParameters, InferenceSession, + InferenceSessionParameters, LoadError, LoadProgress, Mmap, Model, TensorLoader, }; -use memmap2::Mmap; - #[cfg(feature = "convert")] pub mod convert; #[cfg(feature = "quantize")] pub mod quantize; -mod loader; -mod loader2; +mod old_loader; -pub use ggml::Type as ElementType; -pub use llm_base::util::TokenUtf8Buffer; -pub use llm_base::{TokenBias, TokenId, Vocabulary}; +pub use llm_base::{ggml, util::TokenUtf8Buffer, TokenBias, TokenId, Vocabulary}; /// The weights for the LLaMA model. All the mutable state is split into a /// separate struct `InferenceSession`. @@ -25,7 +20,8 @@ pub use llm_base::{TokenBias, TokenId, Vocabulary}; /// # Safety /// This implements [Send] and [Sync] as it is immutable after construction. pub struct Llama { - pub(crate) hyperparameters: Hyperparameters, + hyperparameters: Hyperparameters, + n_context_tokens: usize, vocabulary: Vocabulary, @@ -36,8 +32,6 @@ pub struct Llama { layers: Vec, - tensors: HashMap, - /// Needs to kept alive while the model is alive _mmap: Option, @@ -48,95 +42,33 @@ unsafe impl Send for Llama {} unsafe impl Sync for Llama {} impl Llama { - pub(crate) fn new_loader1( - context: ggml::Context, - hparams: Hyperparameters, - vocabulary: Vocabulary, - n_ff: usize, - wtype: ggml::Type, - mmap: Option, - ) -> Self { - let n_embd = hparams.n_embd; - let n_layer = hparams.n_layer; - let n_vocab = hparams.n_vocab; - - let mut tensors = HashMap::new(); - - let tok_embeddings = context.new_tensor_2d(wtype, n_embd, n_vocab); - let norm = context.new_tensor_1d(ggml::Type::F32, n_embd); - let output = context.new_tensor_2d(wtype, n_embd, n_vocab); - - tensors.insert("tok_embeddings.weight".to_owned(), tok_embeddings.share()); - tensors.insert("norm.weight".to_owned(), norm.share()); - tensors.insert("output.weight".to_owned(), output.share()); - - let mut layers = Vec::new(); - for i in 0..n_layer { - let layer = Layer { - attention_norm: context.new_tensor_1d(ggml::Type::F32, n_embd), - wq: context.new_tensor_2d(wtype, n_embd, n_embd), - wk: context.new_tensor_2d(wtype, n_embd, n_embd), - wv: context.new_tensor_2d(wtype, n_embd, n_embd), - wo: context.new_tensor_2d(wtype, n_embd, n_embd), - ffn_norm: context.new_tensor_1d(ggml::Type::F32, n_embd), - w1: context.new_tensor_2d(wtype, n_embd, n_ff), - w2: context.new_tensor_2d(wtype, n_ff, n_embd), - w3: context.new_tensor_2d(wtype, n_embd, n_ff), - }; - - tensors.insert( - format!("layers.{i}.attention_norm.weight"), - layer.attention_norm.share(), - ); - - tensors.insert(format!("layers.{i}.attention.wq.weight"), layer.wq.share()); - tensors.insert(format!("layers.{i}.attention.wk.weight"), layer.wk.share()); - tensors.insert(format!("layers.{i}.attention.wv.weight"), layer.wv.share()); - tensors.insert(format!("layers.{i}.attention.wo.weight"), layer.wo.share()); - - tensors.insert( - format!("layers.{i}.ffn_norm.weight"), - layer.ffn_norm.share(), - ); - - tensors.insert( - format!("layers.{i}.feed_forward.w1.weight"), - layer.w1.share(), - ); - tensors.insert( - format!("layers.{i}.feed_forward.w2.weight"), - layer.w2.share(), - ); - tensors.insert( - format!("layers.{i}.feed_forward.w3.weight"), - 
layer.w3.share(), - ); - - layers.push(layer); - } - - Llama { - hyperparameters: hparams, - vocabulary, - tok_embeddings, - norm, - output, - layers, - tensors, - _context: context, - _mmap: mmap, - } + /// Load the model from `path` with `n_context_tokens` context tokens. + /// + /// The status of the loading process will be reported through `load_progress_callback`. + pub fn load( + path: impl AsRef, + prefer_mmap: bool, + n_context_tokens: usize, + load_progress_callback: impl FnMut(LoadProgress), + ) -> Result { + llm_base::load(path, prefer_mmap, n_context_tokens, load_progress_callback) } +} +impl Model for Llama { + type Hyperparameters = Hyperparameters; - pub(crate) fn new_loader2( - hyperparameters: Hyperparameters, + fn new( + hyperparameters: Self::Hyperparameters, + n_context_tokens: usize, vocabulary: Vocabulary, - n_ff: usize, tensor_loader: impl TensorLoader, - ) -> Result { + ) -> Result { let n_embd = hyperparameters.n_embd; let n_layer = hyperparameters.n_layer; let n_vocab = hyperparameters.n_vocab; + let n_mult = hyperparameters.n_mult; + + let n_ff = ((2 * (4 * n_embd) / 3 + n_mult - 1) / n_mult) * n_mult; let mut tl = tensor_loader; @@ -182,59 +114,26 @@ impl Llama { layers.push(layer); } - let (_context, tensors, _mmap) = tl.finish(); + let (_context, _tensors, _mmap) = tl.finish(); Ok(Self { hyperparameters, + n_context_tokens, vocabulary, tok_embeddings, norm, output, layers, - tensors, _context, _mmap, }) } - /// Load the model from `path` with `n_context_tokens` context tokens. - /// - /// The status of the loading process will be reported through `load_progress_callback`. - pub fn load( - path: impl AsRef, - prefer_mmap: bool, - n_context_tokens: usize, - load_progress_callback: impl FnMut(LoadProgress), - ) -> Result { - // Loader2 is the default. It can support GGML, GGMF and GGJT, but does not support multipart models. - // - // Loader1 is the old loader. It can support multipart models, but will be deprecated. - let use_loader_2: bool = match std::env::var("GGML_LOADER").as_deref() { - Ok("2") => true, - Ok("1") => false, - Ok(_) => panic!("Please use GGML_LOADER=1 or GGML_LOADER=2"), - Err(_) => true, - }; - - if use_loader_2 { - loader2::load(path, prefer_mmap, n_context_tokens, load_progress_callback) - } else { - loader::load(path, prefer_mmap, n_context_tokens, load_progress_callback) - } - } - - pub(crate) fn tensors_mut(&mut self) -> &mut HashMap { - &mut self.tensors - } -} -impl Model for Llama { - type Hyperparameters = Hyperparameters; - /// Starts a new `InferenceSession` for this model. fn start_session(&self, params: InferenceSessionParameters) -> InferenceSession { InferenceSession::new( params, - self.hyperparameters.n_ctx, + self.n_context_tokens, self.hyperparameters.n_layer, self.hyperparameters.n_embd, self.hyperparameters.n_vocab, @@ -257,7 +156,6 @@ impl Model for Llama { let Hyperparameters { n_vocab, - n_ctx, n_embd, n_mult: _, n_head, @@ -265,6 +163,7 @@ impl Model for Llama { n_rot, file_type: _, } = self.hyperparameters; + let n_ctx = self.n_context_tokens; // For the first run, we need to guess a maximum buffer size so we can measure // the actual memory consumption of the temporary ggml context. 
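(Illustrative sketch, not part of the patch: rough caller-side usage of the consolidated loader. The four-argument `Llama::load` signature, the data-free `LoadProgress` variants, and `inference_with_prompt::<Infallible>` mirror the surrounding hunks; the path, prompt, and token budget are placeholders.)

use std::convert::Infallible;
use std::io::Write;

fn main() -> Result<(), llm_base::LoadError> {
    let model = llama::Llama::load(
        "/path/to/model.bin", // placeholder path
        true,                 // prefer_mmap: map the file when the container allows it
        2048,                 // n_context_tokens
        |progress| {
            // Progress variants no longer borrow the hyperparameters.
            if let llm_base::LoadProgress::ContextSize { bytes } = progress {
                log::info!("ggml ctx size = {:.2} MB", bytes as f64 / (1024.0 * 1024.0));
            }
        },
    )?;

    let mut session = model.start_session(Default::default());
    let mut rng = rand::thread_rng();
    let _ = session.inference_with_prompt::<Infallible>(
        &model,
        &Default::default(),
        "The best kind of wine is ", // placeholder prompt
        Some(32),                    // stop after at most 32 tokens
        &mut rng,
        |t| {
            print!("{t}");
            std::io::stdout().flush().unwrap();
            Ok(())
        },
    );
    Ok(())
}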
@@ -530,7 +429,7 @@ impl Model for Llama { } fn n_ctx(&self) -> usize { - self.hyperparameters.n_ctx + self.n_context_tokens } } #[cfg(test)] @@ -545,12 +444,12 @@ impl Llama { Self { hyperparameters: Default::default(), + n_context_tokens: 0, vocabulary: Default::default(), tok_embeddings, norm, output, layers: Default::default(), - tensors: Default::default(), _mmap: Default::default(), _context: context, } @@ -562,8 +461,6 @@ impl Llama { pub struct Hyperparameters { /// n_vocab pub n_vocab: usize, - /// n_ctx - pub n_ctx: usize, /// n_embd pub n_embd: usize, /// n_mult @@ -577,10 +474,25 @@ pub struct Hyperparameters { /// file_type pub file_type: FileType, } +impl llm_base::Hyperparameters for Hyperparameters { + fn read(reader: &mut dyn std::io::BufRead) -> Result { + Ok(Hyperparameters { + n_vocab: util::read_i32(reader)?.try_into()?, + n_embd: util::read_i32(reader)?.try_into()?, + n_mult: util::read_i32(reader)?.try_into()?, + n_head: util::read_i32(reader)?.try_into()?, + n_layer: util::read_i32(reader)?.try_into()?, + n_rot: util::read_i32(reader)?.try_into()?, + file_type: { + let ftype = util::read_i32(reader)?; + FileType::try_from(ftype).map_err(|_| LoadError::UnsupportedFileType(ftype))? + }, + }) + } -pub(crate) trait TensorLoader { - fn load(&mut self, name: &str, ne: &[usize]) -> Result; - fn finish(self) -> (ggml::Context, HashMap, Option); + fn n_vocabulary(&self) -> usize { + self.n_vocab + } } struct Layer { diff --git a/llama/src/loader2.rs b/llama/src/loader2.rs deleted file mode 100644 index bbf3567d..00000000 --- a/llama/src/loader2.rs +++ /dev/null @@ -1,234 +0,0 @@ -use ggml_format::{util::read_i32, ContainerType, PartialHyperparameters, TensorInfo}; -use llm_base::{util, FileType}; -use memmap2::Mmap; - -use std::{ - collections::HashMap, - fs::File, - io::{BufRead, BufReader, Read, Seek, SeekFrom}, - path::{Path, PathBuf}, -}; - -use crate::{Hyperparameters, Llama, LoadError, LoadProgress, TensorLoader, TokenId, Vocabulary}; - -pub(crate) fn load( - path: impl AsRef, - prefer_mmap: bool, - n_context_tokens: usize, - mut load_progress_callback: impl FnMut(LoadProgress), -) -> Result { - let main_path = path.as_ref(); - - let paths = util::find_all_model_files(main_path)?; - if paths.len() != 1 { - return Err(LoadError::MultipartNotSupported { paths }); - } - - let file = File::open(main_path).map_err(|e| LoadError::OpenFileFailed { - source: e, - path: main_path.to_owned(), - })?; - let mut reader = BufReader::new(&file); - - let path = path.as_ref().to_owned(); - - (load_progress_callback)(LoadProgress::PartLoading { - file: &path, - current_part: 0, - total_parts: 1, - }); - - let mut loader = Loader::new(n_context_tokens, load_progress_callback); - - ggml_format::load_model(&mut reader, &mut loader) - .map_err(|err| LoadError::from_format_error(err, path.clone()))?; - - let Loader { - hyperparameters, - vocabulary, - tensors, - mut load_progress_callback, - container_type, - .. - } = loader; - - let Hyperparameters { n_embd, n_mult, .. 
} = hyperparameters; - let n_ff = ((2 * (4 * n_embd) / 3 + n_mult - 1) / n_mult) * n_mult; - - let use_mmap = prefer_mmap && container_type.support_mmap(); - - let ctx_size = tensors - .values() - .map(|ti| { - ggml::Tensor::C_TYPE_SIZE - + ggml::OBJECT_SIZE - + if use_mmap { 0 } else { ti.calc_size() } - }) - .sum::(); - (load_progress_callback)(LoadProgress::ContextSize { bytes: ctx_size }); - let context = ggml::Context::init(ctx_size, !use_mmap); - - let mmap = if use_mmap { - let file = File::open(&path)?; - Some(unsafe { Mmap::map(&file)? }) - } else { - None - }; - - struct TensorLoader2<'a> { - path: PathBuf, - file: File, - tensors: HashMap, - context: ggml::Context, - mmap: Option, - load_progress_callback: &'a mut dyn FnMut(LoadProgress), - loaded_tensors: HashMap, - } - impl TensorLoader for TensorLoader2<'_> { - fn load(&mut self, name: &str, ne: &[usize]) -> Result { - let info = self - .tensors - .get(name) - .ok_or_else(|| LoadError::UnknownTensor { - path: self.path.clone(), - tensor_name: name.to_owned(), - })?; - - let ctx = &self.context; - let mut tensor = match ne.len() { - 1 => ctx.new_tensor_1d(info.element_type, ne[0]), - 2 => ctx.new_tensor_2d(info.element_type, ne[0], ne[1]), - 3 => ctx.new_tensor_3d(info.element_type, ne[0], ne[1], ne[2]), - _ => { - return Err(LoadError::InvariantBroken { - path: self.path.clone(), - invariant: format!( - "the tensor {name} had an unsupported dimension count: {ne:?}" - ), - }) - } - }; - - match self.mmap.as_ref() { - Some(mmap) => unsafe { - let ptr = mmap.as_ptr().offset(info.start_offset as isize); - tensor.set_data(ptr as *mut std::ffi::c_void); - }, - None => { - let buf: &mut [u8] = unsafe { - std::slice::from_raw_parts_mut(tensor.data() as *mut u8, tensor.nbytes()) - }; - self.file.seek(SeekFrom::Start(info.start_offset))?; - self.file.read_exact(buf)?; - } - } - - self.loaded_tensors.insert(name.to_owned(), tensor.share()); - (self.load_progress_callback)(LoadProgress::PartTensorLoaded { - file: &self.path, - current_tensor: self.loaded_tensors.len(), - tensor_count: self.tensors.len(), - }); - - Ok(tensor) - } - - fn finish(self) -> (ggml::Context, HashMap, Option) { - (self.context, self.loaded_tensors, self.mmap) - } - } - - let tensors_len = tensors.len(); - let tl = TensorLoader2 { - path: path.clone(), - file, - tensors, - context, - mmap, - load_progress_callback: &mut load_progress_callback, - loaded_tensors: Default::default(), - }; - - let model = Llama::new_loader2(hyperparameters, vocabulary, n_ff, tl)?; - - (load_progress_callback)(LoadProgress::PartLoaded { - file: &path, - byte_size: 0, - tensor_count: tensors_len, - }); - - Ok(model) -} - -pub(crate) struct Loader)> { - // Input - n_ctx: usize, - load_progress_callback: F, - - // Output - pub(crate) container_type: ContainerType, - pub(crate) hyperparameters: Hyperparameters, - pub(crate) vocabulary: Vocabulary, - pub(crate) tensors: HashMap, -} -impl)> Loader { - pub(crate) fn new(n_ctx: usize, load_progress_callback: F) -> Self { - Self { - n_ctx, - load_progress_callback, - - container_type: ContainerType::Ggjt, - hyperparameters: Hyperparameters::default(), - vocabulary: Vocabulary::default(), - tensors: HashMap::default(), - } - } -} -impl)> ggml_format::LoadHandler for Loader { - fn container_type(&mut self, container_type: ContainerType) -> Result<(), LoadError> { - self.container_type = container_type; - Ok(()) - } - - fn vocabulary_token(&mut self, i: usize, token: Vec, score: f32) -> Result<(), LoadError> { - let id = match TokenId::try_from(i) 
{ - Ok(id) => id, - Err(err) => return Err(LoadError::InvalidIntegerConversion(err)), - }; - self.vocabulary.push_token(id, token, score); - - Ok(()) - } - - fn read_hyperparameters( - &mut self, - reader: &mut dyn BufRead, - ) -> Result { - // NOTE: Field order matters! Data is laid out in the file exactly in this order. - let hyperparameters = Hyperparameters { - n_vocab: read_i32(reader)?.try_into()?, - n_embd: read_i32(reader)?.try_into()?, - n_mult: read_i32(reader)?.try_into()?, - n_head: read_i32(reader)?.try_into()?, - n_layer: read_i32(reader)?.try_into()?, - n_rot: read_i32(reader)?.try_into()?, - file_type: { - let ftype = read_i32(reader)?; - FileType::try_from(ftype).map_err(|_| LoadError::UnsupportedFileType(ftype))? - }, - n_ctx: self.n_ctx, - }; - let partial = PartialHyperparameters { - n_vocab: hyperparameters.n_vocab, - }; - self.hyperparameters = hyperparameters; - (self.load_progress_callback)(LoadProgress::HyperparametersLoaded(&self.hyperparameters)); - - Ok(partial) - } - - fn tensor_buffer(&mut self, info: TensorInfo) -> Result<(), LoadError> { - self.tensors.insert(info.name.clone(), info); - Ok(()) - } -} diff --git a/llama/src/loader.rs b/llama/src/old_loader.rs similarity index 85% rename from llama/src/loader.rs rename to llama/src/old_loader.rs index ca52af75..8037645e 100644 --- a/llama/src/loader.rs +++ b/llama/src/old_loader.rs @@ -1,4 +1,9 @@ #![allow(dead_code)] +//! Old loader. Can load multipart models, but is difficult to maintain. +//! Plan is to use this to create a tool that can convert multipart models +//! to single-part models for use with the new loader. +//! +//! use std::{ collections::HashMap, @@ -6,20 +11,15 @@ use std::{ path::Path, }; -use crate::{ElementType, Hyperparameters}; +use crate::Hyperparameters; use crate::{Llama, LoadError, LoadProgress, TokenId, Vocabulary}; -use ggml_format::{ - util::{has_data_left, read_bytes_with_len, read_f32, read_i32, read_u32}, - ContainerType, -}; -use llm_base::{mulf, util, FileType}; -use memmap2::Mmap; +use llm_base::{ggml, mulf, util, ContainerType, FileType, Mmap}; pub(crate) fn load( path: impl AsRef, prefer_mmap: bool, n_context_tokens: usize, - mut load_progress_callback: impl FnMut(LoadProgress), + mut load_progress_callback: impl FnMut(LoadProgress), ) -> Result { use std::fs::File; use std::io::BufReader; @@ -33,7 +33,7 @@ pub(crate) fn load( let mut reader = BufReader::new(&file); // Verify magic - let magic = read_u32(&mut reader)?; + let magic = util::read_u32(&mut reader)?; let model_type: ContainerType = match magic { ggml::FILE_MAGIC_GGMF => ContainerType::Ggmf, ggml::FILE_MAGIC_GGJT => ContainerType::Ggjt, @@ -49,7 +49,7 @@ pub(crate) fn load( // Load format version match model_type { ContainerType::Ggmf | ContainerType::Ggjt => { - let _version: u32 = match read_u32(&mut reader)? { + let _version: u32 = match util::read_u32(&mut reader)? { ggml::FORMAT_VERSION => ggml::FORMAT_VERSION, version => { return Err(LoadError::InvalidFormatVersion { @@ -69,15 +69,14 @@ pub(crate) fn load( // NOTE: Field order matters! Data is laid out in the file exactly // in this order. 
let hparams = Hyperparameters { - n_vocab: read_i32(&mut reader)?.try_into()?, - n_ctx: n_context_tokens, - n_embd: read_i32(&mut reader)?.try_into()?, - n_mult: read_i32(&mut reader)?.try_into()?, - n_head: read_i32(&mut reader)?.try_into()?, - n_layer: read_i32(&mut reader)?.try_into()?, - n_rot: read_i32(&mut reader)?.try_into()?, + n_vocab: util::read_i32(&mut reader)?.try_into()?, + n_embd: util::read_i32(&mut reader)?.try_into()?, + n_mult: util::read_i32(&mut reader)?.try_into()?, + n_head: util::read_i32(&mut reader)?.try_into()?, + n_layer: util::read_i32(&mut reader)?.try_into()?, + n_rot: util::read_i32(&mut reader)?.try_into()?, file_type: { - let ftype = read_i32(&mut reader)?; + let ftype = util::read_i32(&mut reader)?; FileType::try_from(ftype).map_err(|_| LoadError::UnsupportedFileType(ftype)) }?, }; @@ -85,7 +84,7 @@ pub(crate) fn load( let n_ff = ((2 * (4 * hparams.n_embd) / 3 + hparams.n_mult - 1) / hparams.n_mult) * hparams.n_mult; - load_progress_callback(LoadProgress::HyperparametersLoaded(&hparams)); + load_progress_callback(LoadProgress::HyperparametersLoaded); // =============== // Load vocabulary @@ -94,12 +93,12 @@ pub(crate) fn load( let mut vocab = Vocabulary::default(); for i in 0..hparams.n_vocab { - let len = read_i32(&mut reader)?; + let len = util::read_i32(&mut reader)?; let id = i as TokenId; - let token = read_bytes_with_len(&mut reader, len.try_into()?)?; + let token = util::read_bytes_with_len(&mut reader, len.try_into()?)?; let score = match model_type { - ContainerType::Ggmf | ContainerType::Ggjt => read_f32(&mut reader)?, + ContainerType::Ggmf | ContainerType::Ggjt => util::read_f32(&mut reader)?, ContainerType::Ggml => { // Legacy model, set empty score 0. @@ -172,30 +171,33 @@ pub(crate) fn load( (None, None) }; - let mut model = Llama::new_loader1(context, hparams, vocabulary, n_ff, wtype, mmap); - match model_type { - ContainerType::Ggmf | ContainerType::Ggml => { - let file_offset = reader.stream_position()?; - drop(reader); - load_weights_ggmf_or_unversioned( - file_offset, - main_path, - load_progress_callback, - model.tensors_mut(), - )? - } - ContainerType::Ggjt => { - load_weights_ggjt( - &mut reader, - mmap_ptr, - main_path, - load_progress_callback, - model.tensors_mut(), - )?; - } - } - - Ok(model) + let _ = (context, vocabulary, mmap, mmap_ptr, n_context_tokens); + + // let mut model = Llama::new_loader1(context, hparams, vocabulary, n_ff, wtype, mmap); + // match model_type { + // ContainerType::Ggmf | ContainerType::Ggml => { + // let file_offset = reader.stream_position()?; + // drop(reader); + // load_weights_ggmf_or_unversioned( + // file_offset, + // main_path, + // load_progress_callback, + // model.tensors_mut(), + // )? + // } + // ContainerType::Ggjt => { + // load_weights_ggjt( + // &mut reader, + // mmap_ptr, + // main_path, + // load_progress_callback, + // model.tensors_mut(), + // )?; + // } + // } + + // Ok(model) + todo!() } /// Helper function. Reads a string from the buffer and returns it. @@ -214,7 +216,7 @@ pub(crate) fn read_string(reader: &mut impl BufRead, len: usize) -> Result), + mut load_progress_callback: impl FnMut(LoadProgress), tensors: &mut HashMap, ) -> Result<(), LoadError> { use std::{fs::File, io::BufReader}; @@ -241,13 +243,13 @@ fn load_weights_ggmf_or_unversioned( // Load weights loop { - if !has_data_left(&mut part_reader)? { + if !util::has_data_left(&mut part_reader)? 
{ break; } - let n_dims = usize::try_from(read_i32(&mut part_reader)?)?; - let length = read_i32(&mut part_reader)?; - let ftype = read_u32(&mut part_reader)?; + let n_dims = usize::try_from(util::read_i32(&mut part_reader)?)?; + let length = util::read_i32(&mut part_reader)?; + let ftype = util::read_u32(&mut part_reader)?; let TensorHeaderGgmf { nelements, @@ -376,7 +378,7 @@ fn load_tensor_header_ggmf<'a>( assert!(n_dims <= ne.len()); #[allow(clippy::needless_range_loop)] for i in 0..n_dims { - ne[i] = read_i32(reader)? as i64; + ne[i] = util::read_i32(reader)? as i64; nelements *= usize::try_from(ne[i])?; } let tensor_name = read_string(reader, length as usize)?; @@ -456,7 +458,7 @@ fn load_tensor_header_ggmf<'a>( fn tensor_type_size(ftype: u32, ne: [i64; 2]) -> Option { let ftype = ggml::Type::try_from(ftype).ok()?; match ftype { - ElementType::Q4_0 | ElementType::Q4_1 => { + ggml::Type::Q4_0 | ggml::Type::Q4_1 => { assert_eq!(ne[0] % 64, 0); } _ => {} @@ -468,7 +470,7 @@ fn load_weights_ggjt( reader: &mut (impl BufRead + Seek), mmap_base: Option<*const u8>, path: &Path, - mut load_progress_callback: impl FnMut(LoadProgress), + mut load_progress_callback: impl FnMut(LoadProgress), tensors: &mut HashMap, ) -> Result<(), LoadError> // where R: std::io::Read @@ -482,20 +484,20 @@ fn load_weights_ggjt( }); loop { - if !has_data_left(reader)? { + if !util::has_data_left(reader)? { break; } - let n_dims = read_i32(reader)? as usize; - let length = read_i32(reader)?; - let ftype = read_u32(reader)?; + let n_dims = util::read_i32(reader)? as usize; + let length = util::read_i32(reader)?; + let ftype = util::read_u32(reader)?; let mut nelements: usize = 1; let mut ne = [1i64, 1]; assert!(n_dims <= ne.len()); #[allow(clippy::needless_range_loop)] for i in 0..n_dims { - let dim = read_i32(reader)? as usize; + let dim = util::read_i32(reader)? as usize; ne[i] = dim as i64; nelements *= dim; } diff --git a/llama/src/quantize.rs b/llama/src/quantize.rs index dd7ec58b..54643a1c 100644 --- a/llama/src/quantize.rs +++ b/llama/src/quantize.rs @@ -1,8 +1,9 @@ //! Implements quantization of weights. -use crate::{loader2::Loader, Hyperparameters, LoadError, LoadProgress}; -use ggml_format::{util::write_i32, SaveError, SaveHandler, TensorData, TensorInfo}; +use crate::{Hyperparameters, LoadError, LoadProgress}; +use ggml_format::{SaveError, SaveHandler, TensorData, TensorInfo}; use half::f16; +use llm_base::{ggml, util, Loader}; use std::{ collections::HashMap, fs::File, @@ -17,7 +18,7 @@ use thiserror::Error; /// Progress of quantization. pub enum QuantizeProgress<'a> { /// Hyperparameters have been loaded. - HyperparametersLoaded(&'a Hyperparameters), + HyperparametersLoaded, /// A tensor is being loaded. TensorLoading { /// Name of the tensor. 
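(Illustrative sketch, not part of the patch: with `HyperparametersLoaded` now payload-free, a quantization progress handler can match on unit variants. The `name` field and the remaining variants are assumptions based on the doc comments in this hunk.)

/// Illustrative only: reporting quantization progress with the new variants.
fn report(progress: QuantizeProgress<'_>) {
    match progress {
        // No longer borrows the hyperparameters.
        QuantizeProgress::HyperparametersLoaded => log::info!("hyperparameters loaded"),
        // Field name assumed from the "Name of the tensor" doc comment above.
        QuantizeProgress::TensorLoading { name, .. } => log::debug!("loading tensor {name}"),
        // Remaining variants are outside this hunk; handle as needed.
        _ => {}
    }
}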
@@ -145,11 +146,11 @@ pub fn quantize( path: path_in.to_owned(), })?; let mut reader = BufReader::new(&file_in); - let mut loader = Loader::new(0, { + let mut loader = Loader::new({ let progress_callback = progress_callback.clone(); move |p| { - if let LoadProgress::HyperparametersLoaded(h) = p { - progress_callback(QuantizeProgress::HyperparametersLoaded(h)) + if let LoadProgress::HyperparametersLoaded = p { + progress_callback(QuantizeProgress::HyperparametersLoaded) } } }); @@ -240,13 +241,13 @@ impl<'a, F: Fn(QuantizeProgress)> QuantizeSaver<'a, F> { impl SaveHandler for QuantizeSaver<'_, F> { fn write_hyperparameters(&mut self, writer: &mut dyn Write) -> Result<(), QuantizeError> { let h = self.hyperparameters; - write_i32(writer, h.n_vocab.try_into()?)?; - write_i32(writer, h.n_embd.try_into()?)?; - write_i32(writer, h.n_mult.try_into()?)?; - write_i32(writer, h.n_head.try_into()?)?; - write_i32(writer, h.n_layer.try_into()?)?; - write_i32(writer, h.n_rot.try_into()?)?; - write_i32(writer, h.file_type.into())?; + util::write_i32(writer, h.n_vocab.try_into()?)?; + util::write_i32(writer, h.n_embd.try_into()?)?; + util::write_i32(writer, h.n_mult.try_into()?)?; + util::write_i32(writer, h.n_head.try_into()?)?; + util::write_i32(writer, h.n_layer.try_into()?)?; + util::write_i32(writer, h.n_rot.try_into()?)?; + util::write_i32(writer, h.file_type.into())?; Ok(()) } diff --git a/llm-base/Cargo.toml b/llm-base/Cargo.toml index 846a94a6..7968f7dd 100644 --- a/llm-base/Cargo.toml +++ b/llm-base/Cargo.toml @@ -20,3 +20,4 @@ partial_sort = "0.2.0" thiserror = "1.0" serde_bytes = "0.11" zstd = { version = "0.12", default-features = false } +memmap2 = "0.5.10" diff --git a/llm-base/src/inference_session.rs b/llm-base/src/inference_session.rs index dbab0d82..22ae2a71 100644 --- a/llm-base/src/inference_session.rs +++ b/llm-base/src/inference_session.rs @@ -5,7 +5,7 @@ use rand::{distributions::WeightedIndex, prelude::Distribution}; use thiserror::Error; use crate::{ - mulf, EvaluateOutputRequest, InferenceError, InferenceParameters, Model, TokenId, + mulf, ErasedModel, EvaluateOutputRequest, InferenceError, InferenceParameters, TokenId, TokenUtf8Buffer, EOT_TOKEN_ID, }; @@ -65,7 +65,7 @@ impl InferenceSession { /// Feed a prompt to the model for this session. pub fn feed_prompt( &mut self, - model: &impl Model, + model: &dyn ErasedModel, params: &InferenceParameters, prompt: &str, mut callback: impl FnMut(&[u8]) -> Result<(), E>, @@ -103,7 +103,7 @@ impl InferenceSession { /// Infer the next token for this session. pub fn infer_next_token<'v>( &mut self, - model: &'v impl Model, + model: &'v dyn ErasedModel, params: &InferenceParameters, rng: &mut impl rand::Rng, ) -> Result<&'v [u8], InferenceError> { @@ -140,7 +140,7 @@ impl InferenceSession { /// If `params.play_back_previous_tokens` is specified, this will "play back" all existing tokens in the session. pub fn inference_with_prompt( &mut self, - model: &impl Model, + model: &dyn ErasedModel, params: &InferenceParameters, prompt: &str, maximum_token_count: Option, @@ -322,7 +322,7 @@ impl InferenceSession { /// Creates an [InferenceSession] from a snapshot. 
pub fn from_snapshot( snapshot: InferenceSnapshot, - model: &impl Model, + model: &dyn ErasedModel, ) -> Result { let mut session = model.start_session(snapshot.session_params); diff --git a/llm-base/src/lib.rs b/llm-base/src/lib.rs index 5b19b5b0..db714bd1 100644 --- a/llm-base/src/lib.rs +++ b/llm-base/src/lib.rs @@ -11,16 +11,18 @@ pub mod snapshot; pub mod util; mod inference_session; -mod loader_common; +mod loader; mod vocabulary; +pub use ggml; pub use ggml::Type as ElementType; pub use inference_session::{ InferenceSession, InferenceSessionParameters, InferenceSnapshot, ModelKVMemoryType, SnapshotError, }; -pub use loader_common::{FileType, LoadError, LoadProgress}; -pub use model::Model; +pub use loader::{load, ContainerType, FileType, LoadError, LoadProgress, Loader, TensorLoader}; +pub use memmap2::Mmap; +pub use model::{ErasedModel, Hyperparameters, Model}; pub use util::TokenUtf8Buffer; pub use vocabulary::{TokenBias, TokenId, Vocabulary}; diff --git a/llm-base/src/loader_common.rs b/llm-base/src/loader.rs similarity index 54% rename from llm-base/src/loader_common.rs rename to llm-base/src/loader.rs index 81a38da9..f58450c7 100644 --- a/llm-base/src/loader_common.rs +++ b/llm-base/src/loader.rs @@ -1,10 +1,18 @@ use std::{ + collections::HashMap, fmt::{Display, Formatter}, + fs::File, + io::{BufRead, BufReader, Read, Seek, SeekFrom}, path::{Path, PathBuf}, }; -use crate::util::FindAllModelFilesError; -use ggml_format::{ContainerType, LoadError as FormatLoadError}; +use crate::{ + util::{self, FindAllModelFilesError}, + Hyperparameters, Model, TokenId, Vocabulary, +}; +pub use ggml_format::ContainerType; +use ggml_format::{LoadError as FormatLoadError, PartialHyperparameters, TensorInfo}; +use memmap2::Mmap; use thiserror::Error; /// How the tensors are stored in the GGML LLaMA model. @@ -73,9 +81,9 @@ impl Display for FileType { /// Each variant represents a step within the process of loading the model. /// These can be used to report progress to the user. #[derive(Clone, PartialEq, Eq, Debug)] -pub enum LoadProgress<'a, HP> { +pub enum LoadProgress<'a> { /// The hyperparameters have been loaded from the model. - HyperparametersLoaded(&'a HP), + HyperparametersLoaded, /// The context has been created. ContextSize { /// The size of the context. @@ -264,3 +272,225 @@ impl LoadError { } } } + +/// Used by models to fetch tensors from a loader. +pub trait TensorLoader { + /// Loads a tensor from the loader. + fn load(&mut self, name: &str, ne: &[usize]) -> Result; + /// Finish loading the model, and extract all of the state from the loader. + fn finish(self) -> (ggml::Context, HashMap, Option); +} + +/// Load an arbitrary GGML model. 
+pub fn load( + path: impl AsRef, + prefer_mmap: bool, + n_context_tokens: usize, + mut load_progress_callback: impl FnMut(LoadProgress), +) -> Result { + let main_path = path.as_ref(); + + let paths = util::find_all_model_files(main_path)?; + if paths.len() != 1 { + return Err(LoadError::MultipartNotSupported { paths }); + } + + let file = File::open(main_path).map_err(|e| LoadError::OpenFileFailed { + source: e, + path: main_path.to_owned(), + })?; + let mut reader = BufReader::new(&file); + + let path = path.as_ref().to_owned(); + + (load_progress_callback)(LoadProgress::PartLoading { + file: &path, + current_part: 0, + total_parts: 1, + }); + + let mut loader = Loader::new(load_progress_callback); + + ggml_format::load_model(&mut reader, &mut loader) + .map_err(|err| LoadError::from_format_error(err, path.clone()))?; + + let Loader { + hyperparameters, + vocabulary, + tensors, + mut load_progress_callback, + container_type, + .. + } = loader; + + let use_mmap = prefer_mmap && container_type.support_mmap(); + + let ctx_size = tensors + .values() + .map(|ti| { + ggml::Tensor::C_TYPE_SIZE + + ggml::OBJECT_SIZE + + if use_mmap { 0 } else { ti.calc_size() } + }) + .sum::(); + (load_progress_callback)(LoadProgress::ContextSize { bytes: ctx_size }); + let context = ggml::Context::init(ctx_size, !use_mmap); + + let mmap = if use_mmap { + let file = File::open(&path)?; + Some(unsafe { Mmap::map(&file)? }) + } else { + None + }; + + struct MmapCompatibleLoader<'a> { + path: PathBuf, + file: File, + tensors: HashMap, + context: ggml::Context, + mmap: Option, + load_progress_callback: &'a mut dyn FnMut(LoadProgress), + loaded_tensors: HashMap, + } + impl TensorLoader for MmapCompatibleLoader<'_> { + fn load(&mut self, name: &str, ne: &[usize]) -> Result { + let info = self + .tensors + .get(name) + .ok_or_else(|| LoadError::UnknownTensor { + path: self.path.clone(), + tensor_name: name.to_owned(), + })?; + + let ctx = &self.context; + let mut tensor = match ne.len() { + 1 => ctx.new_tensor_1d(info.element_type, ne[0]), + 2 => ctx.new_tensor_2d(info.element_type, ne[0], ne[1]), + 3 => ctx.new_tensor_3d(info.element_type, ne[0], ne[1], ne[2]), + _ => { + return Err(LoadError::InvariantBroken { + path: self.path.clone(), + invariant: format!( + "the tensor {name} had an unsupported dimension count: {ne:?}" + ), + }) + } + }; + + match self.mmap.as_ref() { + Some(mmap) => unsafe { + let ptr = mmap.as_ptr().offset(info.start_offset as isize); + tensor.set_data(ptr as *mut std::ffi::c_void); + }, + None => { + let buf: &mut [u8] = unsafe { + std::slice::from_raw_parts_mut(tensor.data() as *mut u8, tensor.nbytes()) + }; + self.file.seek(SeekFrom::Start(info.start_offset))?; + self.file.read_exact(buf)?; + } + } + + self.loaded_tensors.insert(name.to_owned(), tensor.share()); + (self.load_progress_callback)(LoadProgress::PartTensorLoaded { + file: &self.path, + current_tensor: self.loaded_tensors.len(), + tensor_count: self.tensors.len(), + }); + + Ok(tensor) + } + + fn finish(self) -> (ggml::Context, HashMap, Option) { + (self.context, self.loaded_tensors, self.mmap) + } + } + + let tensors_len = tensors.len(); + let tl = MmapCompatibleLoader { + path: path.clone(), + file, + tensors, + context, + mmap, + load_progress_callback: &mut load_progress_callback, + loaded_tensors: Default::default(), + }; + + let model = Model::new(hyperparameters, n_context_tokens, vocabulary, tl)?; + + (load_progress_callback)(LoadProgress::PartLoaded { + file: &path, + byte_size: 0, + tensor_count: tensors_len, + }); + + 
Ok(model) +} + +/// A GGML format loader for LLMs. +pub struct Loader { + // Input + load_progress_callback: F, + + // Output + /// The container type of the model. + pub container_type: ContainerType, + /// The hyperparameters of the model. + pub hyperparameters: Hp, + /// The vocabulary of the model. + pub vocabulary: Vocabulary, + /// The tensors of the model. + pub tensors: HashMap, +} +impl Loader { + /// Creates a new loader. + pub fn new(load_progress_callback: F) -> Self { + Self { + load_progress_callback, + + container_type: ContainerType::Ggjt, + hyperparameters: Hp::default(), + vocabulary: Vocabulary::default(), + tensors: HashMap::default(), + } + } +} +impl ggml_format::LoadHandler + for Loader +{ + fn container_type(&mut self, container_type: ContainerType) -> Result<(), LoadError> { + self.container_type = container_type; + Ok(()) + } + + fn vocabulary_token(&mut self, i: usize, token: Vec, score: f32) -> Result<(), LoadError> { + let id = match TokenId::try_from(i) { + Ok(id) => id, + Err(err) => return Err(LoadError::InvalidIntegerConversion(err)), + }; + self.vocabulary.push_token(id, token, score); + + Ok(()) + } + + fn read_hyperparameters( + &mut self, + reader: &mut dyn BufRead, + ) -> Result { + // NOTE: Field order matters! Data is laid out in the file exactly in this order. + let hyperparameters = Hp::read(reader)?; + let partial = PartialHyperparameters { + n_vocab: hyperparameters.n_vocabulary(), + }; + self.hyperparameters = hyperparameters; + (self.load_progress_callback)(LoadProgress::HyperparametersLoaded); + + Ok(partial) + } + + fn tensor_buffer(&mut self, info: TensorInfo) -> Result<(), LoadError> { + self.tensors.insert(info.name.clone(), info); + Ok(()) + } +} diff --git a/llm-base/src/model.rs b/llm-base/src/model.rs index ea3c7be1..460982a7 100644 --- a/llm-base/src/model.rs +++ b/llm-base/src/model.rs @@ -1,12 +1,24 @@ +use std::{error::Error, io::BufRead}; + use crate::{ - vocabulary::TokenId, EvaluateOutputRequest, InferenceParameters, InferenceSession, - InferenceSessionParameters, Vocabulary, + loader::TensorLoader, vocabulary::TokenId, EvaluateOutputRequest, InferenceParameters, + InferenceSession, InferenceSessionParameters, LoadError, Vocabulary, }; /// A large language model. pub trait Model { /// Hyperparameters for the model - type Hyperparameters; + type Hyperparameters: Hyperparameters; + + /// Creates a new model from the provided hyperparameters. + fn new( + hyperparameters: Self::Hyperparameters, + n_context_tokens: usize, + vocabulary: Vocabulary, + tensor_loader: impl TensorLoader, + ) -> Result + where + Self: Sized; /// Starts a new `InferenceSession` for this model. fn start_session(&self, params: InferenceSessionParameters) -> InferenceSession; @@ -31,3 +43,62 @@ pub trait Model { /// Model context size fn n_ctx(&self) -> usize; } + +/// A type-erased model to allow for interacting with a model without knowing +/// its hyperparameters. +pub trait ErasedModel { + /// Starts a new `InferenceSession` for this model. + fn start_session(&self, params: InferenceSessionParameters) -> InferenceSession; + + /// Evaluates the transformer. + /// + /// The provided `output_request` struct lets you specify which additional + /// data you are interested in fetching from the transformer. Setting a + /// field to a `Some` value will clear and fill the provided vector with + /// data. The provided vector will be resized to the exact output size. 
+ fn evaluate( + &self, + session: &mut InferenceSession, + params: &InferenceParameters, + input_tokens: &[TokenId], + output_request: &mut EvaluateOutputRequest, + ); + + /// Model vocabulary + fn vocabulary(&self) -> &Vocabulary; + + /// Model context size + fn n_ctx(&self) -> usize; +} +impl> ErasedModel for M { + fn start_session(&self, params: InferenceSessionParameters) -> InferenceSession { + Model::start_session(self, params) + } + + fn evaluate( + &self, + session: &mut InferenceSession, + params: &InferenceParameters, + input_tokens: &[TokenId], + output_request: &mut EvaluateOutputRequest, + ) { + Model::evaluate(self, session, params, input_tokens, output_request) + } + + fn vocabulary(&self) -> &Vocabulary { + Model::vocabulary(self) + } + + fn n_ctx(&self) -> usize { + Model::n_ctx(self) + } +} + +/// Implemented by model hyperparameters for loading and saving to a GGML model read/writer. +pub trait Hyperparameters: Sized + Default { + /// Read the parameters from a reader. + fn read(reader: &mut dyn BufRead) -> Result; + + /// Get the number of tokens in the vocabulary. + fn n_vocabulary(&self) -> usize; +} diff --git a/llm-base/src/snapshot.rs b/llm-base/src/snapshot.rs index 0bd6903a..1494258f 100644 --- a/llm-base/src/snapshot.rs +++ b/llm-base/src/snapshot.rs @@ -5,7 +5,7 @@ use std::{ path::Path, }; -use crate::{InferenceSession, InferenceSessionParameters, Model}; +use crate::{ErasedModel, InferenceSession, InferenceSessionParameters}; use zstd::{ stream::{read::Decoder, write::Encoder}, @@ -16,12 +16,12 @@ const SNAPSHOT_COMPRESSION_LEVEL: CompressionLevel = 1; /// Read or create a session pub fn read_or_create_session( - model: &impl Model, + model: &dyn ErasedModel, persist_session: Option<&Path>, load_session: Option<&Path>, inference_session_params: InferenceSessionParameters, ) -> (InferenceSession, bool) { - fn load(model: &impl Model, path: &Path) -> InferenceSession { + fn load(model: &dyn ErasedModel, path: &Path) -> InferenceSession { let file = unwrap_or_exit(File::open(path), || format!("Could not open file {path:?}")); let decoder = unwrap_or_exit(Decoder::new(BufReader::new(file)), || { format!("Could not create decoder for {path:?}") diff --git a/llm-base/src/util.rs b/llm-base/src/util.rs index 3934dcc6..eeae8b1a 100644 --- a/llm-base/src/util.rs +++ b/llm-base/src/util.rs @@ -1,3 +1,4 @@ +pub use ggml_format::util::*; use std::path::{Path, PathBuf}; /// NOTE: The original code relies in promotion rules and automatic cast between diff --git a/llm-cli/src/cli_args.rs b/llm-cli/src/cli_args.rs index bddb4be4..efc66fa6 100644 --- a/llm-cli/src/cli_args.rs +++ b/llm-cli/src/cli_args.rs @@ -1,10 +1,13 @@ -use std::path::PathBuf; +use std::{ + fmt::Debug, + path::{Path, PathBuf}, +}; use clap::{Parser, ValueEnum}; use color_eyre::eyre::{Result, WrapErr}; use llm::{ - llama, InferenceParameters, InferenceSessionParameters, LoadProgress, ModelKVMemoryType, - TokenBias, EOT_TOKEN_ID, + ElementType, ErasedModel, InferenceParameters, InferenceSessionParameters, LoadProgress, + ModelKVMemoryType, TokenBias, EOT_TOKEN_ID, }; use rand::SeedableRng; @@ -250,6 +253,10 @@ pub struct ModelLoad { #[arg(long, short = 'm')] pub model_path: PathBuf, + /// The model architecture to use. + #[arg(long, short = 'a', default_value_t, value_enum)] + pub model_architecture: ModelArchitecture, + /// Sets the size of the context (in tokens). Allows feeding longer prompts. /// Note that this affects memory. 
/// @@ -268,60 +275,71 @@ pub struct ModelLoad { #[arg(long)] pub no_mmap: bool, } +#[derive(Parser, Debug, ValueEnum, Clone, Copy, Default)] +pub enum ModelArchitecture { + /// Meta's LLaMA model and derivatives (Vicuna, etc). + #[default] + Llama, + /// The BigScience Large Open-science Open-access Multilingual Language Model (BLOOM). + Bloom, +} impl ModelLoad { - pub fn load(&self) -> Result { + pub fn load(&self) -> Result> { let now = std::time::Instant::now(); - let model = llama::Llama::load( - &self.model_path, - !self.no_mmap, - self.num_ctx_tokens, - |progress| match progress { - LoadProgress::HyperparametersLoaded(hparams) => { - log::debug!("Loaded hyperparameters {hparams:#?}") - } - LoadProgress::ContextSize { bytes } => log::info!( - "ggml ctx size = {:.2} MB\n", - bytes as f64 / (1024.0 * 1024.0) - ), - LoadProgress::PartLoading { - file, - current_part, - total_parts, - } => { - let current_part = current_part + 1; - log::info!( - "Loading model part {}/{} from '{}' (mmap preferred: {})\n", + + let prefer_mmap = !self.no_mmap; + let model = self + .load_indirect( + &self.model_path, + !self.no_mmap, + self.num_ctx_tokens, + |progress| match progress { + LoadProgress::HyperparametersLoaded => { + log::debug!("Loaded hyperparameters") + } + LoadProgress::ContextSize { bytes } => log::info!( + "ggml ctx size = {:.2} MB\n", + bytes as f64 / (1024.0 * 1024.0) + ), + LoadProgress::PartLoading { + file, current_part, total_parts, - file.to_string_lossy(), - !self.no_mmap - ) - } - LoadProgress::PartTensorLoaded { - current_tensor, - tensor_count, - .. - } => { - let current_tensor = current_tensor + 1; - if current_tensor % 8 == 0 { - log::info!("Loaded tensor {current_tensor}/{tensor_count}"); + } => { + let current_part = current_part + 1; + log::info!( + "Loading model part {}/{} from '{}' (mmap preferred: {})\n", + current_part, + total_parts, + file.to_string_lossy(), + prefer_mmap + ) } - } - LoadProgress::PartLoaded { - file, - byte_size, - tensor_count, - } => { - log::info!("Loading of '{}' complete", file.to_string_lossy()); - log::info!( - "Model size = {:.2} MB / num tensors = {}", - byte_size as f64 / 1024.0 / 1024.0, - tensor_count - ); - } - }, - ) - .wrap_err("Could not load model")?; + LoadProgress::PartTensorLoaded { + current_tensor, + tensor_count, + .. + } => { + let current_tensor = current_tensor + 1; + if current_tensor % 8 == 0 { + log::info!("Loaded tensor {current_tensor}/{tensor_count}"); + } + } + LoadProgress::PartLoaded { + file, + byte_size, + tensor_count, + } => { + log::info!("Loading of '{}' complete", file.to_string_lossy()); + log::info!( + "Model size = {:.2} MB / num tensors = {}", + byte_size as f64 / 1024.0 / 1024.0, + tensor_count + ); + } + }, + ) + .wrap_err("Could not load model")?; log::info!( "Model fully loaded! Elapsed: {}ms", @@ -330,6 +348,29 @@ impl ModelLoad { Ok(model) } + + fn load_indirect( + &self, + path: &Path, + prefer_mmap: bool, + n_context_tokens: usize, + load_progress_callback: impl FnMut(LoadProgress<'_>), + ) -> Result> { + Ok(match self.model_architecture { + ModelArchitecture::Llama => Box::new(llm::load::( + path, + prefer_mmap, + n_context_tokens, + load_progress_callback, + )?), + ModelArchitecture::Bloom => Box::new(llm::load::( + path, + prefer_mmap, + n_context_tokens, + load_progress_callback, + )?), + }) + } } #[derive(Parser, Debug)] @@ -421,11 +462,11 @@ pub enum QuantizationTarget { /// Quantized 4-bit (type 1). 
Q4_1, } -impl From for llama::ElementType { +impl From for ElementType { fn from(t: QuantizationTarget) -> Self { match t { - QuantizationTarget::Q4_0 => llama::ElementType::Q4_0, - QuantizationTarget::Q4_1 => llama::ElementType::Q4_1, + QuantizationTarget::Q4_0 => ElementType::Q4_0, + QuantizationTarget::Q4_1 => ElementType::Q4_1, } } } diff --git a/llm-cli/src/main.rs b/llm-cli/src/main.rs index 84f716a5..adbc1a2e 100644 --- a/llm-cli/src/main.rs +++ b/llm-cli/src/main.rs @@ -3,7 +3,7 @@ use std::{convert::Infallible, io::Write}; use clap::Parser; use cli_args::Args; use color_eyre::eyre::{Context, Result}; -use llm::{llama::convert::convert_pth_to_ggml, snapshot, InferenceError, Model}; +use llm::{llama::convert::convert_pth_to_ggml, snapshot, InferenceError}; use rustyline::error::ReadlineError; mod cli_args; @@ -33,7 +33,7 @@ fn infer(args: &cli_args::Infer) -> Result<()> { let inference_session_params = args.generate.inference_session_parameters(); let model = args.model_load.load()?; let (mut session, session_loaded) = snapshot::read_or_create_session( - &model, + model.as_ref(), args.persist_session.as_deref(), args.generate.load_session.as_deref(), inference_session_params, @@ -42,7 +42,7 @@ fn infer(args: &cli_args::Infer) -> Result<()> { let mut rng = args.generate.rng(); let res = session.inference_with_prompt::( - &model, + model.as_ref(), &inference_params, &prompt, args.generate.num_predict, @@ -116,7 +116,7 @@ fn interactive( let inference_session_params = args.generate.inference_session_parameters(); let model = args.model_load.load()?; let (mut session, session_loaded) = snapshot::read_or_create_session( - &model, + model.as_ref(), None, args.generate.load_session.as_deref(), inference_session_params, @@ -142,7 +142,7 @@ fn interactive( let mut sp = spinners::Spinner::new(spinners::Spinners::Dots2, "".to_string()); if let Err(InferenceError::ContextFull) = session.feed_prompt::( - &model, + model.as_ref(), &inference_params, &prompt, |_| Ok(()), @@ -152,7 +152,7 @@ fn interactive( sp.stop(); let res = session.inference_with_prompt::( - &model, + model.as_ref(), &inference_params, "", args.generate.num_predict, @@ -192,7 +192,7 @@ fn quantize(args: &cli_args::Quantize) -> Result<()> { &args.destination, args.target.into(), |progress| match progress { - HyperparametersLoaded(_) => log::info!("Loaded hyperparameters"), + HyperparametersLoaded => log::info!("Loaded hyperparameters"), TensorLoading { name, dims, diff --git a/llm/src/lib.rs b/llm/src/lib.rs index 3653e712..1d89390b 100644 --- a/llm/src/lib.rs +++ b/llm/src/lib.rs @@ -1,11 +1,11 @@ pub use llm_base::{ - snapshot, FileType, InferenceError, InferenceParameters, InferenceSession, - InferenceSessionParameters, InferenceSnapshot, LoadError, LoadProgress, Model, - ModelKVMemoryType, SnapshotError, TokenBias, TokenId, TokenUtf8Buffer, Vocabulary, + load, snapshot, ElementType, ErasedModel, FileType, InferenceError, InferenceParameters, + InferenceSession, InferenceSessionParameters, InferenceSnapshot, LoadError, LoadProgress, + Model, ModelKVMemoryType, SnapshotError, TokenBias, TokenId, TokenUtf8Buffer, Vocabulary, EOT_TOKEN_ID, }; #[cfg(feature = "bloom")] -pub use bloom; +pub use bloom::{self, Bloom}; #[cfg(feature = "llama")] -pub use llama; +pub use llama::{self, Llama}; From 1761512e52d178b04dbab55188d069ebc53dcb6e Mon Sep 17 00:00:00 2001 From: Dan Forbes Date: Wed, 26 Apr 2023 09:58:39 -0700 Subject: [PATCH 16/35] Add example for testing BLOOM inference --- bloom/examples/bloom_inference.rs | 33 
+++++++++++++++++++++++++++++++ bloom/src/lib.rs | 18 ++++++++++++++++- 2 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 bloom/examples/bloom_inference.rs diff --git a/bloom/examples/bloom_inference.rs b/bloom/examples/bloom_inference.rs new file mode 100644 index 00000000..9972cacf --- /dev/null +++ b/bloom/examples/bloom_inference.rs @@ -0,0 +1,33 @@ +use std::{convert::Infallible, env::args, io::Write}; + +use llm_base::{snapshot, LoadError}; + +extern crate bloom; + +fn main() -> Result<(), LoadError> { + let args: Vec = args().collect(); + let bloom = bloom::Bloom::load(&args[1], true, 32, |_| {})?; + let (mut session, _) = snapshot::read_or_create_session( + &bloom, + Default::default(), + Default::default(), + Default::default(), + ); + + let _ = session.inference_with_prompt::( + &bloom, + &Default::default(), + "The best kind of wine is ", + Some(32), + &mut rand::thread_rng(), + |t| { + print!("{t}"); + std::io::stdout().flush().unwrap(); + + Ok(()) + }, + ); + + println!(); + Ok(()) +} diff --git a/bloom/src/lib.rs b/bloom/src/lib.rs index 01c53a10..55499fe1 100644 --- a/bloom/src/lib.rs +++ b/bloom/src/lib.rs @@ -1,7 +1,9 @@ +use std::path::Path; + // use ggml_loader::{LoadError, LoadProgress}; use llm_base::{ util, EvaluateOutputRequest, FileType, InferenceParameters, InferenceSession, - InferenceSessionParameters, LoadError, Mmap, Model, TokenId, Vocabulary, + InferenceSessionParameters, LoadError, Mmap, Model, TokenId, Vocabulary, LoadProgress, }; /// The weights for the BLOOM model. All the mutable state is split into a @@ -24,6 +26,20 @@ pub struct Bloom { _mmap: Option, } +impl Bloom { + /// Load the model from `path` with `n_context_tokens` context tokens. + /// + /// The status of the loading process will be reported through `load_progress_callback`. + pub fn load( + path: impl AsRef, + prefer_mmap: bool, + n_context_tokens: usize, + load_progress_callback: impl FnMut(LoadProgress), + ) -> Result { + llm_base::load(path, prefer_mmap, n_context_tokens, load_progress_callback) + } +} + impl Model for Bloom { type Hyperparameters = Hyperparameters; From 8d2d9c6bdeb9e2feb40cb8e9f19d64cd459ef5d3 Mon Sep 17 00:00:00 2001 From: Dan Forbes Date: Wed, 26 Apr 2023 10:31:33 -0700 Subject: [PATCH 17/35] cargo fmt --- bloom/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bloom/src/lib.rs b/bloom/src/lib.rs index 55499fe1..44b00845 100644 --- a/bloom/src/lib.rs +++ b/bloom/src/lib.rs @@ -3,7 +3,7 @@ use std::path::Path; // use ggml_loader::{LoadError, LoadProgress}; use llm_base::{ util, EvaluateOutputRequest, FileType, InferenceParameters, InferenceSession, - InferenceSessionParameters, LoadError, Mmap, Model, TokenId, Vocabulary, LoadProgress, + InferenceSessionParameters, LoadError, LoadProgress, Mmap, Model, TokenId, Vocabulary, }; /// The weights for the BLOOM model. All the mutable state is split into a From 813bdd1922117c45ce44e6f0d1cc8972d3645e3f Mon Sep 17 00:00:00 2001 From: Dan Forbes Date: Wed, 26 Apr 2023 12:02:00 -0700 Subject: [PATCH 18/35] Add launch.json for debugging loading and inference --- .vscode/launch.json | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 .vscode/launch.json diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 00000000..d13a1828 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,44 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. 
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "type": "lldb", + "request": "launch", + "name": "Debug example 'llama_inference'", + "cargo": { + "args": [ + "build", + "--example=llama_inference", + "--package=llama" + ], + "filter": { + "name": "llama_inference", + "kind": "example" + } + }, + "args": ["${env:HOME}/.ggml-models/gpt4all-7b.bin"], + "cwd": "${workspaceFolder}" + }, + { + "type": "lldb", + "request": "launch", + "name": "Debug example 'bloom_inference'", + "cargo": { + "args": [ + "build", + "--example=bloom_inference", + "--package=bloom" + ], + "filter": { + "name": "bloom_inference", + "kind": "example" + } + }, + "args": ["${env:HOME}/.ggml-models/bloom-7b.bin"], + "cwd": "${workspaceFolder}" + } + ] +} \ No newline at end of file From e19418c27ed6182d3bdf06140a14187da857f6a7 Mon Sep 17 00:00:00 2001 From: Dan Forbes Date: Thu, 27 Apr 2023 08:07:41 -0700 Subject: [PATCH 19/35] Check tensor dimensions when loading --- llm-base/src/loader.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/llm-base/src/loader.rs b/llm-base/src/loader.rs index a50fd202..eb1bc84f 100644 --- a/llm-base/src/loader.rs +++ b/llm-base/src/loader.rs @@ -362,8 +362,19 @@ pub fn load( tensor_name: name.to_owned(), })?; + let dims = ne.len(); + if dims != info.n_dims { + return Err(LoadError::InvariantBroken { + path: self.path.clone(), + invariant: format!( + "the tensor {name} should have {} dimensions, not {dims}", + info.n_dims + ), + }); + } + let ctx = &self.context; - let mut tensor = match ne.len() { + let mut tensor = match dims { 1 => ctx.new_tensor_1d(info.element_type, ne[0]), 2 => ctx.new_tensor_2d(info.element_type, ne[0], ne[1]), 3 => ctx.new_tensor_3d(info.element_type, ne[0], ne[1], ne[2]), From e35f93b642b6e1fc9975c22de938b08451d77d97 Mon Sep 17 00:00:00 2001 From: Dan Forbes Date: Thu, 27 Apr 2023 09:55:34 -0700 Subject: [PATCH 20/35] `Model` -> `KnownModel`, `ErasedModel -> Model` --- bloom/src/lib.rs | 27 +++++++++++++++++++++++---- llama/src/lib.rs | 4 ++-- llm-base/src/inference_session.rs | 10 +++++----- llm-base/src/lib.rs | 2 +- llm-base/src/loader.rs | 6 +++--- llm-base/src/model.rs | 14 +++++++------- llm-base/src/snapshot.rs | 6 +++--- llm-cli/src/cli_args.rs | 6 +++--- llm/src/lib.rs | 4 ++-- 9 files changed, 49 insertions(+), 30 deletions(-) diff --git a/bloom/src/lib.rs b/bloom/src/lib.rs index 44b00845..399d365c 100644 --- a/bloom/src/lib.rs +++ b/bloom/src/lib.rs @@ -3,7 +3,7 @@ use std::path::Path; // use ggml_loader::{LoadError, LoadProgress}; use llm_base::{ util, EvaluateOutputRequest, FileType, InferenceParameters, InferenceSession, - InferenceSessionParameters, LoadError, LoadProgress, Mmap, Model, TokenId, Vocabulary, + InferenceSessionParameters, LoadError, LoadProgress, Mmap, KnownModel, TokenId, Vocabulary, }; /// The weights for the BLOOM model. 
All the mutable state is split into a @@ -40,7 +40,7 @@ impl Bloom { } } -impl Model for Bloom { +impl KnownModel for Bloom { type Hyperparameters = Hyperparameters; fn new( @@ -172,7 +172,7 @@ impl Model for Bloom { let mut input_layer = ctx0.op_get_rows(&self.tok_embeddings, &embd); - //TODO: word embeddings norm, + // word embeddings norm, { input_layer = ctx0.op_norm(&input_layer); input_layer = ctx0.op_mul(&ctx0.op_repeat(&self.norm, &input_layer), &input_layer); @@ -289,7 +289,6 @@ impl Model for Bloom { //alibi // KQ_scaled_alibi = KQ_scaled + alibi_bias - // TODO: op_alibi function let k_q_scaled_alibi = ctx0.op_alibi(&k_q_scaled, n_past, n_head); // KQ_masked = mask_past(KQ_scaled) @@ -300,6 +299,26 @@ impl Model for Bloom { let memv_elsize = session.memory_v.element_size(); + // let v_trans = ctx0.op_permute( + // &ctx0.op_reshape_3d( + // &ctx0.op_view_1d( + // &session.memory_v, + // (n_past + n) * n_embd, + // il * n_ctx * memv_elsize * n_embd, + // ), + // n_embd / n_head, + // n_head, + // n_past + n, + // ), + // 1, + // 2, + // 0, + // 3, + // ); + + // // GGML_ASSERT: ggml/ggml.c:4899: !ggml_is_transposed(a) + // let k_q_v = ctx0.op_mul_mat(&v_trans, &k_q_soft_max); + // split cached V into n_head heads let v = ctx0.op_view_3d( &session.memory_v, diff --git a/llama/src/lib.rs b/llama/src/lib.rs index 2a4ffe4b..91c69f89 100644 --- a/llama/src/lib.rs +++ b/llama/src/lib.rs @@ -2,7 +2,7 @@ use std::{error::Error, path::Path}; use llm_base::{ util, EvaluateOutputRequest, FileType, InferenceParameters, InferenceSession, - InferenceSessionParameters, LoadError, LoadProgress, Mmap, Model, TensorLoader, + InferenceSessionParameters, LoadError, LoadProgress, Mmap, KnownModel, TensorLoader, }; #[cfg(feature = "convert")] pub mod convert; @@ -54,7 +54,7 @@ impl Llama { llm_base::load(path, prefer_mmap, n_context_tokens, load_progress_callback) } } -impl Model for Llama { +impl KnownModel for Llama { type Hyperparameters = Hyperparameters; fn new( diff --git a/llm-base/src/inference_session.rs b/llm-base/src/inference_session.rs index 22ae2a71..e161c8e8 100644 --- a/llm-base/src/inference_session.rs +++ b/llm-base/src/inference_session.rs @@ -5,7 +5,7 @@ use rand::{distributions::WeightedIndex, prelude::Distribution}; use thiserror::Error; use crate::{ - mulf, ErasedModel, EvaluateOutputRequest, InferenceError, InferenceParameters, TokenId, + mulf, Model, EvaluateOutputRequest, InferenceError, InferenceParameters, TokenId, TokenUtf8Buffer, EOT_TOKEN_ID, }; @@ -65,7 +65,7 @@ impl InferenceSession { /// Feed a prompt to the model for this session. pub fn feed_prompt( &mut self, - model: &dyn ErasedModel, + model: &dyn Model, params: &InferenceParameters, prompt: &str, mut callback: impl FnMut(&[u8]) -> Result<(), E>, @@ -103,7 +103,7 @@ impl InferenceSession { /// Infer the next token for this session. pub fn infer_next_token<'v>( &mut self, - model: &'v dyn ErasedModel, + model: &'v dyn Model, params: &InferenceParameters, rng: &mut impl rand::Rng, ) -> Result<&'v [u8], InferenceError> { @@ -140,7 +140,7 @@ impl InferenceSession { /// If `params.play_back_previous_tokens` is specified, this will "play back" all existing tokens in the session. pub fn inference_with_prompt( &mut self, - model: &dyn ErasedModel, + model: &dyn Model, params: &InferenceParameters, prompt: &str, maximum_token_count: Option, @@ -322,7 +322,7 @@ impl InferenceSession { /// Creates an [InferenceSession] from a snapshot. 
pub fn from_snapshot( snapshot: InferenceSnapshot, - model: &dyn ErasedModel, + model: &dyn Model, ) -> Result { let mut session = model.start_session(snapshot.session_params); diff --git a/llm-base/src/lib.rs b/llm-base/src/lib.rs index db714bd1..846a7ae7 100644 --- a/llm-base/src/lib.rs +++ b/llm-base/src/lib.rs @@ -22,7 +22,7 @@ pub use inference_session::{ }; pub use loader::{load, ContainerType, FileType, LoadError, LoadProgress, Loader, TensorLoader}; pub use memmap2::Mmap; -pub use model::{ErasedModel, Hyperparameters, Model}; +pub use model::{Model, Hyperparameters, KnownModel}; pub use util::TokenUtf8Buffer; pub use vocabulary::{TokenBias, TokenId, Vocabulary}; diff --git a/llm-base/src/loader.rs b/llm-base/src/loader.rs index eb1bc84f..e5412305 100644 --- a/llm-base/src/loader.rs +++ b/llm-base/src/loader.rs @@ -8,7 +8,7 @@ use std::{ use crate::{ util::{self, FindAllModelFilesError}, - Hyperparameters, Model, TokenId, Vocabulary, + Hyperparameters, KnownModel, TokenId, Vocabulary, }; pub use ggml_format::ContainerType; use ggml_format::{LoadError as FormatLoadError, PartialHyperparameters, TensorInfo}; @@ -282,7 +282,7 @@ pub trait TensorLoader { } /// Load an arbitrary GGML model. -pub fn load( +pub fn load( path: impl AsRef, prefer_mmap: bool, n_context_tokens: usize, @@ -428,7 +428,7 @@ pub fn load( loaded_tensors: Default::default(), }; - let model = Model::new(hyperparameters, n_context_tokens, vocabulary, tl)?; + let model = KnownModel::new(hyperparameters, n_context_tokens, vocabulary, tl)?; (load_progress_callback)(LoadProgress::PartLoaded { file: &path, diff --git a/llm-base/src/model.rs b/llm-base/src/model.rs index 460982a7..ca5d6ccc 100644 --- a/llm-base/src/model.rs +++ b/llm-base/src/model.rs @@ -6,7 +6,7 @@ use crate::{ }; /// A large language model. -pub trait Model { +pub trait KnownModel { /// Hyperparameters for the model type Hyperparameters: Hyperparameters; @@ -46,7 +46,7 @@ pub trait Model { /// A type-erased model to allow for interacting with a model without knowing /// its hyperparameters. -pub trait ErasedModel { +pub trait Model { /// Starts a new `InferenceSession` for this model. 
fn start_session(&self, params: InferenceSessionParameters) -> InferenceSession; @@ -70,9 +70,9 @@ pub trait ErasedModel { /// Model context size fn n_ctx(&self) -> usize; } -impl> ErasedModel for M { +impl> Model for M { fn start_session(&self, params: InferenceSessionParameters) -> InferenceSession { - Model::start_session(self, params) + KnownModel::start_session(self, params) } fn evaluate( @@ -82,15 +82,15 @@ impl> ErasedModel for M { input_tokens: &[TokenId], output_request: &mut EvaluateOutputRequest, ) { - Model::evaluate(self, session, params, input_tokens, output_request) + KnownModel::evaluate(self, session, params, input_tokens, output_request) } fn vocabulary(&self) -> &Vocabulary { - Model::vocabulary(self) + KnownModel::vocabulary(self) } fn n_ctx(&self) -> usize { - Model::n_ctx(self) + KnownModel::n_ctx(self) } } diff --git a/llm-base/src/snapshot.rs b/llm-base/src/snapshot.rs index 1494258f..660e244b 100644 --- a/llm-base/src/snapshot.rs +++ b/llm-base/src/snapshot.rs @@ -5,7 +5,7 @@ use std::{ path::Path, }; -use crate::{ErasedModel, InferenceSession, InferenceSessionParameters}; +use crate::{Model, InferenceSession, InferenceSessionParameters}; use zstd::{ stream::{read::Decoder, write::Encoder}, @@ -16,12 +16,12 @@ const SNAPSHOT_COMPRESSION_LEVEL: CompressionLevel = 1; /// Read or create a session pub fn read_or_create_session( - model: &dyn ErasedModel, + model: &dyn Model, persist_session: Option<&Path>, load_session: Option<&Path>, inference_session_params: InferenceSessionParameters, ) -> (InferenceSession, bool) { - fn load(model: &dyn ErasedModel, path: &Path) -> InferenceSession { + fn load(model: &dyn Model, path: &Path) -> InferenceSession { let file = unwrap_or_exit(File::open(path), || format!("Could not open file {path:?}")); let decoder = unwrap_or_exit(Decoder::new(BufReader::new(file)), || { format!("Could not create decoder for {path:?}") diff --git a/llm-cli/src/cli_args.rs b/llm-cli/src/cli_args.rs index efc66fa6..6f8c6791 100644 --- a/llm-cli/src/cli_args.rs +++ b/llm-cli/src/cli_args.rs @@ -6,7 +6,7 @@ use std::{ use clap::{Parser, ValueEnum}; use color_eyre::eyre::{Result, WrapErr}; use llm::{ - ElementType, ErasedModel, InferenceParameters, InferenceSessionParameters, LoadProgress, + ElementType, Model, InferenceParameters, InferenceSessionParameters, LoadProgress, ModelKVMemoryType, TokenBias, EOT_TOKEN_ID, }; use rand::SeedableRng; @@ -284,7 +284,7 @@ pub enum ModelArchitecture { Bloom, } impl ModelLoad { - pub fn load(&self) -> Result> { + pub fn load(&self) -> Result> { let now = std::time::Instant::now(); let prefer_mmap = !self.no_mmap; @@ -355,7 +355,7 @@ impl ModelLoad { prefer_mmap: bool, n_context_tokens: usize, load_progress_callback: impl FnMut(LoadProgress<'_>), - ) -> Result> { + ) -> Result> { Ok(match self.model_architecture { ModelArchitecture::Llama => Box::new(llm::load::( path, diff --git a/llm/src/lib.rs b/llm/src/lib.rs index 1d89390b..ebac4ed5 100644 --- a/llm/src/lib.rs +++ b/llm/src/lib.rs @@ -1,7 +1,7 @@ pub use llm_base::{ - load, snapshot, ElementType, ErasedModel, FileType, InferenceError, InferenceParameters, + load, snapshot, ElementType, Model, FileType, InferenceError, InferenceParameters, InferenceSession, InferenceSessionParameters, InferenceSnapshot, LoadError, LoadProgress, - Model, ModelKVMemoryType, SnapshotError, TokenBias, TokenId, TokenUtf8Buffer, Vocabulary, + KnownModel, ModelKVMemoryType, SnapshotError, TokenBias, TokenId, TokenUtf8Buffer, Vocabulary, EOT_TOKEN_ID, }; From 
0aea8f7c71b1a71f53ef8d1ceff7b87eb983421d Mon Sep 17 00:00:00 2001 From: Dan Forbes Date: Thu, 27 Apr 2023 11:54:02 -0700 Subject: [PATCH 21/35] Refactor ggml stuff into a single crate --- .gitmodules | 3 + Cargo.lock | 36 +- Cargo.toml | 7 +- bloom/Cargo.toml | 2 +- bloom/src/lib.rs | 52 +- generate-ggml-bindings/Cargo.toml | 9 - generate-ggml-bindings/src/main.rs | 34 - ggml-format/Cargo.toml | 13 - ggml-format/src/lib.rs | 45 - ggml-rs/Cargo.toml | 14 + {ggml-sys => ggml-rs}/build.rs | 25 +- ggml-rs/ggml | 1 + ggml-rs/src/context.rs | 408 + ggml-rs/src/lib.rs | 268 + {ggml-format => ggml-rs}/src/loader.rs | 21 +- {ggml-format => ggml-rs}/src/saver.rs | 4 +- ggml-rs/src/tensor.rs | 130 + {ggml-format => ggml-rs}/src/tests.rs | 30 +- {ggml-format => ggml-rs}/src/util.rs | 0 ggml-sys/Cargo.toml | 10 - ggml-sys/ggml/.gitattributes | 2 - ggml-sys/ggml/CREDITS.txt | 8 - ggml-sys/ggml/ggml.c | 12527 ----------------------- ggml-sys/ggml/ggml.h | 875 -- ggml-sys/src/lib.rs | 1564 --- ggml/Cargo.toml | 14 - ggml/src/lib.rs | 758 -- llama/Cargo.toml | 4 +- llama/src/lib.rs | 42 +- llama/src/old_loader.rs | 76 +- llama/src/quantize.rs | 39 +- llm-base/Cargo.toml | 3 +- llm-base/src/inference_session.rs | 30 +- llm-base/src/lib.rs | 6 +- llm-base/src/loader.rs | 29 +- llm-base/src/snapshot.rs | 2 +- llm-base/src/util.rs | 2 +- llm-cli/src/cli_args.rs | 2 +- llm/src/lib.rs | 6 +- 39 files changed, 1027 insertions(+), 16074 deletions(-) create mode 100644 .gitmodules delete mode 100644 generate-ggml-bindings/Cargo.toml delete mode 100644 generate-ggml-bindings/src/main.rs delete mode 100644 ggml-format/Cargo.toml delete mode 100644 ggml-format/src/lib.rs create mode 100644 ggml-rs/Cargo.toml rename {ggml-sys => ggml-rs}/build.rs (82%) create mode 160000 ggml-rs/ggml create mode 100644 ggml-rs/src/context.rs create mode 100644 ggml-rs/src/lib.rs rename {ggml-format => ggml-rs}/src/loader.rs (92%) rename {ggml-format => ggml-rs}/src/saver.rs (97%) create mode 100644 ggml-rs/src/tensor.rs rename {ggml-format => ggml-rs}/src/tests.rs (84%) rename {ggml-format => ggml-rs}/src/util.rs (100%) delete mode 100644 ggml-sys/Cargo.toml delete mode 100644 ggml-sys/ggml/.gitattributes delete mode 100644 ggml-sys/ggml/CREDITS.txt delete mode 100644 ggml-sys/ggml/ggml.c delete mode 100644 ggml-sys/ggml/ggml.h delete mode 100644 ggml-sys/src/lib.rs delete mode 100644 ggml/Cargo.toml delete mode 100644 ggml/src/lib.rs diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..5f5d5012 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "ggml-rs/ggml"] + path = ggml-rs/ggml + url = git@github.com:ggerganov/ggml.git diff --git a/Cargo.lock b/Cargo.lock index 9c1dce3a..f3aa78f9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -135,7 +135,7 @@ name = "bloom" version = "0.1.0" dependencies = [ "bytemuck", - "ggml", + "ggml-rs", "llm-base", "rand", ] @@ -430,13 +430,6 @@ dependencies = [ "windows-sys 0.48.0", ] -[[package]] -name = "generate-ggml-bindings" -version = "0.1.0" -dependencies = [ - "bindgen", -] - [[package]] name = "getrandom" version = "0.2.9" @@ -449,31 +442,15 @@ dependencies = [ ] [[package]] -name = "ggml" -version = "0.1.0" -dependencies = [ - "ggml-sys", - "llm-base", - "log", - "thiserror", -] - -[[package]] -name = "ggml-format" +name = "ggml-rs" version = "0.1.0" dependencies = [ - "ggml", + "bindgen", + "cc", "rand", "thiserror", ] -[[package]] -name = "ggml-sys" -version = "0.1.0" -dependencies = [ - "cc", -] - [[package]] name = "gimli" version = "0.27.2" @@ -624,7 +601,7 @@ 
name = "llama" version = "0.1.0" dependencies = [ "bytemuck", - "ggml-format", + "ggml-rs", "half", "llm-base", "partial_sort", @@ -652,8 +629,7 @@ version = "0.1.0" dependencies = [ "bincode", "bytemuck", - "ggml", - "ggml-format", + "ggml-rs", "log", "memmap2", "partial_sort", diff --git a/Cargo.toml b/Cargo.toml index 98b57a1b..fc60bb3c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,17 +1,12 @@ [workspace] members = [ # Crates - "ggml-sys", - "ggml-format", - "ggml", + "ggml-rs", "llm-base", "llama", "bloom", "llm", "llm-cli", - - # Tools - "generate-ggml-bindings" ] resolver = "2" diff --git a/bloom/Cargo.toml b/bloom/Cargo.toml index 2dd9b0a9..884ab244 100644 --- a/bloom/Cargo.toml +++ b/bloom/Cargo.toml @@ -6,7 +6,7 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -ggml = { path = "../ggml" } +ggml-rs = { path = "../ggml-rs" } llm-base = { path = "../llm-base" } bytemuck = { workspace = true } diff --git a/bloom/src/lib.rs b/bloom/src/lib.rs index 399d365c..a0b58a1f 100644 --- a/bloom/src/lib.rs +++ b/bloom/src/lib.rs @@ -3,7 +3,7 @@ use std::path::Path; // use ggml_loader::{LoadError, LoadProgress}; use llm_base::{ util, EvaluateOutputRequest, FileType, InferenceParameters, InferenceSession, - InferenceSessionParameters, LoadError, LoadProgress, Mmap, KnownModel, TokenId, Vocabulary, + InferenceSessionParameters, KnownModel, LoadError, LoadProgress, Mmap, TokenId, Vocabulary, }; /// The weights for the BLOOM model. All the mutable state is split into a @@ -13,16 +13,16 @@ pub struct Bloom { n_context_tokens: usize, vocabulary: Vocabulary, - tok_embeddings: ggml::Tensor, - norm: ggml::Tensor, - norm_b: ggml::Tensor, - output_norm: ggml::Tensor, - output_norm_b: ggml::Tensor, - output: ggml::Tensor, + tok_embeddings: ggml_rs::Tensor, + norm: ggml_rs::Tensor, + norm_b: ggml_rs::Tensor, + output_norm: ggml_rs::Tensor, + output_norm_b: ggml_rs::Tensor, + output: ggml_rs::Tensor, layers: Vec, // Must be kept alive for the model - _context: ggml::Context, + _context: ggml_rs::context::Context, _mmap: Option, } @@ -162,12 +162,12 @@ impl KnownModel for Bloom { // add 10% to account for ggml object overhead buf_size = (1.1f64 * session.mem_per_token as f64 * n as f64) as usize; }; - let ctx0 = ggml::Context::init(buf_size, true); + let ctx0 = ggml_rs::context::Context::init(buf_size, true); // TODO: REMAKE THIS AFTER CHECKING GGML GRAPH - let mut gf = ggml::ComputationGraph::new(n_threads); + let mut gf = ggml_rs::ComputationGraph::new(n_threads); - let mut embd = ctx0.new_tensor_1d(ggml::Type::I32, n); + let mut embd = ctx0.new_tensor_1d(ggml_rs::Type::I32, n); unsafe { embd.write_data(bytemuck::cast_slice(input_tokens)) }; let mut input_layer = ctx0.op_get_rows(&self.tok_embeddings, &embd); @@ -181,7 +181,7 @@ impl KnownModel for Bloom { for il in 0..n_layer { let input_self_attention = input_layer.share(); - let mut current: ggml::Tensor; + let mut current: ggml_rs::Tensor; // norm { @@ -252,7 +252,7 @@ impl KnownModel for Bloom { let q = ctx0.op_permute( &ctx0.op_cpy( &q_current, - &ctx0.new_tensor_3d(ggml::Type::F32, n_embd / n_head, n_head, n), + &ctx0.new_tensor_3d(ggml_rs::Type::F32, n_embd / n_head, n_head, n), ), 0, 2, @@ -336,7 +336,7 @@ impl KnownModel for Bloom { // cur = KQV_merged.contiguous().view(n_embd, N) current = ctx0.op_cpy( &k_q_v_merged, - &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, n), + &ctx0.new_tensor_2d(ggml_rs::Type::F32, n_embd, n), ); // projection @@ -499,18 +499,18 @@ impl 
llm_base::Hyperparameters for Hyperparameters { } struct Layer { - pub attention_norm: ggml::Tensor, - pub attention_norm_b: ggml::Tensor, - pub wo: ggml::Tensor, - pub wo_b: ggml::Tensor, - pub query_key_value: ggml::Tensor, - pub query_key_value_b: ggml::Tensor, + pub attention_norm: ggml_rs::Tensor, + pub attention_norm_b: ggml_rs::Tensor, + pub wo: ggml_rs::Tensor, + pub wo_b: ggml_rs::Tensor, + pub query_key_value: ggml_rs::Tensor, + pub query_key_value_b: ggml_rs::Tensor, // normalization - pub ffn_norm: ggml::Tensor, - pub ffn_norm_b: ggml::Tensor, + pub ffn_norm: ggml_rs::Tensor, + pub ffn_norm_b: ggml_rs::Tensor, // ff - pub w1: ggml::Tensor, - pub w1_b: ggml::Tensor, - pub w2: ggml::Tensor, - pub w2_b: ggml::Tensor, + pub w1: ggml_rs::Tensor, + pub w1_b: ggml_rs::Tensor, + pub w2: ggml_rs::Tensor, + pub w2_b: ggml_rs::Tensor, } diff --git a/generate-ggml-bindings/Cargo.toml b/generate-ggml-bindings/Cargo.toml deleted file mode 100644 index fabb70e0..00000000 --- a/generate-ggml-bindings/Cargo.toml +++ /dev/null @@ -1,9 +0,0 @@ -[package] -name = "generate-ggml-bindings" -version = { workspace = true } -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -bindgen = "0.64.0" diff --git a/generate-ggml-bindings/src/main.rs b/generate-ggml-bindings/src/main.rs deleted file mode 100644 index 9f8f7324..00000000 --- a/generate-ggml-bindings/src/main.rs +++ /dev/null @@ -1,34 +0,0 @@ -use std::{env, path::PathBuf}; - -fn main() { - // Parse arguments - let args: Vec = env::args().collect(); - - if args.len() != 2 { - println!("Usage: {} ", args[0]); - return; - } - - let ggml_crate_path = &args[1]; - - let header_path = format!("{ggml_crate_path}/ggml/ggml.h"); - - let bindings = bindgen::Builder::default() - .header(&header_path) - // Suppress some warnings - .raw_line("#![allow(non_upper_case_globals)]") - .raw_line("#![allow(non_camel_case_types)]") - .raw_line("#![allow(non_snake_case)]") - .raw_line("#![allow(unused)]") - // Do not generate code for ggml's includes (stdlib) - .allowlist_file(&header_path) - .generate() - .expect("Unable to generate bindings"); - - let out_path = PathBuf::from(ggml_crate_path).join("src").join("lib.rs"); - bindings - .write_to_file(out_path) - .expect("Couldn't write bindings"); - - println!("Successfully updated bindings in src/lib.rs"); -} diff --git a/ggml-format/Cargo.toml b/ggml-format/Cargo.toml deleted file mode 100644 index 8627e559..00000000 --- a/ggml-format/Cargo.toml +++ /dev/null @@ -1,13 +0,0 @@ -[package] -name = "ggml-format" -version = { workspace = true } -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -ggml = { path = "../ggml" } -thiserror = "1.0" - -[dev-dependencies] -rand = "0.8" diff --git a/ggml-format/src/lib.rs b/ggml-format/src/lib.rs deleted file mode 100644 index b26aa0f2..00000000 --- a/ggml-format/src/lib.rs +++ /dev/null @@ -1,45 +0,0 @@ -#![deny(missing_docs)] -//! A reader and writer for the `ggml` model format. -//! -//! The reader supports the GGML, GGMF and GGJT container formats, but -//! only single-part models. -//! -//! The writer isn't implemented yet. It will support the GGJT container -//! format only. - -/// Utilities for reading and writing. 
-pub mod util; - -mod loader; -mod saver; -#[cfg(test)] -mod tests; - -pub use loader::{ - data_size, load_model, LoadError, LoadHandler, PartialHyperparameters, TensorInfo, -}; -pub use saver::{save_model, SaveError, SaveHandler, TensorData}; - -/// The type of a tensor element. -pub type ElementType = ggml::Type; - -#[derive(Debug, PartialEq, Clone, Copy)] -/// The format of the file containing the model. -pub enum ContainerType { - /// `GGML`: legacy format, oldest ggml tensor file format - Ggml, - /// `GGMF`: also legacy format. Introduces versioning. Newer than GGML, older than GGJT. - Ggmf, - /// `GGJT`: mmap-able format. - Ggjt, -} -impl ContainerType { - /// Does this container type support mmap? - pub fn support_mmap(&self) -> bool { - match self { - ContainerType::Ggml => false, - ContainerType::Ggmf => false, - ContainerType::Ggjt => true, - } - } -} diff --git a/ggml-rs/Cargo.toml b/ggml-rs/Cargo.toml new file mode 100644 index 00000000..e2e59a65 --- /dev/null +++ b/ggml-rs/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "ggml-rs" +version = { workspace = true } +edition = "2021" + +[build-dependencies] +bindgen = "0.64.0" +cc = "^1.0" + +[dependencies] +thiserror = "1.0" + +[dev-dependencies] +rand = "0.8" diff --git a/ggml-sys/build.rs b/ggml-rs/build.rs similarity index 82% rename from ggml-sys/build.rs rename to ggml-rs/build.rs index a4b89b70..02b38258 100644 --- a/ggml-sys/build.rs +++ b/ggml-rs/build.rs @@ -1,17 +1,18 @@ -use std::env; +use std::{env, path::PathBuf}; +// By default, this crate will attempt to compile ggml with the features of your host system if +// the host and target are the same. If they are not, it will turn off auto-feature-detection, +// and you will need to manually specify target features through target-features. fn main() { - // By default, this crate will attempt to compile ggml with the features of your host system if - // the host and target are the same. If they are not, it will turn off auto-feature-detection, - // and you will need to manually specify target features through target-features. - println!("cargo:rerun-if-changed=ggml"); - let ggml_src = ["ggml/ggml.c"]; + let ggml_src = ["ggml/src/ggml.c"]; let mut builder = cc::Build::new(); - let build = builder.files(ggml_src.iter()).include("include"); + let build = builder + .files(ggml_src.iter()) + .include("./ggml/include/ggml"); // This is a very basic heuristic for applying compile flags. // Feel free to update this to fit your operating system. 
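// Rough sketch of the feature detection referred to above: Cargo exposes the
// active target features to build scripts through an environment variable, so
// a cross-compiling user can opt in explicitly, e.g. with
// RUSTFLAGS="-C target-feature=+avx2,+fma". The helper this build script
// actually defines may differ in detail.
fn read_target_features() -> std::collections::HashSet<String> {
    std::env::var("CARGO_CFG_TARGET_FEATURE")
        .unwrap_or_default()
        .split(',')
        .filter(|f| !f.is_empty())
        .map(ToString::to_string)
        .collect()
}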
@@ -88,6 +89,16 @@ fn main() { } build.warnings(false); build.compile("ggml"); + + let header_path = "./ggml/include/ggml/ggml.h"; + bindgen::Builder::default() + .header(String::from(header_path)) + .allowlist_file(header_path) + .parse_callbacks(Box::new(bindgen::CargoCallbacks)) + .generate() + .expect("Unable to generate bindings.") + .write_to_file(PathBuf::from(env::var("OUT_DIR").unwrap()).join("bindings.rs")) + .expect("Unable to write generated bindings to file."); } fn get_supported_target_features() -> std::collections::HashSet { diff --git a/ggml-rs/ggml b/ggml-rs/ggml new file mode 160000 index 00000000..8cc06712 --- /dev/null +++ b/ggml-rs/ggml @@ -0,0 +1 @@ +Subproject commit 8cc067122059864eb0fa97bf50d5dd71c5050b4b diff --git a/ggml-rs/src/context.rs b/ggml-rs/src/context.rs new file mode 100644 index 00000000..9aff0e38 --- /dev/null +++ b/ggml-rs/src/context.rs @@ -0,0 +1,408 @@ +use std::{ + os::raw::{c_int, c_void}, + ptr::NonNull, + sync::{Arc, Weak}, +}; + +use crate::{usize_to_i32, usize_to_i64, Buffer, ComputationGraph, Tensor, Type}; + +/// Acts as a RAII-guard over a `crate::ggml_context`, allocating via +/// `ggml_init` and dropping via `ggml_free`. +pub struct Context { + /// An `Arc` is used to model the relation between the context and the + /// allocated tensors. Tensors are owned by the object, so a [`Tensor`] + /// contains a `Weak` reference underneath and doesn't let you do anything + /// with it if the underlying context has been deallocated. + ptr: Arc>, +} + +impl Context { + /// Creates a new [Context] with the specified `mem_size` as a working area. + pub fn init(mem_size: usize, alloc: bool) -> Self { + let raw = unsafe { + crate::ggml_init(crate::ggml_init_params { + mem_size, + // Null here means we want ggml to own this memory. We don't + // support passing an owned buffer from the Rust side. + mem_buffer: std::ptr::null_mut(), + no_alloc: !alloc, + }) + }; + Self { + ptr: Arc::new(NonNull::new(raw).expect("Should not be null")), + } + } + + /// Wraps a raw tensor with a weak pointer to the context. + fn new_tensor_raw(&self, raw: *mut crate::ggml_tensor) -> Tensor { + Tensor { + ptr: NonNull::new(raw).expect("Should not be null"), + ctx: Arc::downgrade(&self.ptr), + } + } + + /// Creates a new 1D tensor. + pub fn new_tensor_1d(&self, typ: Type, ne0: usize) -> Tensor { + let raw = + unsafe { crate::ggml_new_tensor_1d(self.ptr.as_ptr(), typ.into(), usize_to_i64(ne0)) }; + self.new_tensor_raw(raw) + } + + /// Creates a new 2D tensor. + pub fn new_tensor_2d(&self, typ: Type, ne0: usize, ne1: usize) -> Tensor { + let raw = unsafe { + crate::ggml_new_tensor_2d( + self.ptr.as_ptr(), + typ.into(), + usize_to_i64(ne0), + usize_to_i64(ne1), + ) + }; + self.new_tensor_raw(raw) + } + + /// Creates a new 3D tensor. + pub fn new_tensor_3d(&self, typ: Type, ne0: usize, ne1: usize, ne2: usize) -> Tensor { + let raw = unsafe { + crate::ggml_new_tensor_3d( + self.ptr.as_ptr(), + typ.into(), + usize_to_i64(ne0), + usize_to_i64(ne1), + usize_to_i64(ne2), + ) + }; + self.new_tensor_raw(raw) + } + + /// Creates a new 1D tensor with the specified value. + pub fn new_f32(&self, x: f32) -> Tensor { + let raw = unsafe { crate::ggml_new_f32(self.ptr.as_ptr(), x) }; + self.new_tensor_raw(raw) + } + + /// Unknown, aside from the obvious. It's transposing something! + pub fn op_transpose(&self, a: &Tensor) -> Tensor { + let tensor = unsafe { crate::ggml_transpose(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + self.new_tensor_raw(tensor) + } + + /// Unknown. 
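// (For reference: `ggml_get_rows` gathers the rows of `a` selected by the
//  integer indices in `b`, i.e. an embedding-table lookup; the models above
//  use it to fetch token embeddings.)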
+ pub fn op_get_rows(&self, a: &Tensor, b: &Tensor) -> Tensor { + let tensor = + unsafe { crate::ggml_get_rows(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + self.new_tensor_raw(tensor) + } + + /// Creates a new tensor with the values of `a`, but normalized. + pub fn op_norm(&self, a: &Tensor) -> Tensor { + let tensor = unsafe { crate::ggml_norm(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + self.new_tensor_raw(tensor) + } + + /// Creates a new tensor with the values of `a`, but normalized using RMSNorm. + pub fn op_rms_norm(&self, a: &Tensor) -> Tensor { + let tensor = unsafe { crate::ggml_rms_norm(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + self.new_tensor_raw(tensor) + } + + /// Creates a new tensor with the multiplication of `a` and `b`. + pub fn op_mul(&self, a: &Tensor, b: &Tensor) -> Tensor { + let tensor = unsafe { crate::ggml_mul(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + self.new_tensor_raw(tensor) + } + + /// Unknown. + pub fn op_repeat(&self, a: &Tensor, b: &Tensor) -> Tensor { + let tensor = + unsafe { crate::ggml_repeat(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + self.new_tensor_raw(tensor) + } + + /// Creates a new tensor with the multiplication of `a` and `b` as if they were matrices. + /// + /// `a`: m rows, n columns + /// + /// `b`: p rows, n columns (i.e. we transpose it internally) + /// + /// Result is m columns, p rows + pub fn op_mul_mat(&self, a: &Tensor, b: &Tensor) -> Tensor { + let tensor = + unsafe { crate::ggml_mul_mat(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + self.new_tensor_raw(tensor) + } + + /// Creates a new tensor with the addition of `a` and `b`. + pub fn op_add(&self, a: &Tensor, b: &Tensor) -> Tensor { + let tensor = unsafe { crate::ggml_add(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + self.new_tensor_raw(tensor) + } + + /// Creates a new tensor with the [SiLU](https://pytorch.org/docs/stable/generated/torch.nn.SiLU.html) activation function applied to `a`. + pub fn op_silu(&self, a: &Tensor) -> Tensor { + let tensor = unsafe { crate::ggml_silu(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + self.new_tensor_raw(tensor) + } + + /// In-place, scales `a` by the 1D tensor `b`. + pub fn op_scale(&self, a: &Tensor, b: &Tensor) -> Tensor { + let tensor = + unsafe { crate::ggml_scale(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + self.new_tensor_raw(tensor) + } + + /// In-place, sets the elements above the diagonal to -INF. + pub fn op_diag_mask_inf(&self, a: &Tensor, n_past: usize) -> Tensor { + let tensor = unsafe { + crate::ggml_diag_mask_inf(self.ptr.as_ptr(), a.ptr.as_ptr(), usize_to_i32(n_past)) + }; + self.new_tensor_raw(tensor) + } + + /// In-place, applies the [Softmax function](https://en.wikipedia.org/wiki/Softmax_function) to `a`. + pub fn op_soft_max(&self, a: &Tensor) -> Tensor { + let tensor = unsafe { crate::ggml_soft_max(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + self.new_tensor_raw(tensor) + } + + /// Creates a new tensor with result of mapping `fun` with `a`. + /// + /// `cnt` is the number of `f32` elements to be mapped. + /// `src` is source for elements to be mapped. + /// `dst` is the destination for mapped elements. + /// + /// # Safety + /// + /// This is marked unsafe since we're passing pointers into C code, and not + /// only vanilla pointers but a pointer to a function. For obvious reasons, it's + /// important not to do anything crazy like mutate any of these values concurrently. + /// + /// Don't make assumptions about how/when the function will be called. 
It may be called + /// on a row, it may be called on a whole tensor. It may be called concurrently or not. + /// Once you give that function pointer to C land, all bets are off. + pub unsafe fn op_map_unary( + &self, + a: &Tensor, + fun: unsafe extern "C" fn(cnt: c_int, dst: *mut f32, src: *const f32), + ) -> Tensor { + let tensor = + unsafe { crate::ggml_map_unary_f32(self.ptr.as_ptr(), a.ptr.as_ptr(), Some(fun)) }; + self.new_tensor_raw(tensor) + } + + /// Creates a new tensor with result of mapping `fun` with `a` and `b`. + /// + /// `cnt` is the number of `f32` elements to be mapped. + /// `src0`, `src1` are the sources of elements to be mapped. + /// `dst` is the destination for mapped elements. + /// + /// # Safety + /// + /// This is marked unsafe since we're passing pointers into C code, and not + /// only vanilla pointers but a pointer to a function. For obvious reasons, it's + /// important not to do anything crazy like mutate any of these values concurrently. + /// + /// Don't make assumptions about how/when the function will be called. It may be called + /// on a row, it may be called on a whole tensor. It may be called concurrently or not. + /// Once you give that function pointer to C land, all bets are off. + pub unsafe fn op_map_binary( + &self, + a: &Tensor, + b: &Tensor, + fun: unsafe extern "C" fn(cnt: c_int, dst: *mut f32, src0: *const f32, src1: *const f32), + ) -> Tensor { + let tensor = unsafe { + crate::ggml_map_binary_f32(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr(), Some(fun)) + }; + self.new_tensor_raw(tensor) + } + + /// Creates a 1D view over `a`. + pub fn op_view_1d(&self, a: &Tensor, ne0: usize, offset: usize) -> Tensor { + let tensor = unsafe { + crate::ggml_view_1d(self.ptr.as_ptr(), a.ptr.as_ptr(), usize_to_i64(ne0), offset) + }; + self.new_tensor_raw(tensor) + } + + /// Creates a 2D view over `a`. + pub fn op_view_2d(&self, a: &Tensor, ne: (usize, usize), nb1: usize, offset: usize) -> Tensor { + let (ne0, ne1) = ne; + let tensor = unsafe { + crate::ggml_view_2d( + self.ptr.as_ptr(), + a.ptr.as_ptr(), + usize_to_i64(ne0), + usize_to_i64(ne1), + nb1, + offset, + ) + }; + self.new_tensor_raw(tensor) + } + + /// Creates a 3d view over `a`. + pub fn op_view_3d( + &self, + a: &Tensor, + ne: (usize, usize, usize), + nb: (usize, usize), + offset: usize, + ) -> Tensor { + let (ne0, ne1, ne2) = ne; + let (nb1, nb2) = nb; + let tensor = unsafe { + crate::ggml_view_3d( + self.ptr.as_ptr(), + a.ptr.as_ptr(), + usize_to_i64(ne0), + usize_to_i64(ne1), + usize_to_i64(ne2), + nb1, + nb2, + offset, + ) + }; + self.new_tensor_raw(tensor) + } + + /// Copies `a` to `b` and returns `b`. + pub fn op_cpy(&self, a: &Tensor, b: &Tensor) -> Tensor { + let tensor = unsafe { crate::ggml_cpy(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + self.new_tensor_raw(tensor) + } + + /// Creates a new tensor with the axes of `a` permuted as described by the parameters. 
+ pub fn op_permute( + &self, + a: &Tensor, + axis0: usize, + axis1: usize, + axis2: usize, + axis3: usize, + ) -> Tensor { + let tensor = unsafe { + crate::ggml_permute( + self.ptr.as_ptr(), + a.ptr.as_ptr(), + usize_to_i32(axis0), + usize_to_i32(axis1), + usize_to_i32(axis2), + usize_to_i32(axis3), + ) + }; + self.new_tensor_raw(tensor) + } + + /// In-place; reshapes `a` in accordance with the dimensions of `b` + pub fn op_reshape(&self, a: &Tensor, b: &Tensor) -> Tensor { + let tensor = + unsafe { crate::ggml_reshape(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + self.new_tensor_raw(tensor) + } + + /// In-place; reshapes `a` in accordance with the specified dimensions. + pub fn op_reshape_2d(&self, a: &Tensor, ne0: usize, ne1: usize) -> Tensor { + let tensor = unsafe { + crate::ggml_reshape_2d( + self.ptr.as_ptr(), + a.ptr.as_ptr(), + usize_to_i64(ne0), + usize_to_i64(ne1), + ) + }; + self.new_tensor_raw(tensor) + } + + /// In-place; reshapes `a` in accordance with the specified dimensions. + pub fn op_reshape_3d(&self, a: &Tensor, ne0: usize, ne1: usize, ne2: usize) -> Tensor { + let tensor = unsafe { + crate::ggml_reshape_3d( + self.ptr.as_ptr(), + a.ptr.as_ptr(), + usize_to_i64(ne0), + usize_to_i64(ne1), + usize_to_i64(ne2), + ) + }; + self.new_tensor_raw(tensor) + } + + /// In-place; applies ROtary Positional Encoding. + pub fn op_rope(&self, a: &Tensor, npast: usize, ndims: usize, mode: i32) -> Tensor { + let tensor = unsafe { + crate::ggml_rope( + self.ptr.as_ptr(), + a.ptr.as_ptr(), + usize_to_i32(npast), + usize_to_i32(ndims), + mode, + ) + }; + self.new_tensor_raw(tensor) + } + + /// Computes the specified graph. Must be run in order to evaluate the graph. + pub fn graph_compute(&self, graph: &mut ComputationGraph) { + unsafe { + crate::ggml_graph_compute(self.ptr.as_ptr(), &mut graph.inner); + } + } + + /// Retrieves the memory used by this [Context]. + pub fn used_mem(&self) -> usize { + unsafe { crate::ggml_used_mem(self.ptr.as_ptr()) } + } + + /// Sets the scratch buffer to be used by this [Context]. + /// + /// If `scratch_buffer` is `None`, the scratch buffer will be disabled. + pub fn use_scratch<'a>(&'a self, scratch_buffer: Option<&'a mut Buffer>) { + let (size, data) = if let Some(buffer) = scratch_buffer { + (buffer.data.len(), buffer.data.as_ptr() as *mut c_void) + } else { + (0, std::ptr::null_mut()) + }; + // SAFETY: this just passes (most likely uninitialized) memory buffer to the ggml C API + unsafe { + crate::ggml_set_scratch( + self.ptr.as_ptr(), + crate::ggml_scratch { + offs: 0, + size, + data, + }, + ); + } + } + + /// TODO: something something + pub fn op_alibi(&self, a: &Tensor, n_past: usize, n_head: usize) -> Tensor { + let tensor = unsafe { + crate::ggml_alibi( + self.ptr.as_ptr(), + a.ptr.as_ptr(), + usize_to_i32(n_past), + usize_to_i32(n_head), + ) + }; + + self.new_tensor_raw(tensor) + } + + /// Gaussian Error Linear Units + pub fn op_gelu(&self, a: &Tensor) -> Tensor { + let tensor = unsafe { crate::ggml_gelu(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + self.new_tensor_raw(tensor) + } +} + +impl Drop for Context { + fn drop(&mut self) { + // SAFETY: The only non-weak copy of ptr is no longer accessible after + // this drop call. 
+ unsafe { + crate::ggml_free(self.ptr.as_ptr()); + } + } +} diff --git a/ggml-rs/src/lib.rs b/ggml-rs/src/lib.rs new file mode 100644 index 00000000..5950de13 --- /dev/null +++ b/ggml-rs/src/lib.rs @@ -0,0 +1,268 @@ +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] +#![allow(unused)] + +//! `ggml-rs` is a semi-idiomatic wrapper for the `ggml` C library. +//! +//! It exposes a subset of operations (currently used to implement the [llama-rs](https://crates.io/crates/llama-rs) library). +//! Note that it does not expose a fully-idiomatic safe Rust interface; operations that could be potentially unsafe are marked as such. +//! +//! `ggml-rs` operates on a computational graph; no values will be computed until [Context::graph_compute] is executed. +//! All [Tensor]s are nodes in this computational graph, and values cannot be retrieved until computation is completed. + +use std::{ + os::raw::{c_int, c_void}, + ptr::NonNull, + sync::{Arc, Weak}, +}; + +include!(concat!(env!("OUT_DIR"), "/bindings.rs")); + +pub use tensor::Tensor; + +/// Utilities for reading and writing. +pub mod util; + +pub mod loader; + +pub mod saver; + +pub mod context; +mod tensor; + +#[cfg(test)] +mod tests; + +/// The type of a tensor element. +pub type ElementType = Type; + +#[derive(Debug, PartialEq, Clone, Copy)] +/// The format of the file containing the model. +pub enum ContainerType { + /// `GGML`: legacy format, oldest ggml tensor file format + Ggml, + /// `GGMF`: also legacy format. Introduces versioning. Newer than GGML, older than GGJT. + Ggmf, + /// `GGJT`: mmap-able format. + Ggjt, +} +impl ContainerType { + /// Does this container type support mmap? + pub fn support_mmap(&self) -> bool { + match self { + ContainerType::Ggml => false, + ContainerType::Ggmf => false, + ContainerType::Ggjt => true, + } + } +} + +/// Magic constant for `ggml` files (versioned, ggmf). +pub const FILE_MAGIC_GGMF: u32 = 0x67676d66; +/// Magic constant for `ggml` files (versioned, ggjt). +pub const FILE_MAGIC_GGJT: u32 = 0x67676a74; +/// Magic constant for `ggml` files (unversioned). +pub const FILE_MAGIC_UNVERSIONED: u32 = 0x67676d6c; + +/// The currently-supported format version for `ggml` files. +pub const FORMAT_VERSION: u32 = 1; + +/// The size of a `ggml` object. +pub const OBJECT_SIZE: usize = crate::GGML_OBJECT_SIZE; + +#[derive(Debug, Copy, Clone, PartialEq, Eq, Default)] +/// The type of a value in `ggml`. +pub enum Type { + /// Quantized 4-bit (type 0). + #[default] + Q4_0, + /// Quantized 4-bit (type 1); used by GPTQ. + Q4_1, + /// Quantized 4-bit (type 2). + Q4_2, + /// Quantized 4-bit (type 3). + Q4_3, + /// Quantized 8-bit (type 0). + Q8_0, + /// Integer 32-bit. + I32, + /// Float 16-bit. + F16, + /// Float 32-bit. 
+    F32,
+}
+impl From<Type> for crate::ggml_type {
+    fn from(t: Type) -> Self {
+        match t {
+            Type::Q4_0 => crate::ggml_type_GGML_TYPE_Q4_0,
+            Type::Q4_1 => crate::ggml_type_GGML_TYPE_Q4_1,
+            Type::Q4_2 => crate::ggml_type_GGML_TYPE_Q4_2,
+            Type::Q4_3 => crate::ggml_type_GGML_TYPE_Q4_3,
+            Type::Q8_0 => crate::ggml_type_GGML_TYPE_Q8_0,
+            Type::I32 => crate::ggml_type_GGML_TYPE_I32,
+            Type::F16 => crate::ggml_type_GGML_TYPE_F16,
+            Type::F32 => crate::ggml_type_GGML_TYPE_F32,
+        }
+    }
+}
+impl TryFrom<crate::ggml_type> for Type {
+    type Error = ();
+    fn try_from(t: crate::ggml_type) -> Result<Self, Self::Error> {
+        match t {
+            crate::ggml_type_GGML_TYPE_Q4_0 => Ok(Type::Q4_0),
+            crate::ggml_type_GGML_TYPE_Q4_1 => Ok(Type::Q4_1),
+            crate::ggml_type_GGML_TYPE_Q4_2 => Ok(Type::Q4_2),
+            crate::ggml_type_GGML_TYPE_Q4_3 => Ok(Type::Q4_3),
+            crate::ggml_type_GGML_TYPE_Q8_0 => Ok(Type::Q8_0),
+            crate::ggml_type_GGML_TYPE_I32 => Ok(Type::I32),
+            crate::ggml_type_GGML_TYPE_F16 => Ok(Type::F16),
+            crate::ggml_type_GGML_TYPE_F32 => Ok(Type::F32),
+            _ => Err(()),
+        }
+    }
+}
+impl std::fmt::Display for Type {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Type::Q4_0 => write!(f, "q4_0"),
+            Type::Q4_1 => write!(f, "q4_1"),
+            Type::Q4_2 => write!(f, "q4_2"),
+            Type::Q4_3 => write!(f, "q4_3"),
+            Type::Q8_0 => write!(f, "q8_0"),
+            Type::I32 => write!(f, "i32"),
+            Type::F16 => write!(f, "f16"),
+            Type::F32 => write!(f, "f32"),
+        }
+    }
+}
+
+/// A buffer of memory that can be used as a scratch buffer for a [Context].
+///
+/// See [Context::use_scratch].
+pub struct Buffer {
+    data: Box<[u8]>,
+}
+
+impl Buffer {
+    /// Creates a new buffer of the specified size.
+    pub fn new(size: usize) -> Self {
+        let mut data: Vec<u8> = Vec::with_capacity(size);
+
+        // SAFETY: The contents are intentionally uninitialized, as they will be passed to
+        // the ggml C API which will fill them with data.
+        #[allow(clippy::uninit_vec)]
+        unsafe {
+            data.set_len(size);
+        }
+
+        Buffer {
+            data: data.into_boxed_slice(),
+        }
+    }
+}
+
+/// A `ggml` computation graph. Keeps track of all state during computation.
+pub struct ComputationGraph {
+    inner: crate::ggml_cgraph,
+}
+
+impl ComputationGraph {
+    /// Create a new [ComputationGraph] with the specified `n_threads`.
+    pub fn new(n_threads: usize) -> Self {
+        Self {
+            inner: crate::ggml_cgraph {
+                n_threads: usize_to_i32(n_threads),
+                // SAFETY: This should be safe to zero. The original C++ impl
+                // just leaves it uninitialized
+                ..unsafe { std::mem::zeroed::<crate::ggml_cgraph>() }
+            },
+        }
+    }
+
+    /// Build this computational graph in the forward direction in preparation for computation.
+    pub fn build_forward_expand(&mut self, tensor: &Tensor) {
+        unsafe { crate::ggml_build_forward_expand(&mut self.inner, tensor.ptr.as_ptr()) }
+    }
+}
+
+/// The size of `t` as bytes.
+pub fn type_size(t: Type) -> usize {
+    unsafe { crate::ggml_type_size(t.into()) }
+}
+
+/// [type_size]/[blck_size] as float.
+pub fn type_sizef(x: Type) -> f64 {
+    (unsafe { crate::ggml_type_sizef(x.into()) }) as f64
+}
+
+/// The size of a block for `t`. Only relevant for quantized types.
+pub fn blck_size(t: Type) -> usize {
+    i32_to_usize(unsafe { crate::ggml_blck_size(t.into()) })
+}
+
+fn usize_to_i32(val: usize) -> i32 {
+    i32::try_from(val).unwrap()
+}
+
+fn usize_to_i64(val: usize) -> i64 {
+    i64::try_from(val).unwrap()
+}
+
+fn i32_to_usize(val: i32) -> usize {
+    usize::try_from(val).unwrap()
+}
+
+fn i64_to_usize(val: i64) -> usize {
+    usize::try_from(val).unwrap()
+}
+
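A short usage sketch for the quantization helpers defined just below; the row length and values are made up for illustration.

```rust
// Illustrative only: quantize two 32-element rows of f32 data to q4_0.
fn quantize_two_rows() {
    // q4_0 packs elements in blocks of 32, so the row width is a multiple of 32.
    let data: Vec<f32> = (0..64).map(|i| i as f32 / 64.0).collect();

    let result = ggml_rs::quantize_q4_0(&data, data.len(), 32);

    // `output` is the packed q4_0 data, trimmed to the size reported by ggml;
    // `history` is the 16-entry counter array filled in by the C routine.
    println!("{} floats -> {} bytes", data.len(), result.output.len());
    assert_eq!(result.history.len(), 16);
}
```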
+/// Contains the result of a quantization operation.
+pub struct QuantizationResult {
+    /// The quantized output.
+    pub output: Vec<u8>,
+    /// The quantization history.
+    pub history: Vec<i64>,
+}
+
+/// Quantizes `src` using `q4_0` quantization and returns the result.
+///
+/// You must ensure that `src.len() == n_elements`, and `n_elements_0`
+/// is the first dimension of `src`.
+pub fn quantize_q4_0(src: &[f32], n_elements: usize, n_elements_0: usize) -> QuantizationResult {
+    quantize_impl(src, n_elements, n_elements_0, crate::ggml_quantize_q4_0)
+}
+
+/// Quantizes `src` using `q4_1` quantization and returns the result.
+///
+/// You must ensure that `src.len() == n_elements`, and `n_elements_0`
+/// is the first dimension of `src`.
+pub fn quantize_q4_1(src: &[f32], n_elements: usize, n_elements_0: usize) -> QuantizationResult {
+    quantize_impl(src, n_elements, n_elements_0, crate::ggml_quantize_q4_1)
+}
+
+fn quantize_impl(
+    src: &[f32],
+    n_elements: usize,
+    n_elements_0: usize,
+    quantizer: unsafe extern "C" fn(*const f32, *mut c_void, c_int, c_int, *mut i64) -> usize,
+) -> QuantizationResult {
+    assert_eq!(src.len(), n_elements);
+    assert_eq!(n_elements % n_elements_0, 0);
+
+    // A conservative multiplier of 4 is used here.
+    let mut output = vec![0u8; n_elements * 4];
+    let mut history = vec![0i64; 16];
+    let output_size = unsafe {
+        quantizer(
+            src.as_ptr(),
+            output.as_mut_ptr() as *mut c_void,
+            n_elements.try_into().unwrap(),
+            n_elements_0.try_into().unwrap(),
+            history.as_mut_ptr(),
+        )
+    };
+
+    output.resize(output_size, 0u8);
+    QuantizationResult { output, history }
+}
diff --git a/ggml-format/src/loader.rs b/ggml-rs/src/loader.rs
similarity index 92%
rename from ggml-format/src/loader.rs
rename to ggml-rs/src/loader.rs
index eaec757a..30963fd0 100644
--- a/ggml-format/src/loader.rs
+++ b/ggml-rs/src/loader.rs
@@ -75,7 +75,7 @@ impl TensorInfo {
     ///
     /// Do not use this if loading with `mmap`.
     pub fn read_data<R: BufRead + Seek>(&self, reader: &mut R) -> std::io::Result<Vec<u8>> {
-        let n_bytes = self.n_elements * ggml::type_size(self.element_type);
+        let n_bytes = self.n_elements * crate::type_size(self.element_type);
         let mut data = vec![0; n_bytes];
         reader.seek(SeekFrom::Start(self.start_offset))?;
         reader.read_exact(&mut data)?;
@@ -85,7 +85,7 @@ impl TensorInfo {
 
 /// Returns the size occupied by a tensor's data in bytes given the element type and number of elements.
 pub fn data_size(element_type: ElementType, n_elements: usize) -> usize {
-    (ggml::type_size(element_type) * n_elements) / ggml::blck_size(element_type)
+    (crate::type_size(element_type) * n_elements) / crate::blck_size(element_type)
 }
 
 #[derive(Debug, Clone)]
@@ -118,9 +118,9 @@ pub fn load_model(
 ) -> Result<(), LoadError> {
     // Verify magic
     let container_type: ContainerType = match read_u32(reader)? {
-        ggml::FILE_MAGIC_GGMF => ContainerType::Ggmf,
-        ggml::FILE_MAGIC_GGJT => ContainerType::Ggjt,
-        ggml::FILE_MAGIC_UNVERSIONED => ContainerType::Ggml,
+        crate::FILE_MAGIC_GGMF => ContainerType::Ggmf,
+        crate::FILE_MAGIC_GGJT => ContainerType::Ggjt,
+        crate::FILE_MAGIC_UNVERSIONED => ContainerType::Ggml,
         magic => return Err(LoadError::InvalidMagic(magic)),
     };
     handler
@@ -131,7 +131,7 @@ pub fn load_model(
     match container_type {
         ContainerType::Ggmf | ContainerType::Ggjt => {
             let _version: u32 = match read_u32(reader)? {
-                ggml::FORMAT_VERSION => ggml::FORMAT_VERSION,
+                crate::FORMAT_VERSION => crate::FORMAT_VERSION,
                 version => return Err(LoadError::InvalidFormatVersion(container_type, version)),
             };
         }
@@ -198,10 +198,11 @@ fn load_weights(
         // load tensor name
         let name = String::from_utf8(read_bytes_with_len(reader, name_len.try_into()?)?)?;
 
-        let ftype = ggml::Type::try_from(ftype).map_err(|_| LoadError::UnsupportedElementType {
-            tensor_name: name.clone(),
-            ftype,
-        })?;
+        let ftype =
+            crate::Type::try_from(ftype).map_err(|_| LoadError::UnsupportedElementType {
+                tensor_name: name.clone(),
+                ftype,
+            })?;
 
         // sanity check
         match ftype {
diff --git a/ggml-format/src/saver.rs b/ggml-rs/src/saver.rs
similarity index 97%
rename from ggml-format/src/saver.rs
rename to ggml-rs/src/saver.rs
index e098b51c..9a5e86e8 100644
--- a/ggml-format/src/saver.rs
+++ b/ggml-rs/src/saver.rs
@@ -61,8 +61,8 @@ pub fn save_model(
     tensor_names: &[String],
 ) -> Result<(), SaveError> {
     // Write header and hyperparameters
-    util::write_u32(writer, ggml::FILE_MAGIC_GGJT)?;
-    util::write_u32(writer, ggml::FORMAT_VERSION)?;
+    util::write_u32(writer, crate::FILE_MAGIC_GGJT)?;
+    util::write_u32(writer, crate::FORMAT_VERSION)?;
     handler
         .write_hyperparameters(writer)
         .map_err(SaveError::ImplementationError)?;
diff --git a/ggml-rs/src/tensor.rs b/ggml-rs/src/tensor.rs
new file mode 100644
index 00000000..6e426940
--- /dev/null
+++ b/ggml-rs/src/tensor.rs
@@ -0,0 +1,130 @@
+use std::{
+    os::raw::{c_int, c_void},
+    ptr::NonNull,
+    sync::{Arc, Weak},
+};
+
+use crate::{i64_to_usize, Type};
+
+/// Tensors are owned by the context. A tensor is alive as long as the
+/// underlying context it was created with is alive.
+pub struct Tensor {
+    pub(crate) ptr: NonNull<crate::ggml_tensor>,
+    pub(crate) ctx: Weak<NonNull<crate::ggml_context>>,
+}
+
+impl Tensor {
+    /// Size of the `ggml_tensor` struct in bytes.
+    ///
+    /// Exposed for purposes of determining context size.
+    pub const C_TYPE_SIZE: usize = std::mem::size_of::<crate::ggml_tensor>();
+
+    /// Creates a shared copy of this tensor pointer.
+    pub fn share(&self) -> Self {
+        Tensor {
+            ptr: self.ptr,
+            ctx: Weak::clone(&self.ctx),
+        }
+    }
+
+    fn with_alive_ctx<U>(&self, mut f: impl FnMut() -> U) -> U {
+        if let Some(_ctx) = self.ctx.upgrade() {
+            f()
+        } else {
+            panic!("Using a tensor after the context was dropped")
+        }
+    }
+
+    fn with_alive_ctx_mut<U>(&self, mut f: impl FnMut() -> U) -> U {
+        if let Some(_ctx) = self.ctx.upgrade() {
+            f()
+        } else {
+            panic!("Using a tensor after the context was dropped")
+        }
+    }
+
+    /// Number of bytes used by this tensor.
+    pub fn nbytes(&self) -> usize {
+        self.with_alive_ctx(|| {
+            // SAFETY: The with_alive_call guarantees the context is alive
+            unsafe { crate::ggml_nbytes(self.ptr.as_ptr()) }
+        })
+    }
+
+    /// Provides raw mutable access to the data contained within the tensor.
+    ///
+    /// # Safety
+    ///
+    /// Only `std::slice::from_raw_parts_mut(tensor.data(), tensor.nbytes())` is safe to mutate.
+    pub unsafe fn data(&mut self) -> *mut c_void {
+        self.with_alive_ctx(|| {
+            // SAFETY: The with_alive_call guarantees the context is alive
+            unsafe { *self.ptr.as_ptr() }.data
+        })
+    }
+
+    /// Set the tensor's data pointer (useful for mmap-ed data)
+    ///
+    /// # Safety
+    ///
+    /// The memory region from `data_ptr` to `data_ptr.offset(tensor.nbytes())` will be read from.
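The `Weak` context handle above encodes a lifetime contract: a `Tensor` may only be used while the `Context` that created it is alive, and every accessor goes through `with_alive_ctx`. A hedged sketch (again assuming `Context::init`/`new_tensor_1d` from the context module); the impl continues directly below.

```rust
// Illustrative only; constructor names are assumptions.
fn tensor_after_context_drop() {
    let orphan = {
        let ctx = ggml_rs::Context::init(1024 * 1024);
        let tensor = ctx.new_tensor_1d(ggml_rs::Type::F32, 4);
        // `share` copies the raw pointer but only holds a Weak reference to the context.
        tensor.share()
    }; // `ctx` dropped here, freeing the ggml arena

    // Any accessor would now panic with "Using a tensor after the context was
    // dropped" instead of touching freed memory:
    // let _ = orphan.nbytes();
    let _ = orphan;
}
```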
+ pub unsafe fn set_data(&mut self, data_ptr: *mut c_void) { + let tensor = self.ptr.as_mut(); + self.with_alive_ctx_mut(|| { + // SAFETY: The with_alive_call guarantees the context is alive + tensor.data = data_ptr; + }) + } + + /// Number of elements in this tensor. + pub fn nelements(&self) -> usize { + self.with_alive_ctx(|| { + // SAFETY: The with_alive_call guarantees the context is alive + i64_to_usize(unsafe { crate::ggml_nelements(self.ptr.as_ptr()) }) + }) + } + + /// Number of elements in each dimension. + pub fn get_ne(&self) -> [i64; 4] { + self.with_alive_ctx(|| unsafe { *self.ptr.as_ptr() }.ne) + } + + /// Stride of each dimension. + pub fn get_nb(&self) -> [usize; 4] { + self.with_alive_ctx(|| unsafe { *self.ptr.as_ptr() }.nb) + } + + /// The data type. + pub fn get_type(&self) -> Type { + self.with_alive_ctx(|| unsafe { *self.ptr.as_ptr() }.type_.try_into().unwrap()) + } + + /// The size of the element type in bytes. + pub fn element_size(&self) -> usize { + self.with_alive_ctx(|| unsafe { crate::ggml_element_size(self.ptr.as_ptr()) }) + } + + /// Writes `src` to this tensor. + /// + /// # Safety + /// + /// This tensor must not be written to or read by from any other code. + pub unsafe fn write_data(&mut self, src: &[u8]) { + std::ptr::copy_nonoverlapping(src.as_ptr(), self.data() as *mut u8, src.len()) + } + + /// Zeroes out this tensor. + pub fn zero_data(&mut self) { + unsafe { std::ptr::write_bytes(self.data() as *mut u8, 0, self.nbytes()) } + } + + /// Reads this tensor into `dst`, starting from `offset`. + /// + /// # Safety + /// + /// This tensor must not be written to or read by from any other code. + pub unsafe fn read_data(&self, offset: usize, dst: &mut [u8]) { + let data = unsafe { crate::ggml_get_data(self.ptr.as_ptr()).add(offset) }; + std::ptr::copy_nonoverlapping(data, dst as *mut _ as _, dst.len()) + } +} diff --git a/ggml-format/src/tests.rs b/ggml-rs/src/tests.rs similarity index 84% rename from ggml-format/src/tests.rs rename to ggml-rs/src/tests.rs index 91d925bb..9b17acd7 100644 --- a/ggml-format/src/tests.rs +++ b/ggml-rs/src/tests.rs @@ -26,7 +26,7 @@ fn can_roundtrip_loader_and_saver() { ]; let mut rng = rand::thread_rng(); - let element_type = ggml::Type::F16; + let element_type = crate::Type::F16; let model = Model { hyperparameters: Hyperparameters { some_hyperparameter: random(), @@ -43,13 +43,13 @@ fn can_roundtrip_loader_and_saver() { .collect::>(); let n_elements = dims.iter().product::(); - let data = (0..data_size(element_type, n_elements)) + let data = (0..loader::data_size(element_type, n_elements)) .map(|_| random()) .collect::>(); ( format!("tensor_{}", i), - TensorData { + saver::TensorData { n_dims, dims: dims.try_into().unwrap(), element_type, @@ -64,7 +64,7 @@ fn can_roundtrip_loader_and_saver() { let mut buffer = Vec::new(); let mut cursor = std::io::Cursor::new(&mut buffer); let mut save_handler = MockSaveHandler { model: &model }; - save_model( + saver::save_model( &mut cursor, &mut save_handler, &model.vocabulary, @@ -78,7 +78,7 @@ fn can_roundtrip_loader_and_saver() { data: &buffer, loaded_model: Model::default(), }; - load_model(&mut cursor, &mut load_handler).unwrap(); + loader::load_model(&mut cursor, &mut load_handler).unwrap(); assert_eq!(load_handler.loaded_model, model); } @@ -92,14 +92,14 @@ impl Hyperparameters { fn read(reader: &mut dyn BufRead) -> Result { Ok(Self { some_hyperparameter: util::read_u32(reader)?, - some_other_hyperparameter: util::read_u32(reader)? 
as u32,
             vocabulary_size: util::read_u32(reader)?,
         })
     }
 
     fn write(&self, writer: &mut dyn Write) -> Result<(), std::io::Error> {
         util::write_u32(writer, self.some_hyperparameter)?;
-        util::write_u32(writer, self.some_other_hyperparameter as u32)?;
+        util::write_u32(writer, self.some_other_hyperparameter)?;
         util::write_u32(writer, self.vocabulary_size)?;
         Ok(())
     }
@@ -109,19 +109,19 @@ impl Hyperparameters {
 struct Model {
     hyperparameters: Hyperparameters,
     vocabulary: Vec<(Vec<u8>, f32)>,
-    tensors: BTreeMap<String, TensorData>,
+    tensors: BTreeMap<String, saver::TensorData>,
 }
 
 struct MockSaveHandler<'a> {
     model: &'a Model,
 }
-impl SaveHandler<DummyError> for MockSaveHandler<'_> {
+impl saver::SaveHandler<DummyError> for MockSaveHandler<'_> {
     fn write_hyperparameters(&mut self, writer: &mut dyn Write) -> Result<(), DummyError> {
         self.model.hyperparameters.write(writer).unwrap();
         Ok(())
     }
 
-    fn tensor_data(&mut self, tensor_name: &str) -> Result<TensorData, DummyError> {
+    fn tensor_data(&mut self, tensor_name: &str) -> Result<saver::TensorData, DummyError> {
         self.model
             .tensors
             .get(tensor_name)
@@ -134,7 +134,7 @@ struct MockLoadHandler<'a> {
     data: &'a [u8],
     loaded_model: Model,
 }
-impl LoadHandler<DummyError> for MockLoadHandler<'_> {
+impl loader::LoadHandler<DummyError> for MockLoadHandler<'_> {
     fn container_type(&mut self, container_type: ContainerType) -> Result<(), DummyError> {
         assert_eq!(container_type, ContainerType::Ggjt);
         Ok(())
     }
@@ -149,9 +149,9 @@ impl LoadHandler<DummyError> for MockLoadHandler<'_> {
     fn read_hyperparameters(
         &mut self,
         reader: &mut dyn BufRead,
-    ) -> Result<PartialHyperparameters, DummyError> {
+    ) -> Result<loader::PartialHyperparameters, DummyError> {
         self.loaded_model.hyperparameters = Hyperparameters::read(reader).unwrap();
-        Ok(PartialHyperparameters {
+        Ok(loader::PartialHyperparameters {
             n_vocab: self
                 .loaded_model
                 .hyperparameters
@@ -161,8 +161,8 @@ impl LoadHandler<DummyError> for MockLoadHandler<'_> {
-    fn tensor_buffer(&mut self, info: TensorInfo) -> Result<(), DummyError> {
-        let data = TensorData {
+    fn tensor_buffer(&mut self, info: loader::TensorInfo) -> Result<(), DummyError> {
+        let data = saver::TensorData {
             n_dims: info.n_dims,
             dims: info.dims,
             element_type: info.element_type,
diff --git a/ggml-format/src/util.rs b/ggml-rs/src/util.rs
similarity index 100%
rename from ggml-format/src/util.rs
rename to ggml-rs/src/util.rs
diff --git a/ggml-sys/Cargo.toml b/ggml-sys/Cargo.toml
deleted file mode 100644
index 6a971dee..00000000
--- a/ggml-sys/Cargo.toml
+++ /dev/null
@@ -1,10 +0,0 @@
-[build-dependencies.cc]
-version = "^1.0"
-
-[dependencies]
-
-[package]
-description = "Low level bindings for ggml"
-edition = "2021"
-name = "ggml-sys"
-version = {workspace = true}
diff --git a/ggml-sys/ggml/.gitattributes b/ggml-sys/ggml/.gitattributes
deleted file mode 100644
index 304373d7..00000000
--- a/ggml-sys/ggml/.gitattributes
+++ /dev/null
@@ -1,2 +0,0 @@
-*.c linguist-vendored
-*.h linguist-vendored
diff --git a/ggml-sys/ggml/CREDITS.txt b/ggml-sys/ggml/CREDITS.txt
deleted file mode 100644
index e3a704f3..00000000
--- a/ggml-sys/ggml/CREDITS.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-Vendored version: https://github.com/ggerganov/llama.cpp/commit/0e018fe008eacebdbcfa2d61b6c988c245c961cd
-For convenience, changes involved in this sync:
-https://github.com/ggerganov/llama.cpp/compare/74f5899df4a6083fc467b620baa1cf821e37799d..0e018fe008eacebdbcfa2d61b6c988c245c961cd
-
-The ggml.c and ggml.h files are distributed under the terms of the MIT license.
-Credit goes to the original authors: Copyright (c) 2023 Georgi Gerganov
-
-`ggml` has been patched with patches from to enable BLOOM evaluation.
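The mock handlers above lean on the `util` read/write helpers; this small round-trip sketch (signatures assumed from their usage in tests.rs) shows the write/read symmetry the test relies on.

```rust
// Illustrative only; `util::write_u32`/`read_u32` signatures are assumptions
// based on how tests.rs calls them.
use std::io::Cursor;

fn u32_roundtrip() -> std::io::Result<()> {
    let mut buffer: Vec<u8> = Vec::new();
    ggml_rs::util::write_u32(&mut buffer, 0xdead)?;
    ggml_rs::util::write_u32(&mut buffer, 0xbeef)?;

    let mut cursor = Cursor::new(buffer);
    assert_eq!(ggml_rs::util::read_u32(&mut cursor)?, 0xdead);
    assert_eq!(ggml_rs::util::read_u32(&mut cursor)?, 0xbeef);
    Ok(())
}
```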
\ No newline at end of file diff --git a/ggml-sys/ggml/ggml.c b/ggml-sys/ggml/ggml.c deleted file mode 100644 index c397ca36..00000000 --- a/ggml-sys/ggml/ggml.c +++ /dev/null @@ -1,12527 +0,0 @@ -// Defines CLOCK_MONOTONIC on Linux -#define _GNU_SOURCE - -#include "ggml.h" - -#if defined(_MSC_VER) || defined(__MINGW32__) -#include // using malloc.h with MSC/MINGW -#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// if C99 - static_assert is noop -// ref: https://stackoverflow.com/a/53923785/4039976 -#ifndef static_assert -#define static_assert(cond, msg) struct global_scope_noop_trick -#endif - -#if defined(_WIN32) - -#include - -typedef volatile LONG atomic_int; -typedef atomic_int atomic_bool; - -static void atomic_store(atomic_int* ptr, LONG val) { - InterlockedExchange(ptr, val); -} -static LONG atomic_load(atomic_int* ptr) { - return InterlockedCompareExchange(ptr, 0, 0); -} -static LONG atomic_fetch_add(atomic_int* ptr, LONG inc) { - return InterlockedExchangeAdd(ptr, inc); -} -static LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) { - return atomic_fetch_add(ptr, -(dec)); -} - -typedef HANDLE pthread_t; - -typedef DWORD thread_ret_t; -static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) { - (void) unused; - HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); - if (handle == NULL) - { - return EAGAIN; - } - - *out = handle; - return 0; -} - -static int pthread_join(pthread_t thread, void* unused) { - (void) unused; - return (int) WaitForSingleObject(thread, INFINITE); -} - -static int sched_yield (void) { - Sleep (0); - return 0; -} -#else -#include -#include - -typedef void* thread_ret_t; -#endif - -// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 -#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__)) -#ifndef __FMA__ -#define __FMA__ -#endif -#ifndef __F16C__ -#define __F16C__ -#endif -#ifndef __SSE3__ -#define __SSE3__ -#endif -#endif - -#ifdef __HAIKU__ -#define static_assert(cond, msg) _Static_assert(cond, msg) -#endif - -/*#define GGML_PERF*/ -#define GGML_DEBUG 0 -#define GGML_GELU_FP16 -#define GGML_SILU_FP16 - -#define GGML_SOFT_MAX_UNROLL 4 -#define GGML_VEC_DOT_UNROLL 2 - -#ifdef GGML_USE_ACCELERATE -// uncomment to use vDSP for soft max computation -// note: not sure if it is actually faster -//#define GGML_SOFT_MAX_ACCELERATE -#endif - -#if UINTPTR_MAX == 0xFFFFFFFF - #define GGML_MEM_ALIGN 4 -#else - #define GGML_MEM_ALIGN 16 -#endif - -#if defined(_MSC_VER) || defined(__MINGW32__) -#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN) -#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) -#else -inline static void* ggml_aligned_malloc(size_t size) { - void* aligned_memory = NULL; - int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size); - if (result != 0) { - // Handle allocation failure - return NULL; - } - return aligned_memory; -} -#define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size) -#define GGML_ALIGNED_FREE(ptr) free(ptr) -#endif - -#define UNUSED(x) (void)(x) -#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0) - -#define GGML_ASSERT(x) \ - do { \ - if (!(x)) { \ - fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ - abort(); \ - } \ - } while (0) - -#if defined(GGML_USE_ACCELERATE) -#include -#elif 
defined(GGML_USE_OPENBLAS) -#include -#elif defined(GGML_USE_CUBLAS) -#include "ggml-cuda.h" -#endif - -#undef MIN -#undef MAX -#define MIN(a, b) ((a) < (b) ? (a) : (b)) -#define MAX(a, b) ((a) > (b) ? (a) : (b)) - -// floating point type used to accumulate sums -typedef double ggml_float; - -// 16-bit float -// on Arm, we use __fp16 -// on x86, we use uint16_t -#ifdef __ARM_NEON - -// if YCM cannot find , make a symbolic link to it, for example: -// -// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ -// -#include - -#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x)) -#define GGML_COMPUTE_FP32_TO_FP16(x) (x) - -#define GGML_FP16_TO_FP32(x) ((float) (x)) -#define GGML_FP32_TO_FP16(x) (x) - -#else - -#ifdef __wasm_simd128__ -#include -#else -#ifdef __POWER9_VECTOR__ -#include -#undef bool -#define bool _Bool -#else -#include -#endif -#endif - -#ifdef __F16C__ - -#ifdef _MSC_VER -#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) -#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) -#else -#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) -#endif - -#elif defined(__POWER9_VECTOR__) - -#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) -/* the inline asm below is about 12% faster than the lookup method */ -#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) -#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) - -static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - register float f; - register double d; - __asm__( - "mtfprd %0,%2\n" - "xscvhpdp %0,%0\n" - "frsp %1,%0\n" : - /* temp */ "=d"(d), - /* out */ "=f"(f): - /* in */ "r"(h)); - return f; -} - -static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - register double d; - register ggml_fp16_t r; - __asm__( /* xscvdphp can work on double or single precision */ - "xscvdphp %0,%2\n" - "mffprd %1,%0\n" : - /* temp */ "=d"(d), - /* out */ "=r"(r): - /* in */ "f"(f)); - return r; -} - -#else - -// FP16 <-> FP32 -// ref: https://github.com/Maratyszcza/FP16 - -static inline float fp32_from_bits(uint32_t w) { - union { - uint32_t as_bits; - float as_value; - } fp32; - fp32.as_bits = w; - return fp32.as_value; -} - -static inline uint32_t fp32_to_bits(float f) { - union { - float as_value; - uint32_t as_bits; - } fp32; - fp32.as_value = f; - return fp32.as_bits; -} - -static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - const uint32_t w = (uint32_t) h << 16; - const uint32_t sign = w & UINT32_C(0x80000000); - const uint32_t two_w = w + w; - - const uint32_t exp_offset = UINT32_C(0xE0) << 23; -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) - const float exp_scale = 0x1.0p-112f; -#else - const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); -#endif - const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; - - const uint32_t magic_mask = UINT32_C(126) << 23; - const float magic_bias = 0.5f; - const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; - - const uint32_t denormalized_cutoff = UINT32_C(1) << 27; - const uint32_t result = sign | - (two_w < denormalized_cutoff ? 
fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); - return fp32_from_bits(result); -} - -static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) - const float scale_to_inf = 0x1.0p+112f; - const float scale_to_zero = 0x1.0p-110f; -#else - const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); - const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); -#endif - float base = (fabsf(f) * scale_to_inf) * scale_to_zero; - - const uint32_t w = fp32_to_bits(f); - const uint32_t shl1_w = w + w; - const uint32_t sign = w & UINT32_C(0x80000000); - uint32_t bias = shl1_w & UINT32_C(0xFF000000); - if (bias < UINT32_C(0x71000000)) { - bias = UINT32_C(0x71000000); - } - - base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; - const uint32_t bits = fp32_to_bits(base); - const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); - const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); - const uint32_t nonsign = exp_bits + mantissa_bits; - return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); -} - -#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - -#endif // __F16C__ - -#endif // __ARM_NEON - -// -// global data -// - -// precomputed gelu table for f16 (128 KB) -static ggml_fp16_t table_gelu_f16[1 << 16]; - -// precomputed silu table for f16 (128 KB) -static ggml_fp16_t table_silu_f16[1 << 16]; - -// precomputed exp table for f16 (128 KB) -static ggml_fp16_t table_exp_f16[1 << 16]; - -// precomputed f32 table for f16 (256 KB) -static float table_f32_f16[1 << 16]; - -// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, -// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. -// This is also true for POWER9. 
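Not part of this patch, but for readers following the vendored C above: the precomputed `table_f32_f16` lookup scheme translates naturally to Rust. A sketch assuming the `half` crate for the bit-exact conversion:

```rust
// Illustrative sketch only; the `half` crate is an assumption, not a dependency of this patch.
fn build_f16_to_f32_table() -> Vec<f32> {
    (0..=u16::MAX)
        .map(|bits| half::f16::from_bits(bits).to_f32())
        .collect() // 65,536 entries, i.e. the 256 KB table mentioned above
}

fn lookup_fp16_to_fp32(table: &[f32], h: u16) -> f32 {
    table[h as usize]
}
```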
-#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16) - -inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { - uint16_t s; - memcpy(&s, &f, sizeof(uint16_t)); - return table_f32_f16[s]; -} - -#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) -#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) - -#endif - -// note: do not use these inside ggml.c -// these are meant to be used via the ggml.h API -float ggml_fp16_to_fp32(ggml_fp16_t x) { - return (float) GGML_FP16_TO_FP32(x); -} - -ggml_fp16_t ggml_fp32_to_fp16(float x) { - return GGML_FP32_TO_FP16(x); -} - -// -// timing -// - -#if defined(_MSC_VER) || defined(__MINGW32__) -static int64_t timer_freq; -void ggml_time_init(void) { - LARGE_INTEGER frequency; - QueryPerformanceFrequency(&frequency); - timer_freq = frequency.QuadPart; -} -int64_t ggml_time_ms(void) { - LARGE_INTEGER t; - QueryPerformanceCounter(&t); - return (t.QuadPart * 1000) / timer_freq; -} -int64_t ggml_time_us(void) { - LARGE_INTEGER t; - QueryPerformanceCounter(&t); - return (t.QuadPart * 1000000) / timer_freq; -} -#else -void ggml_time_init(void) {} -int64_t ggml_time_ms(void) { - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000; -} - -int64_t ggml_time_us(void) { - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000; -} -#endif - -int64_t ggml_cycles(void) { - return clock(); -} - -int64_t ggml_cycles_per_ms(void) { - return CLOCKS_PER_SEC/1000; -} - -#ifdef GGML_PERF -#define ggml_perf_time_ms() ggml_time_ms() -#define ggml_perf_time_us() ggml_time_us() -#define ggml_perf_cycles() ggml_cycles() -#define ggml_perf_cycles_per_ms() ggml_cycles_per_ms() -#else -#define ggml_perf_time_ms() 0 -#define ggml_perf_time_us() 0 -#define ggml_perf_cycles() 0 -#define ggml_perf_cycles_per_ms() 0 -#endif - -// -// cache line -// - -#if defined(__cpp_lib_hardware_interference_size) -#define CACHE_LINE_SIZE hardware_destructive_interference_size -#else -#if defined(__POWER9_VECTOR__) -#define CACHE_LINE_SIZE 128 -#else -#define CACHE_LINE_SIZE 64 -#endif -#endif - -static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); - -// -// quantization -// - -#if __AVX__ || __AVX2__ || __AVX512F__ -// Unpack 16 4-bit fields into 16 bytes -// The output vector contains 16 bytes, each one in [ 0 .. 
15 ] interval -static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi) -{ - // Load 8 bytes from memory - __m128i tmp = _mm_loadu_si64( ( const __m128i* )rsi ); - - // Expand bytes into uint16_t values - __m128i bytes = _mm_cvtepu8_epi16( tmp ); - - // Unpack values into individual bytes - const __m128i lowMask = _mm_set1_epi8( 0xF ); - __m128i high = _mm_andnot_si128( lowMask, bytes ); - __m128i low = _mm_and_si128( lowMask, bytes ); - high = _mm_slli_epi16( high, 4 ); - bytes = _mm_or_si128( low, high ); - return bytes; -} - -// horizontally add 8 floats -static inline float hsum_float_8(const __m256 x) { - __m128 res = _mm256_extractf128_ps(x, 1); - res = _mm_add_ps(res, _mm256_castps256_ps128(x)); - res = _mm_add_ps(res, _mm_movehl_ps(res, res)); - res = _mm_add_ss(res, _mm_movehdup_ps(res)); - return _mm_cvtss_f32(res); -} - -// horizontally add 8 int32_t -static inline int hsum_i32_8(const __m256i a) { - const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1)); - const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128); - const __m128i sum64 = _mm_add_epi32(hi64, sum128); - const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); - return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); -} - -// horizontally add 4 int32_t -static inline int hsum_i32_4(const __m128i a) { - const __m128i hi64 = _mm_unpackhi_epi64(a, a); - const __m128i sum64 = _mm_add_epi32(hi64, a); - const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1)); - return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32)); -} - -#if __AVX2__ || __AVX512F__ -// Unpack 32 4-bit fields into 32 bytes -// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval -static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) -{ - // Load 16 bytes from memory - __m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi ); - - // Expand bytes into uint16_t values - __m256i bytes = _mm256_cvtepu8_epi16( tmp ); - - // Unpack values into individual bytes - const __m256i lowMask = _mm256_set1_epi8( 0xF ); - __m256i high = _mm256_andnot_si256( lowMask, bytes ); - __m256i low = _mm256_and_si256( lowMask, bytes ); - high = _mm256_slli_epi16( high, 4 ); - bytes = _mm256_or_si256( low, high ); - return bytes; -} - -// add int16_t pairwise and return as float vector -static inline __m256 sum_i16_pairs_float(const __m256i x) { - const __m256i ones = _mm256_set1_epi16(1); - const __m256i summed_pairs = _mm256_madd_epi16(ones, x); - return _mm256_cvtepi32_ps(summed_pairs); -} - -// multiply int8_t, add results pairwise twice and return as float vector -static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { - // Get absolute values of x vectors - const __m256i ax = _mm256_sign_epi8(x, x); - // Sign the values of the y vectors - const __m256i sy = _mm256_sign_epi8(y, x); - // Perform multiplication and create 16-bit values - const __m256i dot = _mm256_maddubs_epi16(ax, sy); - return sum_i16_pairs_float(dot); -} - -static inline __m128i packNibbles( __m256i bytes ) -{ - // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh - const __m256i lowByte = _mm256_set1_epi16( 0xFF ); - __m256i high = _mm256_andnot_si256( lowByte, bytes ); - __m256i low = _mm256_and_si256( lowByte, bytes ); - high = _mm256_srli_epi16( high, 4 ); - bytes = _mm256_or_si256( low, high ); - - // Compress uint16_t lanes into bytes - __m128i r0 = _mm256_castsi256_si128( bytes ); - __m128i r1 = _mm256_extracti128_si256( bytes, 1 ); - return 
_mm_packus_epi16( r0, r1 ); -} -#else -static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) -{ - // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh - const __m128i lowByte = _mm_set1_epi16( 0xFF ); - __m128i high = _mm_andnot_si128( lowByte, bytes1 ); - __m128i low = _mm_and_si128( lowByte, bytes1 ); - high = _mm_srli_epi16( high, 4 ); - bytes1 = _mm_or_si128( low, high ); - high = _mm_andnot_si128( lowByte, bytes2 ); - low = _mm_and_si128( lowByte, bytes2 ); - high = _mm_srli_epi16( high, 4 ); - bytes2 = _mm_or_si128( low, high ); - - return _mm_packus_epi16( bytes1, bytes2); -} -#endif -#endif // __AVX__ || __AVX2__ || __AVX512F__ - -#if __ARM_NEON - -#if !defined(__aarch64__) - -inline static uint16_t vaddvq_u8(uint8x16_t v) { - return - (uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) + - (uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) + - (uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) + - (uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) + - (uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) + - (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) + - (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) + - (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15); -} - -inline static int16_t vaddvq_s8(int8x16_t v) { - return - (int16_t)vgetq_lane_s8(v, 0) + (int16_t)vgetq_lane_s8(v, 1) + - (int16_t)vgetq_lane_s8(v, 2) + (int16_t)vgetq_lane_s8(v, 3) + - (int16_t)vgetq_lane_s8(v, 4) + (int16_t)vgetq_lane_s8(v, 5) + - (int16_t)vgetq_lane_s8(v, 6) + (int16_t)vgetq_lane_s8(v, 7) + - (int16_t)vgetq_lane_s8(v, 8) + (int16_t)vgetq_lane_s8(v, 9) + - (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) + - (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) + - (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15); -} - -inline static int32_t vaddvq_s16(int16x8_t v) { - return - (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) + - (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) + - (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) + - (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7); -} - -inline static uint32_t vaddvq_u16(uint16x8_t v) { - return - (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) + - (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) + - (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) + - (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7); -} - -inline static int32_t vaddvq_s32(int32x4_t v) { - return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); -} - -inline static float vaddvq_f32(float32x4_t v) { - return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); -} - -float vminvq_f32(float32x4_t v) { - return - MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), - MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); -} - -float vmaxvq_f32(float32x4_t v) { - return - MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), - MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); -} - -int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) { - return vget_low_s8(vcombine_s8(a, b)); -} - -int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) { - return vget_high_s8(vcombine_s8(a, b)); -} - -uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) { - return vget_low_u8(vcombine_u8(a, b)); -} - -uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) { - return 
vget_high_u8(vcombine_u8(a, b)); -} - -#endif -#endif - - -#define QK4_0 32 -typedef struct { - float d; // delta - uint8_t qs[QK4_0 / 2]; // nibbles / quants -} block_q4_0; -static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding"); - -#define QK4_1 32 -typedef struct { - float d; // delta - float m; // min - uint8_t qs[QK4_1 / 2]; // nibbles / quants -} block_q4_1; -static_assert(sizeof(block_q4_1) == 2 * sizeof(float) + QK4_1 / 2, "wrong q4_1 block size/padding"); - -#define QK4_2 16 -typedef struct { - ggml_fp16_t d; // delta - uint8_t qs[QK4_2 / 2]; // nibbles / quants -} block_q4_2; -static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding"); - -#define QK4_3 16 -typedef struct { - ggml_fp16_t d; // delta - ggml_fp16_t m; // min - uint8_t qs[QK4_3 / 2]; // nibbles / quants -} block_q4_3; -static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding"); - -#define QK8_0 32 -typedef struct { - float d; // delta - float s0; // d * sum(qs[i]) low - float s1; // d * sum(qs[i]) high - int8_t qs[QK8_0]; // quants -} block_q8_0; -static_assert(sizeof(block_q8_0) == 3*sizeof(float) + QK8_0, "wrong q8_0 block size/padding"); - - -// reference implementation for deterministic creation of model files -static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { - assert(k % QK4_0 == 0); - const int nb = k / QK4_0; - - uint8_t pp[QK4_0/2]; - - for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max - - for (int l = 0; l < QK4_0; l++) { - const float v = x[i*QK4_0 + l]; - amax = MAX(amax, fabsf(v)); - } - - const float d = amax / ((1 << 3) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = d; - - for (int l = 0; l < QK4_0; l += 2) { - const float v0 = x[i*QK4_0 + l + 0]*id; - const float v1 = x[i*QK4_0 + l + 1]*id; - - const uint8_t vi0 = (int8_t)roundf(v0) + 8; - const uint8_t vi1 = (int8_t)roundf(v1) + 8; - - assert(vi0 < 16); - assert(vi1 < 16); - - pp[l/2] = vi0 | (vi1 << 4); - } - - memcpy(y[i].qs, pp, sizeof(pp)); - } -} - -static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int k) { - assert(k % QK4_0 == 0); - const int nb = k / QK4_0; - - block_q4_0 * restrict y = vy; - -#if defined(__POWER9_VECTOR__) - const vector float v85 = vec_splats(8.5f); - for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max - - vector float srcv [8]; - vector float asrcv[8]; - vector float amaxv[8]; - - for (int l = 0; l < 8; l++) srcv[l] = *(vector float *)(x + i*32 + 4*l); - for (int l = 0; l < 8; l++) asrcv[l] = vec_abs(srcv[l]); - - for (int l = 0; l < 4; l++) amaxv[2*l] = vec_max(asrcv[2*l], asrcv[2*l+1]); - //for (int l = 0; l < 2; l++) amaxv[4*l] = vec_max(amaxv[4*l], amaxv[4*l+2]); - amaxv[0] = vec_max(amaxv[0], amaxv[2]); - amaxv[4] = vec_max(amaxv[4], amaxv[6]); - //for (int l = 0; l < 1; l++) amaxv[8*l] = vec_max(amaxv[8*l], amaxv[8*l+4]); - amaxv[0] = vec_max(amaxv[0], amaxv[4]); - - amax = MAX( - MAX(vec_extract(amaxv[0], 0), vec_extract(amaxv[0], 1)), - MAX(vec_extract(amaxv[0], 2), vec_extract(amaxv[0], 3))); - - const float d = amax / ((1 << 3) - 1); - const float id = d ? 
1.0/d : 0.0; - - y[i].d = d; - - const vector float vid = vec_splats(id); - uint8_t * restrict pb = y[i].qs; - for (int l = 0; l < 8; l++) { - const vector float vf = vec_madd(srcv[l], vid, v85); - const vector signed int vi = vec_signed(vf); - - pb[2*l + 0] = vec_extract(vi, 0) | (vec_extract(vi, 1) << 4); - pb[2*l + 1] = vec_extract(vi, 2) | (vec_extract(vi, 3) << 4); - } - } -#elif __ARM_NEON - for (int i = 0; i < nb; i++) { - float32x4_t srcv [8]; - float32x4_t asrcv[8]; - float32x4_t amaxv[8]; - - for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l); - for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]); - - for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]); - for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]); - for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]); - - const float amax = vmaxvq_f32(amaxv[0]); - - const float d = amax / ((1 << 3) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = d; - - for (int l = 0; l < 8; l++) { - const float32x4_t v = vmulq_n_f32(srcv[l], id); - const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(8.5f)); - const int32x4_t vi = vcvtq_s32_f32(vf); - - y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4); - y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4); - } - } -#elif defined(__AVX2__) - for (int i = 0; i < nb; i++) { - // Load elements into 4 AVX vectors - __m256 v0 = _mm256_loadu_ps( x ); - __m256 v1 = _mm256_loadu_ps( x + 8 ); - __m256 v2 = _mm256_loadu_ps( x + 16 ); - __m256 v3 = _mm256_loadu_ps( x + 24 ); - x += 32; - - // Compute max(abs(e)) for the block - const __m256 signBit = _mm256_set1_ps( -0.0f ); - __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); - - __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); - max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); - max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); - const float maxScalar = _mm_cvtss_f32( max4 ); - - // Quantize these floats - const float d = maxScalar / 7.0f; - y[i].d = d; - const float id = ( maxScalar != 0.0f ) ? 
7.0f / maxScalar : 0.0f; - const __m256 mul = _mm256_set1_ps( id ); - - // Apply the multiplier - v0 = _mm256_mul_ps( v0, mul ); - v1 = _mm256_mul_ps( v1, mul ); - v2 = _mm256_mul_ps( v2, mul ); - v3 = _mm256_mul_ps( v3, mul ); - - // Round to nearest integer - v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); - v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); - v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); - v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); - - // Convert floats to integers - __m256i i0 = _mm256_cvtps_epi32( v0 ); - __m256i i1 = _mm256_cvtps_epi32( v1 ); - __m256i i2 = _mm256_cvtps_epi32( v2 ); - __m256i i3 = _mm256_cvtps_epi32( v3 ); - - // Convert int32 to int16 - i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 - i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 - // Convert int16 to int8 - i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 - - // We got our precious signed bytes, but the order is now wrong - // These AVX2 pack instructions process 16-byte pieces independently - // The following instruction is fixing the order - const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); - i0 = _mm256_permutevar8x32_epi32( i0, perm ); - - // Apply offset to translate the range from [ -7 .. +7 ] into [ +1 .. +15 ] - const __m256i off = _mm256_set1_epi8( 8 ); - i0 = _mm256_add_epi8( i0, off ); - - // Compress the vector into 4 bit/value, and store - __m128i res = packNibbles( i0 ); - _mm_storeu_si128( ( __m128i* )y[i].qs, res ); - } -#elif defined(__AVX__) - for (int i = 0; i < nb; i++) { - // Load elements into 4 AVX vectors - __m256 v0 = _mm256_loadu_ps( x ); - __m256 v1 = _mm256_loadu_ps( x + 8 ); - __m256 v2 = _mm256_loadu_ps( x + 16 ); - __m256 v3 = _mm256_loadu_ps( x + 24 ); - x += 32; - - // Compute max(abs(e)) for the block - const __m256 signBit = _mm256_set1_ps( -0.0f ); - __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); - - __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); - max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); - max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); - const float maxScalar = _mm_cvtss_f32( max4 ); - - // Quantize these floats - const float d = maxScalar / 7.0f; - y[i].d = d; - const float id = ( maxScalar != 0.0f ) ? 
7.0f / maxScalar : 0.0f; - const __m256 mul = _mm256_set1_ps( id ); - - // Apply the multiplier - v0 = _mm256_mul_ps( v0, mul ); - v1 = _mm256_mul_ps( v1, mul ); - v2 = _mm256_mul_ps( v2, mul ); - v3 = _mm256_mul_ps( v3, mul ); - - // Round to nearest integer - v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); - v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); - v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); - v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); - - // Convert floats to integers - __m256i i0 = _mm256_cvtps_epi32( v0 ); - __m256i i1 = _mm256_cvtps_epi32( v1 ); - __m256i i2 = _mm256_cvtps_epi32( v2 ); - __m256i i3 = _mm256_cvtps_epi32( v3 ); - - // Since we don't have in AVX some necessary functions, - // we split the registers in half and call AVX2 analogs from SSE - __m128i ni0 = _mm256_castsi256_si128( i0 ); - __m128i ni1 = _mm256_extractf128_si256( i0, 1); - __m128i ni2 = _mm256_castsi256_si128( i1 ); - __m128i ni3 = _mm256_extractf128_si256( i1, 1); - __m128i ni4 = _mm256_castsi256_si128( i2 ); - __m128i ni5 = _mm256_extractf128_si256( i2, 1); - __m128i ni6 = _mm256_castsi256_si128( i3 ); - __m128i ni7 = _mm256_extractf128_si256( i3, 1); - - // Convert int32 to int16 - ni0 = _mm_packs_epi32( ni0, ni1 ); - ni2 = _mm_packs_epi32( ni2, ni3 ); - ni4 = _mm_packs_epi32( ni4, ni5 ); - ni6 = _mm_packs_epi32( ni6, ni7 ); - // Convert int16 to int8 - ni0 = _mm_packs_epi16( ni0, ni2 ); - ni4 = _mm_packs_epi16( ni4, ni6 ); - - // Apply offset to translate the range from [ -7 .. +7 ] into [ +1 .. +15 ] - const __m128i off = _mm_set1_epi8( 8); - ni0 = _mm_add_epi8( ni0, off ); - ni4 = _mm_add_epi8( ni4, off ); - - // Compress the vector into 4 bit/value, and store - __m128i res = packNibbles( ni0, ni4 ); - _mm_storeu_si128( ( __m128i* )y[i].qs, res ); - } -#elif defined(__wasm_simd128__) - for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max - - v128_t srcv [8]; - v128_t asrcv[8]; - v128_t amaxv[8]; - - for (int l = 0; l < 8; l++) srcv[l] = wasm_v128_load(x + i*32 + 4*l); - for (int l = 0; l < 8; l++) asrcv[l] = wasm_f32x4_abs(srcv[l]); - - for (int l = 0; l < 4; l++) amaxv[2*l] = wasm_f32x4_max(asrcv[2*l], asrcv[2*l+1]); - for (int l = 0; l < 2; l++) amaxv[4*l] = wasm_f32x4_max(amaxv[4*l], amaxv[4*l+2]); - for (int l = 0; l < 1; l++) amaxv[8*l] = wasm_f32x4_max(amaxv[8*l], amaxv[8*l+4]); - - amax = MAX( - MAX(wasm_f32x4_extract_lane(amaxv[0], 0), wasm_f32x4_extract_lane(amaxv[0], 1)), - MAX(wasm_f32x4_extract_lane(amaxv[0], 2), wasm_f32x4_extract_lane(amaxv[0], 3))); - - const float d = amax / ((1 << 3) - 1); - const float id = d ? 
1.0/d : 0.0; - - y[i].d = d; - - for (int l = 0; l < 8; l++) { - const v128_t v = wasm_f32x4_mul(srcv[l], wasm_f32x4_splat(id)); - const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f)); - const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf); - - y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vi, 0) | (wasm_i32x4_extract_lane(vi, 1) << 4); - y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vi, 2) | (wasm_i32x4_extract_lane(vi, 3) << 4); - } - } -#else - // scalar - quantize_row_q4_0_reference(x, y, k); -#endif -} - -static void quantize_row_q4_1_reference(const float * restrict x, void * restrict vy, int k) { - assert(k % QK4_1 == 0); - const int nb = k / QK4_1; - - block_q4_1 * restrict y = vy; - - uint8_t pp[QK4_1/2]; - - for (int i = 0; i < nb; i++) { - float min = FLT_MAX; - float max = -FLT_MAX; - - for (int l = 0; l < QK4_1; l++) { - const float v = x[i*QK4_1 + l]; - if (v < min) min = v; - if (v > max) max = v; - } - - const float d = (max - min) / ((1 << 4) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = d; - y[i].m = min; - - for (int l = 0; l < QK4_1; l += 2) { - const float v0 = (x[i*QK4_1 + l + 0] - min)*id; - const float v1 = (x[i*QK4_1 + l + 1] - min)*id; - - const uint8_t vi0 = roundf(v0); - const uint8_t vi1 = roundf(v1); - - assert(vi0 < 16); - assert(vi1 < 16); - - pp[l/2] = vi0 | (vi1 << 4); - } - - memcpy(y[i].qs, pp, sizeof(pp)); - } -} - -static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int k) { - assert(k % QK4_1 == 0); - - const int nb = k / QK4_1; - - block_q4_1 * restrict y = vy; - -#if defined(__AVX2__) - for (int i = 0; i < nb; i++) { - // Load elements into 4 AVX vectors - __m256 v0 = _mm256_loadu_ps( x ); - __m256 v1 = _mm256_loadu_ps( x + 8 ); - __m256 v2 = _mm256_loadu_ps( x + 16 ); - __m256 v3 = _mm256_loadu_ps( x + 24 ); - x += 32; - - // Compute max for the block - __m256 vmax; - vmax = _mm256_max_ps( v0, v1 ); - vmax = _mm256_max_ps( vmax, v2 ); - vmax = _mm256_max_ps( vmax, v3 ); - - __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( vmax, 1 ), _mm256_castps256_ps128( vmax ) ); - max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); - max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); - const float maxScalar = _mm_cvtss_f32( max4 ); - - // Compute min for the block - __m256 vmin; - vmin = _mm256_min_ps( v0, v1 ); - vmin = _mm256_min_ps( vmin, v2 ); - vmin = _mm256_min_ps( vmin, v3 ); - - __m128 min4 = _mm_min_ps( _mm256_extractf128_ps( vmin, 1 ), _mm256_castps256_ps128( vmin ) ); - min4 = _mm_min_ps( min4, _mm_movehl_ps( min4, min4 ) ); - min4 = _mm_min_ss( min4, _mm_movehdup_ps( min4 ) ); - const float minScalar = _mm_cvtss_f32( min4 ); - - // Quantize these floats - const float d = (maxScalar - minScalar) / ((1 << 4) - 1); - const float id = d ? 
1.0f/d : 0.0f; - - y[i].m = minScalar; - y[i].d = d; - - // x = (x-min)*id - const __m256 mul = _mm256_set1_ps( id ); - const __m256 off = _mm256_set1_ps( minScalar ); - v0 = _mm256_mul_ps( _mm256_sub_ps( v0, off ), mul ); - v1 = _mm256_mul_ps( _mm256_sub_ps( v1, off ), mul ); - v2 = _mm256_mul_ps( _mm256_sub_ps( v2, off ), mul ); - v3 = _mm256_mul_ps( _mm256_sub_ps( v3, off ), mul ); - - // Round to nearest integer - v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); - v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); - v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); - v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); - - // Convert floats to integers - __m256i i0 = _mm256_cvtps_epi32( v0 ); - __m256i i1 = _mm256_cvtps_epi32( v1 ); - __m256i i2 = _mm256_cvtps_epi32( v2 ); - __m256i i3 = _mm256_cvtps_epi32( v3 ); - - // Convert int32 to int16 - i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 - i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 - // Convert int16 to int8 - i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 - - // We got our precious signed bytes, but the order is now wrong - // These AVX2 pack instructions process 16-byte pieces independently - // The following instruction is fixing the order - const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); - i0 = _mm256_permutevar8x32_epi32( i0, perm ); - - // Compress the vector into 4 bit/value, and store - __m128i res = packNibbles( i0 ); - _mm_storeu_si128( ( __m128i* )y[i].qs, res ); - } -#elif __ARM_NEON - for (int i = 0; i < nb; i++) { - float32x4_t srcv[8]; - float32x4_t minv[8]; - float32x4_t maxv[8]; - - for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*QK4_1 + 4*l); - - for (int l = 0; l < 4; l++) minv[2*l] = vminq_f32(srcv[2*l], srcv[2*l + 1]); - for (int l = 0; l < 2; l++) minv[4*l] = vminq_f32(minv[4*l], minv[4*l + 2]); - for (int l = 0; l < 1; l++) minv[8*l] = vminq_f32(minv[8*l], minv[8*l + 4]); - - for (int l = 0; l < 4; l++) maxv[2*l] = vmaxq_f32(srcv[2*l], srcv[2*l + 1]); - for (int l = 0; l < 2; l++) maxv[4*l] = vmaxq_f32(maxv[4*l], maxv[4*l + 2]); - for (int l = 0; l < 1; l++) maxv[8*l] = vmaxq_f32(maxv[8*l], maxv[8*l + 4]); - - const float min = vminvq_f32(minv[0]); - const float max = vmaxvq_f32(maxv[0]); - - const float d = (max - min) / ((1 << 4) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = d; - y[i].m = min; - - const float32x4_t minv0 = vdupq_n_f32(min); - - for (int l = 0; l < 8; l++) { - const float32x4_t v = vmulq_n_f32(vsubq_f32(srcv[l], minv0), id); - const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(0.5f)); // needed to round to nearest - const int32x4_t vi = vcvtq_s32_f32(vf); - - y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4); - y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4); - } - } -#else - // scalar - quantize_row_q4_1_reference(x, vy, k); -#endif -} - -// reference implementation for deterministic creation of model files -static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * restrict y, int k) { - assert(k % QK4_2 == 0); - - const int nb = k / QK4_2; - - for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max - - for (int l = 0; l < QK4_2; l++) { - const float v = x[i*QK4_2 + l]; - amax = MAX(amax, fabsf(v)); - } - - const float d = amax / ((1 << 3) - 1); - - const float id = d ? 
1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - - for (int l = 0; l < QK4_2; l += 2) { - const float v0 = x[i*QK4_2 + l + 0]*id; - const float v1 = x[i*QK4_2 + l + 1]*id; - - const uint8_t vi0 = (uint8_t)(v0 + 8.5f); - const uint8_t vi1 = (uint8_t)(v1 + 8.5f); - - assert(vi0 < 16); - assert(vi1 < 16); - - y[i].qs[l/2] = vi0 | (vi1 << 4); - } - } -} - -static inline int nearest_int(float fval) { - assert(fval <= 4194303.f); - float val = fval + 12582912.f; - int i; memcpy(&i, &val, sizeof(int)); - return (i & 0x007fffff) - 0x00400000; -} - -static float kquantize_q4_with_bounds(int n, int nmin, int nmax, const float * restrict X, int nCandidates, - const float * restrict candidates, int8_t * restrict L) { - assert (nmin >= INT8_MIN); - assert (nmax <= INT8_MAX); - float amax = 0; - for (int i=0; i sumlxM2*suml2P) { - if (sumlxP2 > best*suml2P) { - best = sumlxP2/suml2P; bestScale = iscale; - } - } else { - if (sumlxM2 > best*suml2M) { - best = sumlxM2/suml2M; bestScale = -iscale; - } - } - } - float sumlx = 0; int suml2 = 0; - for (int i=0; i max) max = v; - } - - const float d = (max - min) / ((1 << 4) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = GGML_FP32_TO_FP16(d); - y[i].m = GGML_FP32_TO_FP16(min); - - for (int l = 0; l < QK4_3; l += 2) { - const float v0 = (x[i*QK4_3 + l + 0] - min)*id; - const float v1 = (x[i*QK4_3 + l + 1] - min)*id; - - const uint8_t vi0 = (int) (v0 + 0.5f); - const uint8_t vi1 = (int) (v1 + 0.5f); - - assert(vi0 < 16); - assert(vi1 < 16); - - y[i].qs[l/2] = vi0 | (vi1 << 4); - } - } -} - -static void quantize_row_q4_3(const float * restrict x, void * restrict vy, int k) { - assert(k % QK4_3 == 0); - - block_q4_3 * restrict y = vy; - - quantize_row_q4_3_reference(x, y, k); -} - -// reference implementation for deterministic creation of model files -static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) { - assert(k % QK8_0 == 0); - const int nb = k / QK8_0; - - for (int i = 0; i < nb; i++) { - float amax = 0.0f; // absolute max - - for (int l = 0; l < QK8_0; l++) { - const float v = x[i*QK8_0 + l]; - amax = MAX(amax, fabsf(v)); - } - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = d; - - int sum0 = 0; - int sum1 = 0; - - for (int l = 0; l < QK8_0/2; ++l) { - const float v0 = x[i*QK8_0 + l]*id; - const float v1 = x[i*QK8_0 + QK8_0/2 + l]*id; - - y[i].qs[ l] = roundf(v0); - y[i].qs[QK8_0/2 + l] = roundf(v1); - - sum0 += y[i].qs[ l]; - sum1 += y[i].qs[QK8_0/2 + l]; - } - - y[i].s0 = d * sum0; - y[i].s1 = d * sum1; - } -} - -static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { - assert(k % QK8_0 == 0); - const int nb = k / QK8_0; - - block_q8_0 * restrict y = vy; - -#if defined(__ARM_NEON) - for (int i = 0; i < nb; i++) { - float32x4_t srcv [8]; - float32x4_t asrcv[8]; - float32x4_t amaxv[8]; - - for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l); - for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]); - - for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]); - for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]); - for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]); - - const float amax = vmaxvq_f32(amaxv[0]); - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 
1.0f/d : 0.0f; - - y[i].d = d; - - int32x4_t accv0 = vdupq_n_s32(0); - int32x4_t accv1 = vdupq_n_s32(0); - - // low half - for (int l = 0; l < 4; l++) { - const float32x4_t v = vmulq_n_f32(srcv[l], id); - const int32x4_t vi = vcvtnq_s32_f32(v); - - y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3); - - accv0 = vaddq_s32(accv0, vi); - } - - // high half - for (int l = 4; l < 8; l++) { - const float32x4_t v = vmulq_n_f32(srcv[l], id); - const int32x4_t vi = vcvtnq_s32_f32(v); - - y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0); - y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1); - y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2); - y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3); - - accv1 = vaddq_s32(accv1, vi); - } - - const int32_t sum0 = vaddvq_s32(accv0); - const int32_t sum1 = vaddvq_s32(accv1); - - y[i].s0 = d * sum0; - y[i].s1 = d * sum1; - } -#elif defined(__AVX2__) || defined(__AVX__) - for (int i = 0; i < nb; i++) { - // Load elements into 4 AVX vectors - __m256 v0 = _mm256_loadu_ps( x ); - __m256 v1 = _mm256_loadu_ps( x + 8 ); - __m256 v2 = _mm256_loadu_ps( x + 16 ); - __m256 v3 = _mm256_loadu_ps( x + 24 ); - x += 32; - - // Compute max(abs(e)) for the block - const __m256 signBit = _mm256_set1_ps( -0.0f ); - __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); - maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); - - __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); - max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); - max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); - const float maxScalar = _mm_cvtss_f32( max4 ); - - // Quantize these floats - const float d = maxScalar / 127.f; - y[i].d = d; - const float id = ( maxScalar != 0.0f ) ? 
127.f / maxScalar : 0.0f; - const __m256 mul = _mm256_set1_ps( id ); - - // Apply the multiplier - v0 = _mm256_mul_ps( v0, mul ); - v1 = _mm256_mul_ps( v1, mul ); - v2 = _mm256_mul_ps( v2, mul ); - v3 = _mm256_mul_ps( v3, mul ); - - // Round to nearest integer - v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); - v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); - v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); - v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); - - // Convert floats to integers - __m256i i0 = _mm256_cvtps_epi32( v0 ); - __m256i i1 = _mm256_cvtps_epi32( v1 ); - __m256i i2 = _mm256_cvtps_epi32( v2 ); - __m256i i3 = _mm256_cvtps_epi32( v3 ); - -#if defined(__AVX2__) - // Compute the sum of the quants and set y[i].s - //y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))); - y[i].s0 = d * hsum_i32_8(_mm256_add_epi32(i0, i1)); - y[i].s1 = d * hsum_i32_8(_mm256_add_epi32(i2, i3)); - - // Convert int32 to int16 - i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 - i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 - // Convert int16 to int8 - i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 - - // We got our precious signed bytes, but the order is now wrong - // These AVX2 pack instructions process 16-byte pieces independently - // The following instruction is fixing the order - const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); - i0 = _mm256_permutevar8x32_epi32( i0, perm ); - - _mm256_storeu_si256((__m256i *)y[i].qs, i0); -#else - // Since we don't have in AVX some necessary functions, - // we split the registers in half and call AVX2 analogs from SSE - __m128i ni0 = _mm256_castsi256_si128( i0 ); - __m128i ni1 = _mm256_extractf128_si256( i0, 1); - __m128i ni2 = _mm256_castsi256_si128( i1 ); - __m128i ni3 = _mm256_extractf128_si256( i1, 1); - __m128i ni4 = _mm256_castsi256_si128( i2 ); - __m128i ni5 = _mm256_extractf128_si256( i2, 1); - __m128i ni6 = _mm256_castsi256_si128( i3 ); - __m128i ni7 = _mm256_extractf128_si256( i3, 1); - - // Compute the sum of the quants and set y[i].s - const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3)); - const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7)); - y[i].s0 = d * hsum_i32_4(s0); - y[i].s1 = d * hsum_i32_4(s1); - - // Convert int32 to int16 - ni0 = _mm_packs_epi32( ni0, ni1 ); - ni2 = _mm_packs_epi32( ni2, ni3 ); - ni4 = _mm_packs_epi32( ni4, ni5 ); - ni6 = _mm_packs_epi32( ni6, ni7 ); - // Convert int16 to int8 - ni0 = _mm_packs_epi16( ni0, ni2 ); - ni4 = _mm_packs_epi16( ni4, ni6 ); - - _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); - _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); -#endif - } -#else - // scalar - quantize_row_q8_0_reference(x, y, k); -#endif -} - -static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, int k) { - assert(k % QK4_0 == 0); - const int nb = k / QK4_0; - - const block_q4_0 * restrict x = vx; - -#if defined(__AVX2__) - for (int i = 0; i < nb; i++) { - // scale factor - const __m256 d_v = _mm256_broadcast_ss(&x[i].d); - - const uint8_t * restrict pp = x[i].qs; - - for (int l = 0; l < QK4_0; l += 32) { - // Load 32x4-bit integers into 32x8-bit integers - __m256i vx8 = bytes_from_nibbles_32(pp+l/2); - - // Subtract 8 from the integers - vx8 = _mm256_sub_epi8(vx8, _mm256_set1_epi8(8)); - 
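            // Q4_0 packs two unsigned 4-bit values per byte in [0, 15]; subtracting 8
            // recenters them to the signed range [-8, 7] so that, after widening to
            // float and multiplying by the block scale d, this matches the scalar
            // path below: y = (nibble - 8) * d.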
- // Convert to 16-bit int - const __m256i vx16_lo = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 0)); - const __m256i vx16_hi = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 1)); - - // Convert to 32-bit int -> float 32 - const __m256 vf[4] = { - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 0))), - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 1))), - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 0))), - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 1))) - }; - - // Scale and store - for (int j = 0; j < 4; j++) { - const __m256 result = _mm256_mul_ps(vf[j], d_v); - _mm256_storeu_ps(y + i * QK4_0 + l + j*8, result); - } - } - } -#elif defined(__ARM_NEON) - for (int i = 0; i < nb; i++) { - const float32x4_t vd = vdupq_n_f32(x[i].d); - - const uint8_t * restrict pp = x[i].qs; - - for (int l = 0; l < QK4_0; l += 16) { - // Load 16x4-bit integers into 8x8-bit integers - const uint8x8_t v8 = vld1_u8(pp + l/2); - - // Expand 4-bit qs to 8-bit bytes - const uint8x8_t v0 = vand_u8(v8, vdup_n_u8(0x0f)); - const uint8x8_t v1 = vshr_n_u8(v8, 4); - - // Convert to signed 8-bit integers - const int8x8_t vs_0 = vreinterpret_s8_u8(v0); - const int8x8_t vs_1 = vreinterpret_s8_u8(v1); - - // Subtract 8 from each byte - const int8x8_t vb_0 = vsub_s8(vs_0, vdup_n_s8(8)); - const int8x8_t vb_1 = vsub_s8(vs_1, vdup_n_s8(8)); - - // Interleave and combine - const int8x8_t vx_0 = vzip1_s8(vb_0, vb_1); - const int8x8_t vx_1 = vzip2_s8(vb_0, vb_1); - - const int8x16_t vq = vcombine_s8(vx_0, vx_1); - - // convert to 2x int16x8_t - const int16x8_t vi_0 = vmovl_s8(vget_low_s8 (vq)); - const int16x8_t vi_1 = vmovl_s8(vget_high_s8(vq)); - - // convert to 4x float32x4_t - const float32x4_t vf_0 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vi_0))); - const float32x4_t vf_1 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vi_0))); - const float32x4_t vf_2 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vi_1))); - const float32x4_t vf_3 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vi_1))); - - // Multiply by d - const float32x4_t r0 = vmulq_f32(vf_0, vd); - const float32x4_t r1 = vmulq_f32(vf_1, vd); - const float32x4_t r2 = vmulq_f32(vf_2, vd); - const float32x4_t r3 = vmulq_f32(vf_3, vd); - - // Store - vst1q_f32(y + i*QK4_0 + l + 0, r0); - vst1q_f32(y + i*QK4_0 + l + 4, r1); - vst1q_f32(y + i*QK4_0 + l + 8, r2); - vst1q_f32(y + i*QK4_0 + l + 12, r3); - } - } -#else - // scalar - for (int i = 0; i < nb; i++) { - const float d = x[i].d; - - const uint8_t * restrict pp = x[i].qs; - - for (int l = 0; l < QK4_0; l += 2) { - const uint8_t vi = pp[l/2]; - - const int8_t vi0 = vi & 0xf; - const int8_t vi1 = vi >> 4; - - const float v0 = (vi0 - 8)*d; - const float v1 = (vi1 - 8)*d; - - //printf("d = %f, vi = %d, vi0 = %d, vi1 = %d, v0 = %f, v1 = %f\n", d, vi, vi0, vi1, v0, v1); - - y[i*QK4_0 + l + 0] = v0; - y[i*QK4_0 + l + 1] = v1; - - assert(!isnan(y[i*QK4_0 + l + 0])); - assert(!isnan(y[i*QK4_0 + l + 1])); - } - } -#endif -} - -static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, int k) { - assert(k % QK4_1 == 0); - const int nb = k / QK4_1; - - const block_q4_1 * restrict x = vx; - -#if defined(__AVX2__) - for (int i = 0; i < nb; i++) { - const __m256 d_v = _mm256_broadcast_ss(&x[i].d); - const __m256 d_m = _mm256_broadcast_ss(&x[i].m); - - const uint8_t * restrict pp = x[i].qs; - - for (int l = 0; l < QK4_1; l += 32) { - // Load 32x4-bit integers into 32x8-bit integers - __m256i vx8 = 
bytes_from_nibbles_32(pp+l/2); - - // Convert to 16-bit int - const __m256i vx16_lo = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 0)); - const __m256i vx16_hi = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 1)); - - // Convert to 32-bit int -> float 32 - const __m256 vf[4] = { - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 0))), - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 1))), - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 0))), - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 1))) - }; - - // Scale, add m and store - for (int j = 0; j < 4; j++) { - const __m256 result = _mm256_add_ps(_mm256_mul_ps(vf[j], d_v), d_m); - _mm256_storeu_ps(y + i * QK4_1 + l + j*8, result); - } - } - } -#elif defined(__ARM_NEON) - for (int i = 0; i < nb; i++) { - const float32x4_t vd = vdupq_n_f32(x[i].d); - const float32x4_t vm = vdupq_n_f32(x[i].m); - - const uint8_t * restrict pp = x[i].qs; - - for (int l = 0; l < QK4_1; l += 16) { - // Load 16x4-bit integers into 8x8-bit integers - const uint8x8_t v8 = vld1_u8(pp + l/2); - - // Expand 4-bit qs to 8-bit bytes - const uint8x8_t v0 = vand_u8(v8, vdup_n_u8(0x0f)); - const uint8x8_t v1 = vshr_n_u8(v8, 4); - - // Interleave and combine - const uint8x8_t vx_0 = vzip1_u8(v0, v1); - const uint8x8_t vx_1 = vzip2_u8(v0, v1); - - const uint8x16_t vq = vcombine_u8(vx_0, vx_1); - - // convert to 2x uint16x8_t - const uint16x8_t vi_0 = vmovl_u8(vget_low_u8 (vq)); - const uint16x8_t vi_1 = vmovl_u8(vget_high_u8(vq)); - - // convert to 4x float32x4_t - const float32x4_t vf_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (vi_0))); - const float32x4_t vf_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vi_0))); - const float32x4_t vf_2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (vi_1))); - const float32x4_t vf_3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vi_1))); - - // multiply by d and add m - const float32x4_t r0 = vmlaq_f32(vm, vf_0, vd); - const float32x4_t r1 = vmlaq_f32(vm, vf_1, vd); - const float32x4_t r2 = vmlaq_f32(vm, vf_2, vd); - const float32x4_t r3 = vmlaq_f32(vm, vf_3, vd); - - // Store - vst1q_f32(y + i*QK4_1 + l + 0, r0); - vst1q_f32(y + i*QK4_1 + l + 4, r1); - vst1q_f32(y + i*QK4_1 + l + 8, r2); - vst1q_f32(y + i*QK4_1 + l + 12, r3); - } - } -#else - for (int i = 0; i < nb; i++) { - const float d = x[i].d; - const float m = x[i].m; - - const uint8_t * restrict pp = x[i].qs; - - for (int l = 0; l < QK4_1; l += 2) { - const uint8_t vi = pp[l/2]; - - const int8_t vi0 = vi & 0xf; - const int8_t vi1 = vi >> 4; - - const float v0 = vi0*d + m; - const float v1 = vi1*d + m; - - y[i*QK4_1 + l + 0] = v0; - y[i*QK4_1 + l + 1] = v1; - - assert(!isnan(y[i*QK4_1 + l + 0])); - assert(!isnan(y[i*QK4_1 + l + 1])); - } - } -#endif -} - -static void dequantize_row_q4_2(const void * restrict vx, float * restrict y, int k) { - assert(k % QK4_2 == 0); - const int nb = k / QK4_2; - - const block_q4_2 * restrict x = vx; - - for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * restrict pp = x[i].qs; - - for (int l = 0; l < QK4_2; l += 2) { - const uint8_t vi = pp[l/2]; - - const int8_t vi0 = vi & 0xf; - const int8_t vi1 = vi >> 4; - - const float v0 = (vi0 - 8)*d; - const float v1 = (vi1 - 8)*d; - - y[i*QK4_2 + l + 0] = v0; - y[i*QK4_2 + l + 1] = v1; - - assert(!isnan(y[i*QK4_2 + l + 0])); - assert(!isnan(y[i*QK4_2 + l + 1])); - } - } -} - -static void dequantize_row_q4_3(const void * restrict vx, float * restrict y, int k) { - 
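    // Q4_3 blocks carry both an fp16 scale d and an fp16 minimum m, so each 4-bit
    // quant q is reconstructed as q*d + m (no recentering by 8, unlike Q4_0/Q4_2).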
assert(k % QK4_3 == 0); - const int nb = k / QK4_3; - - const block_q4_3 * restrict x = vx; - - for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d); - const float m = GGML_FP16_TO_FP32(x[i].m); - - const uint8_t * restrict pp = x[i].qs; - - for (int l = 0; l < QK4_3; l += 2) { - const uint8_t vi = pp[l/2]; - - const int8_t vi0 = vi & 0xf; - const int8_t vi1 = vi >> 4; - - const float v0 = vi0*d + m; - const float v1 = vi1*d + m; - - y[i*QK4_3 + l + 0] = v0; - y[i*QK4_3 + l + 1] = v1; - - assert(!isnan(y[i*QK4_3 + l + 0])); - assert(!isnan(y[i*QK4_3 + l + 1])); - } - } -} - -static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static void ggml_vec_dot_q4_3_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); - -static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { - [GGML_TYPE_Q4_0] = { - .dequantize_row_q = dequantize_row_q4_0, - .quantize_row_q = quantize_row_q4_0, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference, - .quantize_row_q_dot = quantize_row_q8_0, - .vec_dot_q = ggml_vec_dot_q4_0_q8_0, - }, - [GGML_TYPE_Q4_1] = { - .dequantize_row_q = dequantize_row_q4_1, - .quantize_row_q = quantize_row_q4_1, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference, - .quantize_row_q_dot = quantize_row_q8_0, - .vec_dot_q = ggml_vec_dot_q4_1_q8_0, - }, - [GGML_TYPE_Q4_2] = { - .dequantize_row_q = dequantize_row_q4_2, - .quantize_row_q = quantize_row_q4_2, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_2_rmse, //quantize_row_q4_2_reference, - .quantize_row_q_dot = quantize_row_q8_0, - .vec_dot_q = ggml_vec_dot_q4_2_q8_0, - }, - [GGML_TYPE_Q4_3] = { - .dequantize_row_q = dequantize_row_q4_3, - .quantize_row_q = quantize_row_q4_3, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_3_reference, // TODO: RMSE optimization - .quantize_row_q_dot = quantize_row_q8_0, - .vec_dot_q = ggml_vec_dot_q4_3_q8_0, - }, - [GGML_TYPE_Q8_0] = { - .dequantize_row_q = NULL, // TODO - .quantize_row_q = quantize_row_q8_0, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_0_reference, - .quantize_row_q_dot = quantize_row_q8_0, - .vec_dot_q = NULL, // TODO - }, -}; - -// For internal test use -quantize_fns_t ggml_internal_get_quantize_fn(size_t i) { - GGML_ASSERT(i < GGML_TYPE_COUNT); - return quantize_fns[i]; -} - - -// -// simd mappings -// - -// we define a common set of C macros which map to specific intrinsics based on the current architecture -// we then implement the fundamental computation operations below using only these macros -// adding support for new architectures requires to define the corresponding SIMD macros -// -// GGML_F32_STEP / GGML_F16_STEP -// number of elements to process in a single step -// -// GGML_F32_EPR / GGML_F16_EPR -// number of elements to fit in a single register -// - -#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA) - -#define GGML_SIMD - -// F32 NEON - -#define GGML_F32_STEP 16 -#define GGML_F32_EPR 4 - -#define GGML_F32x4 float32x4_t -#define GGML_F32x4_ZERO vdupq_n_f32(0.0f) -#define GGML_F32x4_SET1(x) vdupq_n_f32(x) -#define GGML_F32x4_LOAD vld1q_f32 -#define GGML_F32x4_STORE vst1q_f32 -#define 
GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c) -#define GGML_F32x4_ADD vaddq_f32 -#define GGML_F32x4_MUL vmulq_f32 -#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) -#define GGML_F32x4_REDUCE(res, x) \ -{ \ - for (int i = 0; i < GGML_F32_ARR/2; ++i) { \ - x[2*i] = vaddq_f32(x[2*i], x[2*i+1]); \ - } \ - for (int i = 0; i < GGML_F32_ARR/4; ++i) { \ - x[4*i] = vaddq_f32(x[4*i], x[4*i+2]); \ - } \ - for (int i = 0; i < GGML_F32_ARR/8; ++i) { \ - x[8*i] = vaddq_f32(x[8*i], x[8*i+4]); \ - } \ - res = GGML_F32x4_REDUCE_ONE(x[0]); \ -} - -#define GGML_F32_VEC GGML_F32x4 -#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO -#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 -#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD -#define GGML_F32_VEC_STORE GGML_F32x4_STORE -#define GGML_F32_VEC_FMA GGML_F32x4_FMA -#define GGML_F32_VEC_ADD GGML_F32x4_ADD -#define GGML_F32_VEC_MUL GGML_F32x4_MUL -#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE - -// F16 NEON - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - #define GGML_F16_STEP 32 - #define GGML_F16_EPR 8 - - #define GGML_F16x8 float16x8_t - #define GGML_F16x8_ZERO vdupq_n_f16(0.0f) - #define GGML_F16x8_SET1(x) vdupq_n_f16(x) - #define GGML_F16x8_LOAD vld1q_f16 - #define GGML_F16x8_STORE vst1q_f16 - #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c) - #define GGML_F16x8_ADD vaddq_f16 - #define GGML_F16x8_MUL vmulq_f16 - #define GGML_F16x8_REDUCE(res, x) \ - { \ - for (int i = 0; i < GGML_F16_ARR/2; ++i) { \ - x[2*i] = vaddq_f16(x[2*i], x[2*i+1]); \ - } \ - for (int i = 0; i < GGML_F16_ARR/4; ++i) { \ - x[4*i] = vaddq_f16(x[4*i], x[4*i+2]); \ - } \ - for (int i = 0; i < GGML_F16_ARR/8; ++i) { \ - x[8*i] = vaddq_f16(x[8*i], x[8*i+4]); \ - } \ - const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \ - const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \ - res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ - } - - #define GGML_F16_VEC GGML_F16x8 - #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO - #define GGML_F16_VEC_SET1 GGML_F16x8_SET1 - #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p) - #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE(p, r[i]) - #define GGML_F16_VEC_FMA GGML_F16x8_FMA - #define GGML_F16_VEC_ADD GGML_F16x8_ADD - #define GGML_F16_VEC_MUL GGML_F16x8_MUL - #define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE -#else - // if FP16 vector arithmetic is not supported, we use FP32 instead - // and take advantage of the vcvt_ functions to convert to/from FP16 - - #define GGML_F16_STEP 16 - #define GGML_F16_EPR 4 - - #define GGML_F32Cx4 float32x4_t - #define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f) - #define GGML_F32Cx4_SET1(x) vdupq_n_f32(x) - #define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16(x)) - #define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y)) - #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c) - #define GGML_F32Cx4_ADD vaddq_f32 - #define GGML_F32Cx4_MUL vmulq_f32 - #define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE - - #define GGML_F16_VEC GGML_F32Cx4 - #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO - #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 - #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) - #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i]) - #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA - #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD - #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL - #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE -#endif - -#elif defined(__AVX__) - -#define GGML_SIMD - -// F32 AVX - -#define GGML_F32_STEP 32 -#define GGML_F32_EPR 8 - -#define GGML_F32x8 __m256 -#define GGML_F32x8_ZERO _mm256_setzero_ps() -#define GGML_F32x8_SET1(x) 
_mm256_set1_ps(x) -#define GGML_F32x8_LOAD _mm256_loadu_ps -#define GGML_F32x8_STORE _mm256_storeu_ps -#if defined(__FMA__) - #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a) -#else - #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a) -#endif -#define GGML_F32x8_ADD _mm256_add_ps -#define GGML_F32x8_MUL _mm256_mul_ps -#define GGML_F32x8_REDUCE(res, x) \ -{ \ - for (int i = 0; i < GGML_F32_ARR/2; ++i) { \ - x[2*i] = _mm256_add_ps(x[2*i], x[2*i+1]); \ - } \ - for (int i = 0; i < GGML_F32_ARR/4; ++i) { \ - x[4*i] = _mm256_add_ps(x[4*i], x[4*i+2]); \ - } \ - for (int i = 0; i < GGML_F32_ARR/8; ++i) { \ - x[8*i] = _mm256_add_ps(x[8*i], x[8*i+4]); \ - } \ - const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \ - _mm256_extractf128_ps(x[0], 1)); \ - const __m128 t1 = _mm_hadd_ps(t0, t0); \ - res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \ -} -// TODO: is this optimal ? - -#define GGML_F32_VEC GGML_F32x8 -#define GGML_F32_VEC_ZERO GGML_F32x8_ZERO -#define GGML_F32_VEC_SET1 GGML_F32x8_SET1 -#define GGML_F32_VEC_LOAD GGML_F32x8_LOAD -#define GGML_F32_VEC_STORE GGML_F32x8_STORE -#define GGML_F32_VEC_FMA GGML_F32x8_FMA -#define GGML_F32_VEC_ADD GGML_F32x8_ADD -#define GGML_F32_VEC_MUL GGML_F32x8_MUL -#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE - -// F16 AVX - -#define GGML_F16_STEP 32 -#define GGML_F16_EPR 8 - -// F16 arithmetic is not supported by AVX, so we use F32 instead - -#define GGML_F32Cx8 __m256 -#define GGML_F32Cx8_ZERO _mm256_setzero_ps() -#define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x) - -#if defined(__F16C__) -// the _mm256_cvt intrinsics require F16C -#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x))) -#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0)) -#else -static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) { - float tmp[8]; - - for (int i = 0; i < 8; i++) - tmp[i] = GGML_FP16_TO_FP32(x[i]); - - return _mm256_loadu_ps(tmp); -} -static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) { - float arr[8]; - - _mm256_storeu_ps(arr, y); - - for (int i = 0; i < 8; i++) - x[i] = GGML_FP32_TO_FP16(arr[i]); -} -#define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x) -#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y) -#endif - -#define GGML_F32Cx8_FMA GGML_F32x8_FMA -#define GGML_F32Cx8_ADD _mm256_add_ps -#define GGML_F32Cx8_MUL _mm256_mul_ps -#define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE - -#define GGML_F16_VEC GGML_F32Cx8 -#define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO -#define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1 -#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p) -#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i]) -#define GGML_F16_VEC_FMA GGML_F32Cx8_FMA -#define GGML_F16_VEC_ADD GGML_F32Cx8_ADD -#define GGML_F16_VEC_MUL GGML_F32Cx8_MUL -#define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE - -#elif defined(__POWER9_VECTOR__) - -#define GGML_SIMD - -// F32 POWER9 - -#define GGML_F32_STEP 32 -#define GGML_F32_EPR 4 - -#define GGML_F32x4 vector float -#define GGML_F32x4_ZERO 0.0f -#define GGML_F32x4_SET1 vec_splats -#define GGML_F32x4_LOAD(p) vec_xl(0, p) -#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p) -#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a) -#define GGML_F32x4_ADD vec_add -#define GGML_F32x4_MUL vec_mul -#define GGML_F32x4_REDUCE(res, x) \ -{ \ - for (int i = 0; i < GGML_F32_ARR/2; ++i) { \ - x[2*i] = vec_add(x[2*i], x[2*i+1]); \ - } \ - for (int i = 0; i < GGML_F32_ARR/4; ++i) { \ - x[4*i] = vec_add(x[4*i], x[4*i+2]); \ - } \ - for (int i = 0; i < GGML_F32_ARR/8; ++i) { \ - 
x[8*i] = vec_add(x[8*i], x[8*i+4]); \ - } \ - res = vec_extract(x[0], 0) + \ - vec_extract(x[0], 1) + \ - vec_extract(x[0], 2) + \ - vec_extract(x[0], 3); \ -} - -#define GGML_F32_VEC GGML_F32x4 -#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO -#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 -#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD -#define GGML_F32_VEC_STORE GGML_F32x4_STORE -#define GGML_F32_VEC_FMA GGML_F32x4_FMA -#define GGML_F32_VEC_ADD GGML_F32x4_ADD -#define GGML_F32_VEC_MUL GGML_F32x4_MUL -#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE - -// F16 POWER9 -#define GGML_F16_STEP GGML_F32_STEP -#define GGML_F16_EPR GGML_F32_EPR -#define GGML_F16_VEC GGML_F32x4 -#define GGML_F16_VEC_ZERO GGML_F32x4_ZERO -#define GGML_F16_VEC_SET1 GGML_F32x4_SET1 -#define GGML_F16_VEC_FMA GGML_F32x4_FMA -#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE -// Use vec_xl, not vec_ld, in case the load address is not aligned. -#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \ - vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \ - vec_extract_fp32_from_shortl(vec_xl(0, p)) -#define GGML_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i] -#define GGML_F16_VEC_STORE(p, r, i) \ - if (i & 0x1) \ - vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)], \ - r[i - GGML_ENDIAN_BYTE(0)]), \ - 0, p - GGML_F16_EPR) - -#elif defined(__wasm_simd128__) - -#define GGML_SIMD - -// F32 WASM - -#define GGML_F32_STEP 16 -#define GGML_F32_EPR 4 - -#define GGML_F32x4 v128_t -#define GGML_F32x4_ZERO wasm_f32x4_splat(0.0f) -#define GGML_F32x4_SET1(x) wasm_f32x4_splat(x) -#define GGML_F32x4_LOAD wasm_v128_load -#define GGML_F32x4_STORE wasm_v128_store -#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a) -#define GGML_F32x4_ADD wasm_f32x4_add -#define GGML_F32x4_MUL wasm_f32x4_mul -#define GGML_F32x4_REDUCE(res, x) \ -{ \ - for (int i = 0; i < GGML_F32_ARR/2; ++i) { \ - x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \ - } \ - for (int i = 0; i < GGML_F32_ARR/4; ++i) { \ - x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \ - } \ - for (int i = 0; i < GGML_F32_ARR/8; ++i) { \ - x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \ - } \ - res = wasm_f32x4_extract_lane(x[0], 0) + \ - wasm_f32x4_extract_lane(x[0], 1) + \ - wasm_f32x4_extract_lane(x[0], 2) + \ - wasm_f32x4_extract_lane(x[0], 3); \ -} - -#define GGML_F32_VEC GGML_F32x4 -#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO -#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 -#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD -#define GGML_F32_VEC_STORE GGML_F32x4_STORE -#define GGML_F32_VEC_FMA GGML_F32x4_FMA -#define GGML_F32_VEC_ADD GGML_F32x4_ADD -#define GGML_F32_VEC_MUL GGML_F32x4_MUL -#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE - -// F16 WASM - -#define GGML_F16_STEP 16 -#define GGML_F16_EPR 4 - -inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) { - float tmp[4]; - - tmp[0] = GGML_FP16_TO_FP32(p[0]); - tmp[1] = GGML_FP16_TO_FP32(p[1]); - tmp[2] = GGML_FP16_TO_FP32(p[2]); - tmp[3] = GGML_FP16_TO_FP32(p[3]); - - return wasm_v128_load(tmp); -} - -inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) { - float tmp[4]; - - wasm_v128_store(tmp, x); - - p[0] = GGML_FP32_TO_FP16(tmp[0]); - p[1] = GGML_FP32_TO_FP16(tmp[1]); - p[2] = GGML_FP32_TO_FP16(tmp[2]); - p[3] = GGML_FP32_TO_FP16(tmp[3]); -} - -#define GGML_F16x4 v128_t -#define GGML_F16x4_ZERO wasm_f32x4_splat(0.0f) -#define GGML_F16x4_SET1(x) wasm_f32x4_splat(x) -#define GGML_F16x4_LOAD(x) __wasm_f16x4_load(x) -#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y) -#define GGML_F16x4_FMA GGML_F32x4_FMA -#define 
GGML_F16x4_ADD wasm_f32x4_add -#define GGML_F16x4_MUL wasm_f32x4_mul -#define GGML_F16x4_REDUCE(res, x) \ -{ \ - for (int i = 0; i < GGML_F16_ARR/2; ++i) { \ - x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \ - } \ - for (int i = 0; i < GGML_F16_ARR/4; ++i) { \ - x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \ - } \ - for (int i = 0; i < GGML_F16_ARR/8; ++i) { \ - x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \ - } \ - res = wasm_f32x4_extract_lane(x[0], 0) + \ - wasm_f32x4_extract_lane(x[0], 1) + \ - wasm_f32x4_extract_lane(x[0], 2) + \ - wasm_f32x4_extract_lane(x[0], 3); \ -} - -#define GGML_F16_VEC GGML_F16x4 -#define GGML_F16_VEC_ZERO GGML_F16x4_ZERO -#define GGML_F16_VEC_SET1 GGML_F16x4_SET1 -#define GGML_F16_VEC_LOAD(p, i) GGML_F16x4_LOAD(p) -#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i]) -#define GGML_F16_VEC_FMA GGML_F16x4_FMA -#define GGML_F16_VEC_ADD GGML_F16x4_ADD -#define GGML_F16_VEC_MUL GGML_F16x4_MUL -#define GGML_F16_VEC_REDUCE GGML_F16x4_REDUCE - -#elif defined(__SSE3__) - -#define GGML_SIMD - -// F32 SSE - -#define GGML_F32_STEP 32 -#define GGML_F32_EPR 4 - -#define GGML_F32x4 __m128 -#define GGML_F32x4_ZERO _mm_setzero_ps() -#define GGML_F32x4_SET1(x) _mm_set1_ps(x) -#define GGML_F32x4_LOAD _mm_loadu_ps -#define GGML_F32x4_STORE _mm_storeu_ps -#if defined(__FMA__) - // TODO: Does this work? - #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a) -#else - #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a) -#endif -#define GGML_F32x4_ADD _mm_add_ps -#define GGML_F32x4_MUL _mm_mul_ps -#define GGML_F32x4_REDUCE(res, x) \ -{ \ - for (int i = 0; i < GGML_F32_ARR/2; ++i) { \ - x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]); \ - } \ - for (int i = 0; i < GGML_F32_ARR/4; ++i) { \ - x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]); \ - } \ - for (int i = 0; i < GGML_F32_ARR/8; ++i) { \ - x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]); \ - } \ - const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \ - res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \ -} -// TODO: is this optimal ? 
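// With GGML_F32_STEP = 32 and GGML_F32_EPR = 4, each SSE step works on
// GGML_F32_ARR = 32/4 = 8 accumulator registers; GGML_F32x4_REDUCE folds those
// 8 vectors pairwise (8 -> 4 -> 2 -> 1) before the final horizontal add into a scalar.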
- -#define GGML_F32_VEC GGML_F32x4 -#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO -#define GGML_F32_VEC_SET1 GGML_F32x4_SET1 -#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD -#define GGML_F32_VEC_STORE GGML_F32x4_STORE -#define GGML_F32_VEC_FMA GGML_F32x4_FMA -#define GGML_F32_VEC_ADD GGML_F32x4_ADD -#define GGML_F32_VEC_MUL GGML_F32x4_MUL -#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE - -// F16 SSE - -#define GGML_F16_STEP 32 -#define GGML_F16_EPR 4 - -static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) { - float tmp[4]; - - tmp[0] = GGML_FP16_TO_FP32(x[0]); - tmp[1] = GGML_FP16_TO_FP32(x[1]); - tmp[2] = GGML_FP16_TO_FP32(x[2]); - tmp[3] = GGML_FP16_TO_FP32(x[3]); - - return _mm_loadu_ps(tmp); -} - -static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) { - float arr[4]; - - _mm_storeu_ps(arr, y); - - x[0] = GGML_FP32_TO_FP16(arr[0]); - x[1] = GGML_FP32_TO_FP16(arr[1]); - x[2] = GGML_FP32_TO_FP16(arr[2]); - x[3] = GGML_FP32_TO_FP16(arr[3]); -} - -#define GGML_F32Cx4 __m128 -#define GGML_F32Cx4_ZERO _mm_setzero_ps() -#define GGML_F32Cx4_SET1(x) _mm_set1_ps(x) -#define GGML_F32Cx4_LOAD(x) __sse_f16x4_load(x) -#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y) -#define GGML_F32Cx4_FMA GGML_F32x4_FMA -#define GGML_F32Cx4_ADD _mm_add_ps -#define GGML_F32Cx4_MUL _mm_mul_ps -#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE - -#define GGML_F16_VEC GGML_F32Cx4 -#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO -#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 -#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) -#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i]) -#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA -#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD -#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL -#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE - -#endif - -// GGML_F32_ARR / GGML_F16_ARR -// number of registers to use per step -#ifdef GGML_SIMD -#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR) -#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR) -#endif - -// -// fundamental operations -// - -inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; } - -inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } - -inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } - -inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } - -inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } -inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; } -inline static void ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; } -inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; } -inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; } -inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; } -inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; } -inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = 
x[i]*y[i]; } -inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } - -inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) { -#ifdef GGML_SIMD - float sumf = 0.0f; - const int np = (n & ~(GGML_F32_STEP - 1)); - - GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; - - GGML_F32_VEC ax[GGML_F32_ARR]; - GGML_F32_VEC ay[GGML_F32_ARR]; - - for (int i = 0; i < np; i += GGML_F32_STEP) { - for (int j = 0; j < GGML_F32_ARR; j++) { - ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); - ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); - - sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]); - } - } - - // reduce sum0..sum3 to sum0 - GGML_F32_VEC_REDUCE(sumf, sum); - - // leftovers - for (int i = np; i < n; ++i) { - sumf += x[i]*y[i]; - } -#else - // scalar - ggml_float sumf = 0.0; - for (int i = 0; i < n; ++i) { - sumf += (ggml_float)(x[i]*y[i]); - } -#endif - - *s = sumf; -} - -inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) { - ggml_float sumf = 0.0; - -#if defined(GGML_SIMD) - const int np = (n & ~(GGML_F16_STEP - 1)); - - GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO }; - - GGML_F16_VEC ax[GGML_F16_ARR]; - GGML_F16_VEC ay[GGML_F16_ARR]; - - for (int i = 0; i < np; i += GGML_F16_STEP) { - for (int j = 0; j < GGML_F16_ARR; j++) { - ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j); - ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); - - sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]); - } - } - - // reduce sum0..sum3 to sum0 - GGML_F16_VEC_REDUCE(sumf, sum); - - // leftovers - for (int i = np; i < n; ++i) { - sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); - } -#else - for (int i = 0; i < n; ++i) { - sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); - } -#endif - - *s = sumf; -} - -static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int nb = n / QK8_0; - - assert(n % QK8_0 == 0); - assert(nb % 2 == 0); - - const block_q4_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; - -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - float sum8 = 0; - - for (int i = 0; i < nb; i += 2) { - const block_q4_0 * restrict x0 = &x[i + 0]; - const block_q4_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i + 0]; - const block_q8_0 * restrict y1 = &y[i + 1]; - - sum8 += x0->d * (y0->s0 + y0->s1) + x1->d * (y1->s0 + y1->s1); - - const uint8x16_t m4b = vdupq_n_u8(0xf); - - const uint8x16_t v0_0 = vld1q_u8(x0->qs); - const uint8x16_t v0_1 = vld1q_u8(x1->qs); - - // 4-bit -> 8-bit - const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - - // load y - const int8x16_t v1_0l = vld1q_s8(y0->qs); - const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); - const int8x16_t v1_1l = vld1q_s8(y1->qs); - const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - - // interleave - const int8x16_t v1_0ls = vuzp1q_s8(v1_0l, v1_0h); - const int8x16_t v1_0hs = vuzp2q_s8(v1_0l, v1_0h); - const int8x16_t v1_1ls = vuzp1q_s8(v1_1l, v1_1h); - const int8x16_t v1_1hs = vuzp2q_s8(v1_1l, v1_1h); - -#if 
defined(__ARM_FEATURE_DOTPROD) - // dot product into int32x4_t - const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0ls), v0_0h, v1_0hs); - const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1ls), v0_1h, v1_1hs); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d); -#else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0ls)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0ls)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0h), vget_low_s8 (v1_0hs)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0h), vget_high_s8(v1_0hs)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1l), vget_low_s8 (v1_1ls)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1l), vget_high_s8(v1_1ls)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1h), vget_low_s8 (v1_1hs)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1h), vget_high_s8(v1_1hs)); - - const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); - const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); - const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); - const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d); -#endif - } - - *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) - 8 * sum8; -#elif defined(__AVX2__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - // Main loop - for (int i = 0; i < nb; ++i) { - /* Compute combined scale for the block */ - const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); - - __m256i bx = bytes_from_nibbles_32(x[i].qs); - - // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. - const __m256i off = _mm256_set1_epi8( 8 ); - bx = _mm256_sub_epi8( bx, off ); - - __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - - const __m256 q = mul_sum_i8_pairs_float(bx, by); - - /* Multiply q with scale and accumulate */ - acc = _mm256_fmadd_ps( d, q, acc ); - } - - *s = hsum_float_8(acc); -#elif defined(__AVX__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - // Main loop - for (int i = 0; i < nb; ++i) { - // Compute combined scale for the block - const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); - - __m128i i32[2]; - for (int j = 0; j < 2; ++j) { - // Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes - __m128i bx = bytes_from_nibbles_16(x[i].qs + 8*j); - __m128i by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16*j)); - - // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. 
- const __m128i off = _mm_set1_epi8( 8 ); - bx = _mm_sub_epi8( bx, off ); - - // Get absolute values of x vectors - const __m128i ax = _mm_sign_epi8(bx, bx); - - // Sign the values of the y vectors - const __m128i sy = _mm_sign_epi8(by, bx); - - // Perform multiplication and create 16-bit values - const __m128i dot = _mm_maddubs_epi16(ax, sy); - - const __m128i ones = _mm_set1_epi16(1); - i32[j] = _mm_madd_epi16(ones, dot); - } - - // Convert int32_t to float - __m256 p = _mm256_cvtepi32_ps( _mm256_set_m128i( i32[0], i32[1] )); - // Apply the scale, and accumulate - acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc); - } - - *s = hsum_float_8(acc); -#else - // scalar - float sumf = 0.0; - for (int i = 0; i < nb; i++) { - const float d0 = x[i].d; - const float d1 = y[i].d; - - const uint8_t * restrict p0 = x[i].qs; - const int8_t * restrict p1 = y[i].qs; - - int sumi = 0; - for (int j = 0; j < QK8_0/2; j++) { - const uint8_t v0 = p0[j]; - - const int i0 = (int8_t) (v0 & 0xf) - 8; - const int i1 = (int8_t) (v0 >> 4) - 8; - - const int i2 = p1[2*j + 0]; - const int i3 = p1[2*j + 1]; - - sumi += i0*i2 + i1*i3; - } - sumf += d0*d1*sumi; - } - *s = sumf; -#endif -} - -static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int nb = n / QK8_0; - - assert(n % QK8_0 == 0); - assert(nb % 2 == 0); - - const block_q4_1 * restrict x = vx; - const block_q8_0 * restrict y = vy; - - // TODO: add AVX / WASM SIMD / etc -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - float summs = 0; - - for (int i = 0; i < nb; i += 2) { - const block_q4_1 * restrict x0 = &x[i + 0]; - const block_q4_1 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i + 0]; - const block_q8_0 * restrict y1 = &y[i + 1]; - - summs += x0->m * (y0->s0 + y0->s1) + x1->m * (y1->s0 + y1->s1); - - const uint8x16_t m4b = vdupq_n_u8(0xf); - - const uint8x16_t v0_0 = vld1q_u8(x0->qs); - const uint8x16_t v0_1 = vld1q_u8(x1->qs); - - // 4-bit -> 8-bit - const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - - // interleave - const int8x16_t v0_0lz = vzip1q_s8(v0_0l, v0_0h); - const int8x16_t v0_0hz = vzip2q_s8(v0_0l, v0_0h); - const int8x16_t v0_1lz = vzip1q_s8(v0_1l, v0_1h); - const int8x16_t v0_1hz = vzip2q_s8(v0_1l, v0_1h); - - // load y - const int8x16_t v1_0l = vld1q_s8(y0->qs); - const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); - const int8x16_t v1_1l = vld1q_s8(y1->qs); - const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - -#if defined(__ARM_FEATURE_DOTPROD) - // dot product into int32x4_t - const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l), v0_0hz, v1_0h); - const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l), v0_1hz, v1_1h); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d); -#else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l)); - 
const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h)); - - const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); - const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); - const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); - const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d); -#endif - } - - *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs; -#elif defined(__AVX2__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - float summs = 0; - - // Main loop - for (int i = 0; i < nb; ++i) { - const float * d0 = &x[i].d; - const float * d1 = &y[i].d; - - summs += x[i].m * (y[i].s0 + y[i].s1); - - const __m256 d0v = _mm256_broadcast_ss( d0 ); - const __m256 d1v = _mm256_broadcast_ss( d1 ); - - // Compute combined scales - const __m256 d0d1 = _mm256_mul_ps( d0v, d1v ); - - // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes - const __m256i bx = bytes_from_nibbles_32(x[i].qs); - const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs ); - - const __m256 xy = mul_sum_i8_pairs_float(bx, by); - - // Accumulate d0*d1*x*y - acc = _mm256_fmadd_ps( d0d1, xy, acc ); - } - - *s = hsum_float_8(acc) + summs; -#else - // scalar - float sumf = 0.0; - for (int i = 0; i < nb; i++) { - const float d0 = x[i].d; - const float m0 = x[i].m; - const float d1 = y[i].d; - - const uint8_t * restrict p0 = x[i].qs; - const int8_t * restrict p1 = y[i].qs; - - // TODO: this is very slow .. 
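        // Scalar fallback: each Q4_1 nibble decodes to d0*q + m0 and each Q8_0 value
        // to d1*qy, so the per-block contribution accumulated into sumf is
        // sum_j (d0*q_j + m0) * (d1*qy_j).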
- for (int j = 0; j < QK8_0/2; j++) { - const uint8_t v0 = p0[j]; - - const float f0 = d0*(v0 & 0xf) + m0; - const float f1 = d0*(v0 >> 4) + m0; - - const float f2 = d1*p1[2*j + 0]; - const float f3 = d1*p1[2*j + 1]; - - sumf += f0*f2 + f1*f3; - } - } - *s = sumf; -#endif -} - -static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int nb = n / QK8_0; - - assert(n % QK8_0 == 0); - assert(nb % 2 == 0); - assert(QK8_0 == 2*QK4_2); - - const block_q4_2 * restrict x = vx; - const block_q8_0 * restrict y = vy; - -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - for (int i = 0; i < nb; i += 2) { - const block_q4_2 * restrict x0_0 = &x[2*(i + 0) + 0]; - const block_q4_2 * restrict x0_1 = &x[2*(i + 0) + 1]; - const block_q4_2 * restrict x1_0 = &x[2*(i + 1) + 0]; - const block_q4_2 * restrict x1_1 = &x[2*(i + 1) + 1]; - - const block_q8_0 * restrict y0 = &y[i + 0]; - const block_q8_0 * restrict y1 = &y[i + 1]; - - const uint8x16_t m4b = vdupq_n_u8(0xf); - const int8x16_t s8b = vdupq_n_s8(0x8); - - const uint8x16_t v0_0 = vcombine_u8(vld1_u8(x0_0->qs), vld1_u8(x0_1->qs)); - const uint8x16_t v0_1 = vcombine_u8(vld1_u8(x1_0->qs), vld1_u8(x1_1->qs)); - - // 4-bit -> 8-bit - const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b)); - const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b)); - const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4)); - - // sub 8 - const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b); - const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b); - const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b); - const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b); - - // interleave - const int8x16_t v0_0lz = vzip1q_s8(v0_0ls, v0_0hs); - const int8x16_t v0_0hz = vzip2q_s8(v0_0ls, v0_0hs); - const int8x16_t v0_1lz = vzip1q_s8(v0_1ls, v0_1hs); - const int8x16_t v0_1hz = vzip2q_s8(v0_1ls, v0_1hs); - - // load y - const int8x16_t v1_0l = vld1q_s8(y0->qs); - const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); - const int8x16_t v1_1l = vld1q_s8(y1->qs); - const int8x16_t v1_1h = vld1q_s8(y1->qs + 16); - -#if defined(__ARM_FEATURE_DOTPROD) - sumv0 = vmlaq_n_f32(sumv0, vaddq_f32( - vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l)), GGML_FP16_TO_FP32(x0_0->d)), - vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0hz, v1_0h)), GGML_FP16_TO_FP32(x0_1->d))), y0->d); - - sumv1 = vmlaq_n_f32(sumv1, vaddq_f32( - vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l)), GGML_FP16_TO_FP32(x1_0->d)), - vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1hz, v1_1h)), GGML_FP16_TO_FP32(x1_1->d))), y1->d); -#else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h)); - - const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l)); - const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l)); - const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h)); - const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h)); - - const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); - const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); - 
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h)); - const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h)); - - sumv0 = vmlaq_n_f32(sumv0, vaddq_f32( - vmulq_n_f32(vcvtq_f32_s32(pl0), GGML_FP16_TO_FP32(x0_0->d)), - vmulq_n_f32(vcvtq_f32_s32(ph0), GGML_FP16_TO_FP32(x0_1->d))), y0->d); - - sumv1 = vmlaq_n_f32(sumv1, vaddq_f32( - vmulq_n_f32(vcvtq_f32_s32(pl1), GGML_FP16_TO_FP32(x1_0->d)), - vmulq_n_f32(vcvtq_f32_s32(ph1), GGML_FP16_TO_FP32(x1_1->d))), y1->d); -#endif - } - - *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1); -#elif defined(__AVX2__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - // Main loop - for (int i = 0; i < nb; i++) { - /* Compute combined scale for the block */ - const __m128 d0 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 0].d)); - const __m128 d1 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 1].d)); - const __m256 d = _mm256_mul_ps(_mm256_set_m128(d1, d0), _mm256_broadcast_ss(&y[i].d)); - - __m128i bx0 = bytes_from_nibbles_16(x[2*i + 0].qs); - __m128i bx1 = bytes_from_nibbles_16(x[2*i + 1].qs); - __m256i bx = _mm256_set_m128i(bx1, bx0); - - // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. - const __m256i off = _mm256_set1_epi8(8); - bx = _mm256_sub_epi8(bx, off); - - __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - - const __m256 q = mul_sum_i8_pairs_float(bx, by); - - /* Multiply q with scale and accumulate */ - acc = _mm256_fmadd_ps(d, q, acc); - } - - *s = hsum_float_8(acc); -#else - // scalar - float sumf = 0.0; - for (int i = 0; i < nb; i++) { - const uint8_t * restrict x0 = x[2*i + 0].qs; - const uint8_t * restrict x1 = x[2*i + 1].qs; - const int8_t * restrict y0 = y[i].qs; - - const float d0 = GGML_FP16_TO_FP32(x[2*i + 0].d); - const float d1 = GGML_FP16_TO_FP32(x[2*i + 1].d); - - int sumi_0 = 0; - int sumi_1 = 0; - - for (int j = 0; j < QK8_0/4; j++) { - const uint8_t v0 = x0[j]; - const uint8_t v1 = x1[j]; - - const int i0_0 = (int8_t) (v0 & 0xf) - 8; - const int i1_0 = (int8_t) (v0 >> 4) - 8; - - const int i0_1 = (int8_t) (v1 & 0xf) - 8; - const int i1_1 = (int8_t) (v1 >> 4) - 8; - - const int i2_0 = y0[2*j + 0]; - const int i3_0 = y0[2*j + 1]; - - const int i2_1 = y0[2*(j + QK8_0/4) + 0]; - const int i3_1 = y0[2*(j + QK8_0/4) + 1]; - - sumi_0 += i0_0*i2_0 + i1_0*i3_0; - sumi_1 += i0_1*i2_1 + i1_1*i3_1; - } - - sumf += (d0 * y[i].d) * sumi_0; - sumf += (d1 * y[i].d) * sumi_1; - } - *s = sumf; -#endif -} - -static void ggml_vec_dot_q4_3_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { - const int nb = n / QK8_0; - - assert(n % QK8_0 == 0); - assert(nb % 2 == 0); - assert(QK8_0 == 2*QK4_2); - - const block_q4_3 * restrict x = vx; - const block_q8_0 * restrict y = vy; - -#if defined(__ARM_NEON) - float32x4_t sumv0 = vdupq_n_f32(0.0f); - float32x4_t sumv1 = vdupq_n_f32(0.0f); - - float summs0 = 0.0f; - float summs1 = 0.0f; - - for (int i = 0; i < nb; ++i) { - const block_q4_3 * restrict x0_0 = &x[2*(i + 0) + 0]; - const block_q4_3 * restrict x0_1 = &x[2*(i + 0) + 1]; - - const block_q8_0 * restrict y0 = &y[i + 0]; - - summs0 += GGML_FP16_TO_FP32(x0_0->m) * y0->s0; - summs1 += GGML_FP16_TO_FP32(x0_1->m) * y0->s1; - - const uint8x16_t v0_0 = vcombine_u8(vld1_u8(x0_0->qs), vld1_u8(x0_1->qs)); - - // 4-bit -> 8-bit - const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, vdupq_n_u8(0xf))); - const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4)); - - // interleave - const int8x16_t v0_0lz = 
vzip1q_s8(v0_0l, v0_0h); - const int8x16_t v0_0hz = vzip2q_s8(v0_0l, v0_0h); - - // load y - const int8x16_t v1_0l = vld1q_s8(y0->qs); - const int8x16_t v1_0h = vld1q_s8(y0->qs + 16); - - const float x0_0d = GGML_FP16_TO_FP32(x0_0->d); - const float x0_1d = GGML_FP16_TO_FP32(x0_1->d); - -#if defined(__ARM_FEATURE_DOTPROD) - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l)), x0_0d*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0hz, v1_0h)), x0_1d*y0->d); -#else - const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l)); - const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l)); - const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h)); - const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h)); - - const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h)); - const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h)); - - sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(pl0), x0_0d*y0->d); - sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(ph0), x0_1d*y0->d); -#endif - } - - *s = vaddvq_f32(vaddq_f32(sumv0, sumv1)) + summs0 + summs1; -#elif defined(__AVX2__) - // Initialize accumulator with zeros - __m256 acc = _mm256_setzero_ps(); - - // Main loop - for (int i = 0; i < nb; i++) { - const __m128 d0 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 0].d)); - const __m128 d1 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 1].d)); - const __m256 dx = _mm256_set_m128(d1, d0); - - const __m128 m0 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 0].m)); - const __m128 m1 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 1].m)); - const __m256 mx = _mm256_set_m128(m1, m0); - - const __m128i bx0 = bytes_from_nibbles_16(x[2*i + 0].qs); - const __m128i bx1 = bytes_from_nibbles_16(x[2*i + 1].qs); - const __m256i bx = _mm256_set_m128i(bx1, bx0); - - const __m256 dy = _mm256_broadcast_ss(&y[i].d); - const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs); - - const __m256i syi = _mm256_maddubs_epi16(_mm256_set1_epi8(1), by); - const __m256 syf = sum_i16_pairs_float(syi); - - const __m256 q = mul_sum_i8_pairs_float(bx, by); - - const __m256 sxy = _mm256_fmadd_ps(q, dx, _mm256_mul_ps(mx, syf)); - acc = _mm256_fmadd_ps(sxy, dy, acc); - } - - *s = hsum_float_8(acc); -#else - // scalar - float sumf = 0.0; - for (int i = 0; i < nb; i++) { - const uint8_t * restrict x0 = x[2*i + 0].qs; - const uint8_t * restrict x1 = x[2*i + 1].qs; - const int8_t * restrict y0 = y[i].qs; - - const float d0 = GGML_FP16_TO_FP32(x[2*i + 0].d); - const float m0 = GGML_FP16_TO_FP32(x[2*i + 0].m); - const float d1 = GGML_FP16_TO_FP32(x[2*i + 1].d); - const float m1 = GGML_FP16_TO_FP32(x[2*i + 1].m); - - int sxy_0 = 0; - int sxy_1 = 0; - - for (int j = 0; j < QK8_0/4; j++) { - const uint8_t v0 = x0[j]; - const uint8_t v1 = x1[j]; - - const int x0_0 = v0 & 0xf; - const int x1_0 = v0 >> 4; - - const int x0_1 = v1 & 0xf; - const int x1_1 = v1 >> 4; - - const int y0_0 = y0[2*j + 0]; - const int y1_0 = y0[2*j + 1]; - - const int y0_1 = y0[2*(j + QK8_0/4) + 0]; - const int y1_1 = y0[2*(j + QK8_0/4) + 1]; - - sxy_0 += x0_0*y0_0 + x1_0*y1_0; - sxy_1 += x0_1*y0_1 + x1_1*y1_1; - } - - sumf += (d0*sxy_0 + d1*sxy_1)*y[i].d + m0*y[i].s0 + m1*y[i].s1; - } - *s = sumf; -#endif -} - - -// compute GGML_VEC_DOT_UNROLL dot products at once -// xs - x row stride in bytes -inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) { - ggml_float 
sumf[GGML_VEC_DOT_UNROLL] = { 0.0 }; - - ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL]; - - for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) { - x[i] = (ggml_fp16_t *) ((char *) xv + i*xs); - } - -#if defined(GGML_SIMD) - const int np = (n & ~(GGML_F16_STEP - 1)); - - GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } }; - - GGML_F16_VEC ax[GGML_F16_ARR]; - GGML_F16_VEC ay[GGML_F16_ARR]; - - for (int i = 0; i < np; i += GGML_F16_STEP) { - for (int j = 0; j < GGML_F16_ARR; j++) { - ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); - - for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { - ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j); - - sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]); - } - } - } - - // reduce sum0..sum3 to sum0 - for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) { - GGML_F16_VEC_REDUCE(sumf[k], sum[k]); - } - - // leftovers - for (int i = np; i < n; ++i) { - for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { - sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i])); - } - } -#else - for (int i = 0; i < n; ++i) { - for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { - sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i])); - } - } -#endif - - for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) { - s[i] = sumf[i]; - } -} - -inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) { -#if defined(GGML_SIMD) - const int np = (n & ~(GGML_F32_STEP - 1)); - - GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); - - GGML_F32_VEC ax[GGML_F32_ARR]; - GGML_F32_VEC ay[GGML_F32_ARR]; - - for (int i = 0; i < np; i += GGML_F32_STEP) { - for (int j = 0; j < GGML_F32_ARR; j++) { - ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR); - ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); - ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx); - - GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); - } - } - - // leftovers - for (int i = np; i < n; ++i) { - y[i] += x[i]*v; - } -#else - // scalar - for (int i = 0; i < n; ++i) { - y[i] += x[i]*v; - } -#endif -} - -//inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; } -inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { -#if defined(GGML_SIMD) - const int np = (n & ~(GGML_F32_STEP - 1)); - - GGML_F32_VEC vx = GGML_F32_VEC_SET1(v); - - GGML_F32_VEC ay[GGML_F32_ARR]; - - for (int i = 0; i < np; i += GGML_F32_STEP) { - for (int j = 0; j < GGML_F32_ARR; j++) { - ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR); - ay[j] = GGML_F32_VEC_MUL(ay[j], vx); - - GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]); - } - } - - // leftovers - for (int i = np; i < n; ++i) { - y[i] *= v; - } -#else - // scalar - for (int i = 0; i < n; ++i) { - y[i] *= v; - } -#endif -} - -inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrtf(*s); } -inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; } -inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); } -inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); } -inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? 
-1.f : 0.f); } -inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; } -inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } - -static const float GELU_COEF_A = 0.044715f; -static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; - -inline static float ggml_gelu_f32(float x) { - return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); -} - -inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { - const uint16_t * i16 = (const uint16_t *) x; - for (int i = 0; i < n; ++i) { - y[i] = table_gelu_f16[i16[i]]; - } -} - -#ifdef GGML_GELU_FP16 -inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { - uint16_t t; - for (int i = 0; i < n; ++i) { - ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); - memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = GGML_FP16_TO_FP32(table_gelu_f16[t]); - } -} -#else -inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { - for (int i = 0; i < n; ++i) { - y[i] = ggml_gelu_f32(x[i]); - } -} -#endif - -// Sigmoid Linear Unit (SiLU) function -inline static float ggml_silu_f32(float x) { - return x/(1.0f + expf(-x)); -} - -inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { - const uint16_t * i16 = (const uint16_t *) x; - for (int i = 0; i < n; ++i) { - y[i] = table_silu_f16[i16[i]]; - } -} - -#ifdef GGML_SILU_FP16 -inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) { - uint16_t t; - for (int i = 0; i < n; ++i) { - ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); - memcpy(&t, &fp16, sizeof(uint16_t)); - y[i] = GGML_FP16_TO_FP32(table_silu_f16[t]); - } -} -#else -inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) { - for (int i = 0; i < n; ++i) { - y[i] = ggml_silu_f32(x[i]); - } -} -#endif - -inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) { -#ifndef GGML_USE_ACCELERATE - ggml_float sum = 0.0; - for (int i = 0; i < n; ++i) { - sum += (ggml_float)x[i]; - } - *s = sum; -#else - vDSP_sve(x, 1, s, n); -#endif -} - -inline static void ggml_vec_max_f32(const int n, float * s, const float * x) { -#ifndef GGML_USE_ACCELERATE - float max = -INFINITY; - for (int i = 0; i < n; ++i) { - max = MAX(max, x[i]); - } - *s = max; -#else - vDSP_maxv(x, 1, s, n); -#endif -} - -inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) { - ggml_vec_norm_f32(n, s, x); - *s = 1.f/(*s); -} - -// -// logging -// - -#if (GGML_DEBUG >= 1) -#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) -#else -#define GGML_PRINT_DEBUG(...) -#endif - -#if (GGML_DEBUG >= 5) -#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) -#else -#define GGML_PRINT_DEBUG_5(...) -#endif - -#if (GGML_DEBUG >= 10) -#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__) -#else -#define GGML_PRINT_DEBUG_10(...) -#endif - -#define GGML_PRINT(...) 
printf(__VA_ARGS__) - -// -// data types -// - -static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { - [GGML_TYPE_F32] = 1, - [GGML_TYPE_F16] = 1, - [GGML_TYPE_Q4_0] = QK4_0, - [GGML_TYPE_Q4_1] = QK4_1, - [GGML_TYPE_Q4_2] = QK4_2, - [GGML_TYPE_Q4_3] = QK4_3, - [GGML_TYPE_Q8_0] = QK8_0, - [GGML_TYPE_I8] = 1, - [GGML_TYPE_I16] = 1, - [GGML_TYPE_I32] = 1, -}; -static_assert(GGML_TYPE_COUNT == 10, "GGML_BLCK_SIZE is outdated"); - -static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { - [GGML_TYPE_F32] = sizeof(float), - [GGML_TYPE_F16] = sizeof(ggml_fp16_t), - [GGML_TYPE_Q4_0] = sizeof(block_q4_0), - [GGML_TYPE_Q4_1] = sizeof(block_q4_1), - [GGML_TYPE_Q4_2] = sizeof(block_q4_2), - [GGML_TYPE_Q4_3] = sizeof(block_q4_3), - [GGML_TYPE_Q8_0] = sizeof(block_q8_0), - [GGML_TYPE_I8] = sizeof(int8_t), - [GGML_TYPE_I16] = sizeof(int16_t), - [GGML_TYPE_I32] = sizeof(int32_t), -}; -static_assert(GGML_TYPE_COUNT == 10, "GGML_TYPE_SIZE is outdated"); - - -static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = { - [GGML_TYPE_F32] = "f32", - [GGML_TYPE_F16] = "f16", - [GGML_TYPE_Q4_0] = "q4_0", - [GGML_TYPE_Q4_1] = "q4_1", - [GGML_TYPE_Q4_2] = "q4_2", - [GGML_TYPE_Q4_3] = "q4_3", - [GGML_TYPE_Q8_0] = "q8_0", - [GGML_TYPE_I8] = "i8", - [GGML_TYPE_I16] = "i16", - [GGML_TYPE_I32] = "i32", -}; -static_assert(GGML_TYPE_COUNT == 10, "GGML_TYPE_NAME is outdated"); - -static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = { - [GGML_TYPE_F32] = false, - [GGML_TYPE_F16] = false, - [GGML_TYPE_Q4_0] = true, - [GGML_TYPE_Q4_1] = true, - [GGML_TYPE_Q4_2] = true, - [GGML_TYPE_Q4_3] = true, - [GGML_TYPE_Q8_0] = true, - [GGML_TYPE_I8] = false, - [GGML_TYPE_I16] = false, - [GGML_TYPE_I32] = false, -}; -static_assert(GGML_TYPE_COUNT == 10, "GGML_IS_QUANTIZED is outdated"); - -static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { - "NONE", - - "DUP", - "ADD", - "SUB", - "MUL", - "DIV", - "SQR", - "SQRT", - "SUM", - "MEAN", - "REPEAT", - "ABS", - "SGN", - "NEG", - "STEP", - "RELU", - "GELU", - "SILU", - "NORM", - "RMS_NORM", - - "MUL_MAT", - - "SCALE", - "CPY", - "CONT", - "RESHAPE", - "VIEW", - "PERMUTE", - "TRANSPOSE", - "GET_ROWS", - "DIAG_MASK_INF", - "SOFT_MAX", - "ROPE", - "CONV_1D_1S", - "CONV_1D_2S", - - "FLASH_ATTN", - "FLASH_FF", - - "MAP_UNARY", - "MAP_BINARY", -}; - -static_assert(GGML_OP_COUNT == 39, "GGML_OP_COUNT != 39"); - -static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { - "none", - - "x", - "x+y", - "x-y", - "x*y", - "x/y", - "x^2", - "√x", - "Σx", - "Σx/n", - "repeat(x)", - "abs(x)", - "sgn(x)", - "-x", - "step(x)", - "relu(x)", - "gelu(x)", - "silu(x)", - "norm(x)", - "rms_norm(x)", - - "X*Y", - - "x*v", - "x-\\>y", - "cont(x)", - "reshape(x)", - "view(x)", - "permute(x)", - "transpose(x)", - "get_rows(x)", - "diag_mask_inf(x)", - "soft_max(x)", - "rope(x)", - "conv_1d_1s(x)", - "conv_1d_2s(x)", - - "flash_attn(x)", - "flash_ff(x)", - - "f(x)", - "f(x,y)", -}; - -static_assert(GGML_OP_COUNT == 39, "GGML_OP_COUNT != 39"); - -static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); -static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); - -// -// ggml context -// - -struct ggml_context { - size_t mem_size; - void * mem_buffer; - bool mem_buffer_owned; - bool no_alloc; - - int n_objects; - - struct ggml_object * objects_begin; - struct ggml_object * objects_end; - - struct ggml_scratch scratch; - struct ggml_scratch scratch_save; -}; - -struct ggml_context_container { - bool used; - - 
struct ggml_context context; -}; - -// -// compute types -// - -enum ggml_task_type { - GGML_TASK_INIT = 0, - GGML_TASK_COMPUTE, - GGML_TASK_FINALIZE, -}; - -struct ggml_compute_params { - enum ggml_task_type type; - - int ith, nth; - - // work buffer for all threads - size_t wsize; - void * wdata; -}; - -// -// ggml state -// - -struct ggml_state { - struct ggml_context_container contexts[GGML_MAX_CONTEXTS]; -}; - -// global state -static struct ggml_state g_state; -static atomic_int g_state_barrier = 0; - -// barrier via spin lock -inline static void ggml_critical_section_start(void) { - int processing = atomic_fetch_add(&g_state_barrier, 1); - - while (processing > 0) { - // wait for other threads to finish - atomic_fetch_sub(&g_state_barrier, 1); - sched_yield(); // TODO: reconsider this - processing = atomic_fetch_add(&g_state_barrier, 1); - } -} - -// TODO: make this somehow automatically executed -// some sort of "sentry" mechanism -inline static void ggml_critical_section_end(void) { - atomic_fetch_sub(&g_state_barrier, 1); -} - -//////////////////////////////////////////////////////////////////////////////// - -void ggml_print_object(const struct ggml_object * obj) { - GGML_PRINT(" - ggml_object: offset = %zu, size = %zu, next = %p\n", - obj->offs, obj->size, (const void *) obj->next); -} - -void ggml_print_objects(const struct ggml_context * ctx) { - struct ggml_object * obj = ctx->objects_begin; - - GGML_PRINT("%s: objects in context %p:\n", __func__, (const void *) ctx); - - while (obj != NULL) { - ggml_print_object(obj); - obj = obj->next; - } - - GGML_PRINT("%s: --- end ---\n", __func__); -} - -int64_t ggml_nelements(const struct ggml_tensor * tensor) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - - return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; -} - -int ggml_nrows(const struct ggml_tensor * tensor) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - - return tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; -} - -size_t ggml_nbytes(const struct ggml_tensor * tensor) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - - return (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]; -} - -int ggml_blck_size(enum ggml_type type) { - return GGML_BLCK_SIZE[type]; -} - -size_t ggml_type_size(enum ggml_type type) { - return GGML_TYPE_SIZE[type]; -} - -float ggml_type_sizef(enum ggml_type type) { - return ((float)(GGML_TYPE_SIZE[type]))/GGML_BLCK_SIZE[type]; -} - -const char * ggml_type_name(enum ggml_type type) { - return GGML_TYPE_NAME[type]; -} - - -size_t ggml_element_size(const struct ggml_tensor * tensor) { - return GGML_TYPE_SIZE[tensor->type]; -} - -static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - - return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1; -} - -static inline bool ggml_is_vector(const struct ggml_tensor * tensor) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - - return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1; -} - -static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - - return tensor->ne[2] == 1 && tensor->ne[3] == 1; -} - -static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, 
const struct ggml_tensor * t1) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - - return - (t0->ne[0] == t1->ne[0]) && - (t0->ne[2] == t1->ne[2]) && - (t0->ne[3] == t1->ne[3]); -} - -bool ggml_is_quantized(enum ggml_type type) { - return GGML_IS_QUANTIZED[type]; -} - -static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) { - return tensor->nb[0] > tensor->nb[1]; -} - -static inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - - return - tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] && - tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/GGML_BLCK_SIZE[tensor->type] && - tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && - tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; -} - -static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - - return - tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] && - tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && - tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; -} - -static inline bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - - return - (t0->ne[0] == t1->ne[0] ) && - (t0->ne[1] == t1->ne[1] ) && - (t0->ne[2] == t1->ne[2] ) && - (t0->ne[3] == t1->ne[3] ); -} - -// check if t1 can be represented as a repeatition of t0 -static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - - return - (t1->ne[0]%t0->ne[0] == 0) && - (t1->ne[1]%t0->ne[1] == 0) && - (t1->ne[2]%t0->ne[2] == 0) && - (t1->ne[3]%t0->ne[3] == 0); -} - -static inline int ggml_up32(int n) { - return (n + 31) & ~31; -} - -static inline int ggml_up64(int n) { - return (n + 63) & ~63; -} - -static inline int ggml_up(int n, int m) { - // assert m is a power of 2 - GGML_ASSERT((m & (m - 1)) == 0); - return (n + m - 1) & ~(m - 1); -} - -// assert that pointer is aligned to GGML_MEM_ALIGN -#define ggml_assert_aligned(ptr) \ - GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0) - -//////////////////////////////////////////////////////////////////////////////// - -struct ggml_context * ggml_init(struct ggml_init_params params) { - // make this function thread safe - ggml_critical_section_start(); - - static bool is_first_call = true; - - if (is_first_call) { - // initialize time system (required on Windows) - ggml_time_init(); - - // initialize GELU, SILU and EXP F32 tables - { - const uint64_t t_start = ggml_time_us(); UNUSED(t_start); - - ggml_fp16_t ii; - for (int i = 0; i < (1 << 16); ++i) { - uint16_t ui = i; - memcpy(&ii, &ui, sizeof(ii)); - const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii); - table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f)); - table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f)); - table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f)); - } - - const uint64_t t_end = ggml_time_us(); UNUSED(t_end); - - GGML_PRINT_DEBUG("%s: GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); - } - - // initialize g_state - { - const uint64_t t_start = ggml_time_us(); UNUSED(t_start); - - g_state = (struct ggml_state) { - /*.contexts =*/ { { 0 } }, - }; - - for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) { - g_state.contexts[i].used = false; - } - - 
const uint64_t t_end = ggml_time_us(); UNUSED(t_end); - - GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); - } - - // initialize cuBLAS - #if defined(GGML_USE_CUBLAS) - ggml_init_cublas(); - #endif - - is_first_call = false; - } - - // find non-used context in g_state - struct ggml_context * ctx = NULL; - - for (int i = 0; i < GGML_MAX_CONTEXTS; i++) { - if (!g_state.contexts[i].used) { - g_state.contexts[i].used = true; - ctx = &g_state.contexts[i].context; - - GGML_PRINT_DEBUG("%s: found unused context %d\n", __func__, i); - break; - } - } - - if (ctx == NULL) { - GGML_PRINT_DEBUG("%s: no unused context found\n", __func__); - - ggml_critical_section_end(); - - return NULL; - } - - const size_t mem_size = (params.mem_size + GGML_MEM_ALIGN - 1) & ~(GGML_MEM_ALIGN - 1); - - *ctx = (struct ggml_context) { - /*.mem_size =*/ mem_size, - /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size), - /*.mem_buffer_owned =*/ params.mem_buffer ? false : true, - /*.no_alloc =*/ params.no_alloc, - /*.n_objects =*/ 0, - /*.objects_begin =*/ NULL, - /*.objects_end =*/ NULL, - /*.scratch =*/ { 0, 0, NULL, }, - /*.scratch_save =*/ { 0, 0, NULL, }, - }; - - GGML_ASSERT(ctx->mem_buffer != NULL); - - ggml_assert_aligned(ctx->mem_buffer); - - GGML_PRINT_DEBUG("%s: context initialized\n", __func__); - - ggml_critical_section_end(); - - return ctx; -} - -void ggml_free(struct ggml_context * ctx) { - // make this function thread safe - ggml_critical_section_start(); - - bool found = false; - - for (int i = 0; i < GGML_MAX_CONTEXTS; i++) { - if (&g_state.contexts[i].context == ctx) { - g_state.contexts[i].used = false; - - GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n", - __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size); - - if (ctx->mem_buffer_owned) { - GGML_ALIGNED_FREE(ctx->mem_buffer); - } - - found = true; - break; - } - } - - if (!found) { - GGML_PRINT_DEBUG("%s: context not found\n", __func__); - } - - ggml_critical_section_end(); -} - -size_t ggml_used_mem(const struct ggml_context * ctx) { - return ctx->objects_end->offs + ctx->objects_end->size; -} - -size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) { - const size_t result = ctx->scratch.data ? ctx->scratch.offs : 0; - - ctx->scratch = scratch; - - return result; -} - -//////////////////////////////////////////////////////////////////////////////// - -struct ggml_tensor * ggml_new_tensor_impl( - struct ggml_context * ctx, - enum ggml_type type, - int n_dims, - const int64_t* ne, - void* data) { - // always insert objects at the end of the context's memory pool - struct ggml_object * obj_cur = ctx->objects_end; - - const size_t cur_offs = obj_cur == NULL ? 0 : obj_cur->offs; - const size_t cur_size = obj_cur == NULL ? 
0 : obj_cur->size; - const size_t cur_end = cur_offs + cur_size; - - size_t size_needed = 0; - - if (data == NULL && !ctx->no_alloc) { - size_needed += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]); - for (int i = 1; i < n_dims; i++) { - size_needed *= ne[i]; - } - // align to GGML_MEM_ALIGN - size_needed = ((size_needed + GGML_MEM_ALIGN - 1)/GGML_MEM_ALIGN)*GGML_MEM_ALIGN; - } - - char * const mem_buffer = ctx->mem_buffer; - struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end); - - if (ctx->scratch.data == NULL || data != NULL) { - size_needed += sizeof(struct ggml_tensor); - - if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) { - GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", - __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size); - assert(false); - return NULL; - } - - *obj_new = (struct ggml_object) { - .offs = cur_end + GGML_OBJECT_SIZE, - .size = size_needed, - .next = NULL, - }; - } else { - if (ctx->scratch.offs + size_needed > ctx->scratch.size) { - GGML_PRINT("%s: not enough space in the scratch memory\n", __func__); - assert(false); - return NULL; - } - - if (cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE > ctx->mem_size) { - GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n", - __func__, cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE, ctx->mem_size); - assert(false); - return NULL; - } - - data = (char * const) ctx->scratch.data + ctx->scratch.offs; - - *obj_new = (struct ggml_object) { - .offs = cur_end + GGML_OBJECT_SIZE, - .size = sizeof(struct ggml_tensor), - .next = NULL, - }; - - //printf("scratch offs = %zu, size_needed = %zu\n", ctx->scratch.offs, size_needed); - - ctx->scratch.offs += size_needed; - } - - if (obj_cur != NULL) { - obj_cur->next = obj_new; - } else { - // this is the first object in this context - ctx->objects_begin = obj_new; - } - - ctx->objects_end = obj_new; - - //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size); - - struct ggml_tensor * const result = (struct ggml_tensor *)(mem_buffer + obj_new->offs); - - ggml_assert_aligned(result); - - *result = (struct ggml_tensor) { - /*.type =*/ type, - /*.n_dims =*/ n_dims, - /*.ne =*/ { 1, 1, 1, 1 }, - /*.nb =*/ { 0, 0, 0, 0 }, - /*.op =*/ GGML_OP_NONE, - /*.is_param =*/ false, - /*.grad =*/ NULL, - /*.src0 =*/ NULL, - /*.src1 =*/ NULL, - /*.opt =*/ { NULL }, - /*.n_tasks =*/ 0, - /*.perf_runs =*/ 0, - /*.perf_cycles =*/ 0, - /*.perf_time_us =*/ 0, - /*.data =*/ (data == NULL && !ctx->no_alloc) ? 
(void *)(result + 1) : data, - /*.pad =*/ { 0 }, - }; - - // TODO: this should not be needed as long as we don't rely on aligned SIMD loads - //ggml_assert_aligned(result->data); - - for (int i = 0; i < n_dims; i++) { - result->ne[i] = ne[i]; - } - - result->nb[0] = GGML_TYPE_SIZE[type]; - result->nb[1] = result->nb[0]*(result->ne[0]/GGML_BLCK_SIZE[type]); - for (int i = 2; i < GGML_MAX_DIMS; i++) { - result->nb[i] = result->nb[i - 1]*result->ne[i - 1]; - } - - ctx->n_objects++; - - return result; -} - -struct ggml_tensor * ggml_new_tensor( - struct ggml_context * ctx, - enum ggml_type type, - int n_dims, - const int64_t * ne) { - return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL); -} - -struct ggml_tensor * ggml_new_tensor_1d( - struct ggml_context * ctx, - enum ggml_type type, - int64_t ne0) { - return ggml_new_tensor(ctx, type, 1, &ne0); -} - -struct ggml_tensor * ggml_new_tensor_2d( - struct ggml_context * ctx, - enum ggml_type type, - int64_t ne0, - int64_t ne1) { - const int64_t ne[2] = { ne0, ne1 }; - return ggml_new_tensor(ctx, type, 2, ne); -} - -struct ggml_tensor * ggml_new_tensor_3d( - struct ggml_context * ctx, - enum ggml_type type, - int64_t ne0, - int64_t ne1, - int64_t ne2) { - const int64_t ne[3] = { ne0, ne1, ne2 }; - return ggml_new_tensor(ctx, type, 3, ne); -} - -struct ggml_tensor * ggml_new_tensor_4d( - struct ggml_context * ctx, - enum ggml_type type, - int64_t ne0, - int64_t ne1, - int64_t ne2, - int64_t ne3) { - const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; - return ggml_new_tensor(ctx, type, 4, ne); -} - -struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) { - ctx->scratch_save = ctx->scratch; - ctx->scratch.data = NULL; - - struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1); - - ctx->scratch = ctx->scratch_save; - - ggml_set_i32(result, value); - - return result; -} - -struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) { - ctx->scratch_save = ctx->scratch; - ctx->scratch.data = NULL; - - struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); - - ctx->scratch = ctx->scratch_save; - - ggml_set_f32(result, value); - - return result; -} - -struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) { - return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, NULL); -} - -struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) { - memset(tensor->data, 0, ggml_nbytes(tensor)); - return tensor; -} - -struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) { - const int n = ggml_nrows(tensor); - const int nc = tensor->ne[0]; - const size_t n1 = tensor->nb[1]; - - char * const data = tensor->data; - - switch (tensor->type) { - case GGML_TYPE_I8: - { - assert(tensor->nb[0] == sizeof(int8_t)); - for (int i = 0; i < n; i++) { - ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value); - } - } break; - case GGML_TYPE_I16: - { - assert(tensor->nb[0] == sizeof(int16_t)); - for (int i = 0; i < n; i++) { - ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value); - } - } break; - case GGML_TYPE_I32: - { - assert(tensor->nb[0] == sizeof(int32_t)); - for (int i = 0; i < n; i++) { - ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value); - } - } break; - case GGML_TYPE_F16: - { - assert(tensor->nb[0] == sizeof(ggml_fp16_t)); - for (int i = 0; i < n; i++) { - ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), value); - } - } break; - case GGML_TYPE_F32: - { - assert(tensor->nb[0] == sizeof(float)); - for (int i = 0; i < n; 
i++) { - ggml_vec_set_f32(nc, (float *)(data + i*n1), value); - } - } break; - default: - { - GGML_ASSERT(false); - } break; - } - - return tensor; -} - -struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { - const int n = ggml_nrows(tensor); - const int nc = tensor->ne[0]; - const size_t n1 = tensor->nb[1]; - - char * const data = tensor->data; - - switch (tensor->type) { - case GGML_TYPE_I8: - { - assert(tensor->nb[0] == sizeof(int8_t)); - for (int i = 0; i < n; i++) { - ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value); - } - } break; - case GGML_TYPE_I16: - { - assert(tensor->nb[0] == sizeof(int16_t)); - for (int i = 0; i < n; i++) { - ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value); - } - } break; - case GGML_TYPE_I32: - { - assert(tensor->nb[0] == sizeof(int32_t)); - for (int i = 0; i < n; i++) { - ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value); - } - } break; - case GGML_TYPE_F16: - { - assert(tensor->nb[0] == sizeof(ggml_fp16_t)); - for (int i = 0; i < n; i++) { - ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), value); - } - } break; - case GGML_TYPE_F32: - { - assert(tensor->nb[0] == sizeof(float)); - for (int i = 0; i < n; i++) { - ggml_vec_set_f32(nc, (float *)(data + i*n1), value); - } - } break; - default: - { - GGML_ASSERT(false); - } break; - } - - return tensor; -} - -int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { - switch (tensor->type) { - case GGML_TYPE_I8: - { - GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); - return ((int8_t *)(tensor->data))[i]; - } break; - case GGML_TYPE_I16: - { - GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); - return ((int16_t *)(tensor->data))[i]; - } break; - case GGML_TYPE_I32: - { - GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); - return ((int32_t *)(tensor->data))[i]; - } break; - case GGML_TYPE_F16: - { - GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); - return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); - } break; - case GGML_TYPE_F32: - { - GGML_ASSERT(tensor->nb[0] == sizeof(float)); - return ((float *)(tensor->data))[i]; - } break; - default: - { - GGML_ASSERT(false); - } break; - } - - return 0.0f; -} - -void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { - switch (tensor->type) { - case GGML_TYPE_I8: - { - GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); - ((int8_t *)(tensor->data))[i] = value; - } break; - case GGML_TYPE_I16: - { - GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); - ((int16_t *)(tensor->data))[i] = value; - } break; - case GGML_TYPE_I32: - { - GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); - ((int32_t *)(tensor->data))[i] = value; - } break; - case GGML_TYPE_F16: - { - GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); - ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value); - } break; - case GGML_TYPE_F32: - { - GGML_ASSERT(tensor->nb[0] == sizeof(float)); - ((float *)(tensor->data))[i] = value; - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { - switch (tensor->type) { - case GGML_TYPE_I8: - { - GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); - return ((int8_t *)(tensor->data))[i]; - } break; - case GGML_TYPE_I16: - { - GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); - return ((int16_t *)(tensor->data))[i]; - } break; - case GGML_TYPE_I32: - { - GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); - return ((int32_t *)(tensor->data))[i]; - } break; - case GGML_TYPE_F16: - { - GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); - return 
GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); - } break; - case GGML_TYPE_F32: - { - GGML_ASSERT(tensor->nb[0] == sizeof(float)); - return ((float *)(tensor->data))[i]; - } break; - default: - { - GGML_ASSERT(false); - } break; - } - - return 0.0f; -} - -void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { - switch (tensor->type) { - case GGML_TYPE_I8: - { - GGML_ASSERT(tensor->nb[0] == sizeof(int8_t)); - ((int8_t *)(tensor->data))[i] = value; - } break; - case GGML_TYPE_I16: - { - GGML_ASSERT(tensor->nb[0] == sizeof(int16_t)); - ((int16_t *)(tensor->data))[i] = value; - } break; - case GGML_TYPE_I32: - { - GGML_ASSERT(tensor->nb[0] == sizeof(int32_t)); - ((int32_t *)(tensor->data))[i] = value; - } break; - case GGML_TYPE_F16: - { - GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); - ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value); - } break; - case GGML_TYPE_F32: - { - GGML_ASSERT(tensor->nb[0] == sizeof(float)); - ((float *)(tensor->data))[i] = value; - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -void * ggml_get_data(const struct ggml_tensor * tensor) { - return tensor->data; -} - -float * ggml_get_data_f32(const struct ggml_tensor * tensor) { - assert(tensor->type == GGML_TYPE_F32); - return (float *)(tensor->data); -} - -struct ggml_tensor * ggml_view_tensor( - struct ggml_context * ctx, - const struct ggml_tensor * src) { - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data); - - result->nb[0] = src->nb[0]; - result->nb[1] = src->nb[1]; - result->nb[2] = src->nb[2]; - result->nb[3] = src->nb[3]; - - return result; -} - -//////////////////////////////////////////////////////////////////////////////// - -// ggml_dup - -struct ggml_tensor * ggml_dup_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - - result->op = GGML_OP_DUP; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - - return result; -} - -struct ggml_tensor * ggml_dup( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_dup_impl(ctx, a, false); -} - -struct ggml_tensor * ggml_dup_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_dup_impl(ctx, a, true); -} - -// ggml_add - -struct ggml_tensor * ggml_add_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - bool inplace) { - GGML_ASSERT(ggml_are_same_shape(a, b)); - - bool is_node = false; - - if (!inplace && (a->grad || b->grad)) { - is_node = true; - } - - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - - result->op = GGML_OP_ADD; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - - return result; -} - -struct ggml_tensor * ggml_add( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_add_impl(ctx, a, b, false); -} - -struct ggml_tensor * ggml_add_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_add_impl(ctx, a, b, true); -} - -// ggml_sub - -struct ggml_tensor * ggml_sub_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - bool inplace) { - GGML_ASSERT(ggml_are_same_shape(a, b)); - - bool is_node = false; - - if (!inplace && (a->grad || b->grad)) { - is_node = true; - } - - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - - result->op = GGML_OP_SUB; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - - return result; -} - -struct ggml_tensor * ggml_sub( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_sub_impl(ctx, a, b, false); -} - -struct ggml_tensor * ggml_sub_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_sub_impl(ctx, a, b, true); -} - -// ggml_mul - -struct ggml_tensor * ggml_mul_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - bool inplace) { - GGML_ASSERT(ggml_are_same_shape(a, b)); - - bool is_node = false; - - if (!inplace && (a->grad || b->grad)) { - is_node = true; - } - - if (inplace) { - GGML_ASSERT(is_node == false); - } - - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - - result->op = GGML_OP_MUL; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - - return result; -} - -struct ggml_tensor * ggml_mul( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_mul_impl(ctx, a, b, false); -} - -struct ggml_tensor * ggml_mul_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_mul_impl(ctx, a, b, true); -} - -// ggml_div - -struct ggml_tensor * ggml_div_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - bool inplace) { - GGML_ASSERT(ggml_are_same_shape(a, b)); - - bool is_node = false; - - if (!inplace && (a->grad || b->grad)) { - is_node = true; - } - - if (inplace) { - GGML_ASSERT(is_node == false); - } - - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - - result->op = GGML_OP_DIV; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - - return result; -} - -struct ggml_tensor * ggml_div( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_div_impl(ctx, a, b, false); -} - -struct ggml_tensor * ggml_div_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_div_impl(ctx, a, b, true); -} - -// ggml_sqr - -struct ggml_tensor * ggml_sqr_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - - result->op = GGML_OP_SQR; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - - return result; -} - -struct ggml_tensor * ggml_sqr( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_sqr_impl(ctx, a, false); -} - -struct ggml_tensor * ggml_sqr_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_sqr_impl(ctx, a, true); -} - -// ggml_sqrt - -struct ggml_tensor * ggml_sqrt_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - - result->op = GGML_OP_SQRT; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - - return result; -} - -struct ggml_tensor * ggml_sqrt( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_sqrt_impl(ctx, a, false); -} - -struct ggml_tensor * ggml_sqrt_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_sqrt_impl(ctx, a, true); -} - -// ggml_sum - -struct ggml_tensor * ggml_sum( - struct ggml_context * ctx, - struct ggml_tensor * a) { - bool is_node = false; - - if (a->grad) { - is_node = true; - } - - struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1); - - result->op = GGML_OP_SUM; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - - return result; -} - -// ggml_mean - -struct ggml_tensor * ggml_mean( - struct ggml_context * ctx, - struct ggml_tensor * a) { - bool is_node = false; - - if (a->grad) { - GGML_ASSERT(false); // TODO: implement - is_node = true; - } - - int64_t ne[GGML_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, ne); - - result->op = GGML_OP_MEAN; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - - return result; -} - -// ggml_repeat - -struct ggml_tensor * ggml_repeat( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - GGML_ASSERT(ggml_can_repeat(a, b)); - - bool is_node = false; - - if (a->grad) { - is_node = true; - } - - if (ggml_are_same_shape(a, b) && !is_node) { - return a; - } - - struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne); - - result->op = GGML_OP_REPEAT; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - - return result; -} - -// ggml_abs - -struct ggml_tensor * ggml_abs_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - - result->op = GGML_OP_ABS; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - - return result; -} - -struct ggml_tensor * ggml_abs( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_abs_impl(ctx, a, false); -} - -struct ggml_tensor * ggml_abs_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_abs_impl(ctx, a, true); -} - - -// ggml_sgn - -struct ggml_tensor * ggml_sgn_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - - result->op = GGML_OP_SGN; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - - return result; -} - -struct ggml_tensor * ggml_sgn( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_sgn_impl(ctx, a, false); -} - -struct ggml_tensor * ggml_sgn_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_sgn_impl(ctx, a, true); -} - -// ggml_neg - -struct ggml_tensor * ggml_neg_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - - result->op = GGML_OP_NEG; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - - return result; -} - -struct ggml_tensor * ggml_neg( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_neg_impl(ctx, a, false); -} - -struct ggml_tensor * ggml_neg_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_neg_impl(ctx, a, true); -} - -// ggml_step - -struct ggml_tensor * ggml_step_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - - result->op = GGML_OP_STEP; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - - return result; -} - -struct ggml_tensor * ggml_step( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_step_impl(ctx, a, false); -} - -struct ggml_tensor * ggml_step_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_step_impl(ctx, a, true); -} - -// ggml_relu - -struct ggml_tensor * ggml_relu_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - - result->op = GGML_OP_RELU; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - - return result; -} - -struct ggml_tensor * ggml_relu( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_relu_impl(ctx, a, false); -} - -struct ggml_tensor * ggml_relu_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_relu_impl(ctx, a, true); -} - -// ggml_gelu - -struct ggml_tensor * ggml_gelu_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - - result->op = GGML_OP_GELU; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - - return result; -} - -struct ggml_tensor * ggml_gelu( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_gelu_impl(ctx, a, false); -} - -struct ggml_tensor * ggml_gelu_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_gelu_impl(ctx, a, true); -} - -// ggml_silu - -struct ggml_tensor * ggml_silu_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - - result->op = GGML_OP_SILU; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - - return result; -} - -struct ggml_tensor * ggml_silu( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_silu_impl(ctx, a, false); -} - -struct ggml_tensor * ggml_silu_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_silu_impl(ctx, a, true); -} - -// ggml_norm - -struct ggml_tensor * ggml_norm_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - - result->op = GGML_OP_NORM; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; // TODO: maybe store epsilon here? - - return result; -} - -struct ggml_tensor * ggml_norm( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_norm_impl(ctx, a, false); -} - -struct ggml_tensor * ggml_norm_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_norm_impl(ctx, a, true); -} - -struct ggml_tensor * ggml_rms_norm_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - - result->op = GGML_OP_RMS_NORM; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; // TODO: maybe store epsilon here? 
- - return result; -} - -struct ggml_tensor * ggml_rms_norm( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_rms_norm_impl(ctx, a, false); -} - -struct ggml_tensor * ggml_rms_norm_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_rms_norm_impl(ctx, a, true); -} - -// ggml_mul_mat - -struct ggml_tensor * ggml_mul_mat( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - GGML_ASSERT(ggml_can_mul_mat(a, b)); - GGML_ASSERT(!ggml_is_transposed(a)); - - bool is_node = false; - - if (a->grad || b->grad) { - is_node = true; - } - - const int64_t ne[4] = { a->ne[1], b->ne[1], a->ne[2], b->ne[3] }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MIN(a->n_dims, b->n_dims), ne); - - result->op = GGML_OP_MUL_MAT; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - - return result; -} - -// ggml_scale - -struct ggml_tensor * ggml_scale_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - bool inplace) { - GGML_ASSERT(ggml_is_scalar(b)); - GGML_ASSERT(ggml_is_padded_1d(a)); - - bool is_node = false; - - if (!inplace && (a->grad || b->grad)) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - // TODO: when implement backward, fix this: - //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - struct ggml_tensor * result = ggml_view_tensor(ctx, a); - - result->op = GGML_OP_SCALE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - - return result; -} - -struct ggml_tensor * ggml_scale( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_scale_impl(ctx, a, b, false); -} - -struct ggml_tensor * ggml_scale_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_scale_impl(ctx, a, b, true); -} - -// ggml_cpy - -struct ggml_tensor * ggml_cpy_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - bool inplace) { - GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b)); - - bool is_node = false; - - if (!inplace && (a->grad || b->grad)) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - // make a view of the destination - struct ggml_tensor * result = ggml_view_tensor(ctx, b); - - result->op = GGML_OP_CPY; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - - return result; -} - -struct ggml_tensor * ggml_cpy( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_cpy_impl(ctx, a, b, false); -} - -struct ggml_tensor * ggml_cpy_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_cpy_impl(ctx, a, b, true); -} - -// ggml_cont - -struct ggml_tensor * ggml_cont_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && a->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - - result->op = GGML_OP_CONT; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - - return result; -} - -struct ggml_tensor * ggml_cont( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_cont_impl(ctx, a, false); -} - -struct ggml_tensor * ggml_cont_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_cont_impl(ctx, a, true); -} - -// ggml_reshape - -struct ggml_tensor * ggml_reshape( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - GGML_ASSERT(ggml_is_contiguous(a)); - GGML_ASSERT(ggml_is_contiguous(b)); - GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b)); - - bool is_node = false; - - if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data); - - result->op = GGML_OP_RESHAPE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - - return result; -} - -struct ggml_tensor * ggml_reshape_2d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int64_t ne0, - int64_t ne1) { - GGML_ASSERT(ggml_is_contiguous(a)); - GGML_ASSERT(ggml_nelements(a) == ne0*ne1); - - bool is_node = false; - - if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - const int64_t ne[2] = { ne0, ne1 }; - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data); - - result->op = GGML_OP_RESHAPE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - - return result; -} - -struct ggml_tensor * ggml_reshape_3d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int64_t ne0, - int64_t ne1, - int64_t ne2) { - GGML_ASSERT(ggml_is_contiguous(a)); - GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2); - - bool is_node = false; - - if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - const int64_t ne[3] = { ne0, ne1, ne2 }; - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data); - - result->op = GGML_OP_RESHAPE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - - return result; -} - -// ggml_view_1d - -struct ggml_tensor * ggml_view_1d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int64_t ne0, - size_t offset) { - if (a->grad) { - GGML_ASSERT(false); // gradient propagation is not supported - } - - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset); - - result->op = GGML_OP_VIEW; - result->grad = NULL; - result->src0 = a; - result->src1 = NULL; // TODO: maybe store the offset here? - - return result; -} - -// ggml_view_2d - -struct ggml_tensor * ggml_view_2d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int64_t ne0, - int64_t ne1, - size_t nb1, - size_t offset) { - if (a->grad) { - GGML_ASSERT(false); // gradient propagation is not supported - } - - const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 }; - - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset); - - result->nb[1] = nb1; - result->nb[2] = result->nb[1]*ne1; - result->nb[3] = result->nb[2]; - - result->op = GGML_OP_VIEW; - result->grad = NULL; - result->src0 = a; - result->src1 = NULL; // TODO: maybe store the offset here? 
- - return result; -} - -// ggml_view_3d - -struct ggml_tensor * ggml_view_3d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int64_t ne0, - int64_t ne1, - int64_t ne2, - size_t nb1, - size_t nb2, - size_t offset) { - if (a->grad) { - GGML_ASSERT(false); // gradient propagation is not supported - } - - const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 }; - - struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset); - - result->nb[1] = nb1; - result->nb[2] = nb2; - result->nb[3] = result->nb[2]*ne2; - - result->op = GGML_OP_VIEW; - result->grad = NULL; - result->src0 = a; - result->src1 = NULL; // TODO: maybe store the offset here? - - return result; -} - -// ggml_permute - -struct ggml_tensor * ggml_permute( - struct ggml_context * ctx, - struct ggml_tensor * a, - int axis0, - int axis1, - int axis2, - int axis3) { - GGML_ASSERT(axis0 >= 0 && axis0 < GGML_MAX_DIMS); - GGML_ASSERT(axis1 >= 0 && axis1 < GGML_MAX_DIMS); - GGML_ASSERT(axis2 >= 0 && axis2 < GGML_MAX_DIMS); - GGML_ASSERT(axis3 >= 0 && axis3 < GGML_MAX_DIMS); - - GGML_ASSERT(axis0 != axis1); - GGML_ASSERT(axis0 != axis2); - GGML_ASSERT(axis0 != axis3); - GGML_ASSERT(axis1 != axis2); - GGML_ASSERT(axis1 != axis3); - GGML_ASSERT(axis2 != axis3); - - bool is_node = false; - - if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - struct ggml_tensor * result = ggml_view_tensor(ctx, a); - - int ne[GGML_MAX_DIMS]; - int nb[GGML_MAX_DIMS]; - - ne[axis0] = a->ne[0]; - ne[axis1] = a->ne[1]; - ne[axis2] = a->ne[2]; - ne[axis3] = a->ne[3]; - - nb[axis0] = a->nb[0]; - nb[axis1] = a->nb[1]; - nb[axis2] = a->nb[2]; - nb[axis3] = a->nb[3]; - - result->ne[0] = ne[0]; - result->ne[1] = ne[1]; - result->ne[2] = ne[2]; - result->ne[3] = ne[3]; - - result->nb[0] = nb[0]; - result->nb[1] = nb[1]; - result->nb[2] = nb[2]; - result->nb[3] = nb[3]; - - result->op = GGML_OP_PERMUTE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; // TODO: maybe store the permutation here? - - return result; -} - -// ggml_transpose - -struct ggml_tensor * ggml_transpose( - struct ggml_context * ctx, - struct ggml_tensor * a) { - bool is_node = false; - - if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - struct ggml_tensor * result = ggml_view_tensor(ctx, a); - - result->ne[0] = a->ne[1]; - result->ne[1] = a->ne[0]; - - result->nb[0] = a->nb[1]; - result->nb[1] = a->nb[0]; - - result->op = GGML_OP_TRANSPOSE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - - return result; -} - -// ggml_get_rows - -struct ggml_tensor * ggml_get_rows( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32); - - bool is_node = false; - - if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - // TODO: implement non F32 return - //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]); - struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0]); - - result->op = GGML_OP_GET_ROWS; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - - return result; -} - -// ggml_diag_mask_inf - -struct ggml_tensor * ggml_diag_mask_inf( - struct ggml_context * ctx, - struct ggml_tensor * a, - int n_past) { - bool is_node = false; - - if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - // TODO: when implement backward, fix this: - //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - struct ggml_tensor * result = ggml_view_tensor(ctx, a); - struct ggml_tensor * b = ggml_new_i32(ctx, n_past); - - result->op = GGML_OP_DIAG_MASK_INF; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - - return result; -} - -// ggml_soft_max - -struct ggml_tensor * ggml_soft_max( - struct ggml_context * ctx, - struct ggml_tensor * a) { - bool is_node = false; - - if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - // TODO: when implement backward, fix this: - //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - struct ggml_tensor * result = ggml_view_tensor(ctx, a); - - result->op = GGML_OP_SOFT_MAX; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = NULL; - - return result; -} - -// ggml_rope - -struct ggml_tensor * ggml_rope( - struct ggml_context * ctx, - struct ggml_tensor * a, - int n_past, - int n_dims, - int mode) { - GGML_ASSERT(n_past >= 0); - bool is_node = false; - - if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - // TODO: when implement backward, fix this: - //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - struct ggml_tensor * result = ggml_view_tensor(ctx, a); - - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3); - ((int32_t *) b->data)[0] = n_past; - ((int32_t *) b->data)[1] = n_dims; - ((int32_t *) b->data)[2] = mode; - - result->op = GGML_OP_ROPE; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - - return result; -} - -// ggml_alibi -struct ggml_tensor * ggml_alibi( - struct ggml_context * ctx, - struct ggml_tensor * a, - int n_past, - int n_head) { - GGML_ASSERT(n_past >= 0); - bool is_node = false; - - if (a->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - // TODO: when implement backward, fix this: - //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - struct ggml_tensor * result = ggml_view_tensor(ctx, a); - - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3); - ((int32_t *) b->data)[0] = n_past; - ((int32_t *) b->data)[1] = n_head; - - result->op = GGML_OP_ALIBI; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - - return result; -} - -// ggml_conv_1d_1s - -struct ggml_tensor * ggml_conv_1d_1s( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - GGML_ASSERT(ggml_is_matrix(b)); - GGML_ASSERT(a->ne[1] == b->ne[1]); - GGML_ASSERT(a->ne[3] == 1); - bool is_node = false; - - if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - const int64_t ne[4] = { b->ne[0], a->ne[2], 1, 1, }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); - - result->op = GGML_OP_CONV_1D_1S; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - - return result; -} - -// ggml_conv_1d_2s - -struct ggml_tensor * ggml_conv_1d_2s( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - GGML_ASSERT(ggml_is_matrix(b)); - GGML_ASSERT(a->ne[1] == b->ne[1]); - GGML_ASSERT(a->ne[3] == 1); - bool is_node = false; - - if (a->grad || b->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - const int64_t ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); - - result->op = GGML_OP_CONV_1D_2S; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - - return result; -} - -// ggml_flash_attn - -struct ggml_tensor * ggml_flash_attn( - struct ggml_context * ctx, - struct ggml_tensor * q, - struct ggml_tensor * k, - struct ggml_tensor * v, - bool masked) { - GGML_ASSERT(ggml_can_mul_mat(k, q)); - // TODO: check if vT can be multiplied by (k*qT) - - bool is_node = false; - - if (q->grad || k->grad || v->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - //struct ggml_tensor * result = ggml_dup_tensor(ctx, q); - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, q->ne); - - result->op = GGML_OP_FLASH_ATTN; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = q; - result->src1 = k; - result->opt[0] = v; - result->opt[1] = ggml_new_i32(ctx, masked ? 1 : 0); - - return result; -} - -// ggml_flash_ff - -struct ggml_tensor * ggml_flash_ff( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b0, - struct ggml_tensor * b1, - struct ggml_tensor * c0, - struct ggml_tensor * c1) { - GGML_ASSERT(ggml_can_mul_mat(b0, a)); - // TODO: more checks - - bool is_node = false; - - if (a->grad || b0->grad || b1->grad || c0->grad || c1->grad) { - GGML_ASSERT(false); // TODO: implement backward - is_node = true; - } - - //struct ggml_tensor * result = ggml_dup_tensor(ctx, a); - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, a->ne); - - result->op = GGML_OP_FLASH_FF; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b0; - result->opt[0] = b1; - result->opt[1] = c0; - result->opt[2] = c1; - - return result; -} - -// ggml_map_unary - -struct ggml_tensor * ggml_map_unary_impl_f32( - struct ggml_context * ctx, - struct ggml_tensor * a, - const ggml_unary_op_f32_t fun, - bool inplace) { - bool is_node = false; - - if (!inplace && a->grad) { - is_node = true; - } - - struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)); - *((void (**)(void))addr_tensor->data) = (void (*)(void))fun; - struct ggml_tensor *result = inplace ? 
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - - result->op = GGML_OP_MAP_UNARY; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->opt[0] = addr_tensor; - - return result; -} - -struct ggml_tensor * ggml_map_unary_f32( - struct ggml_context * ctx, - struct ggml_tensor * a, - const ggml_unary_op_f32_t fun) { - return ggml_map_unary_impl_f32(ctx, a, fun, false); -} - -struct ggml_tensor * ggml_map_unary_inplace_f32( - struct ggml_context * ctx, - struct ggml_tensor * a, - const ggml_unary_op_f32_t fun) { - return ggml_map_unary_impl_f32(ctx, a, fun, true); -} - -// ggml_map_binary - -struct ggml_tensor * ggml_map_binary_impl_f32( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - const ggml_binary_op_f32_t fun, - bool inplace) { - GGML_ASSERT(ggml_are_same_shape(a, b)); - - bool is_node = false; - - if (!inplace && (a->grad || b->grad)) { - is_node = true; - } - - struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)); - *((void (**)(void))addr_tensor->data) = (void (*)(void))fun; - struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - - result->op = GGML_OP_MAP_BINARY; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src0 = a; - result->src1 = b; - result->opt[0] = addr_tensor; - - return result; -} - -struct ggml_tensor * ggml_map_binary_f32( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - const ggml_binary_op_f32_t fun) { - return ggml_map_binary_impl_f32(ctx, a, b, fun, false); -} - -struct ggml_tensor * ggml_map_binary_inplace_f32( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - const ggml_binary_op_f32_t fun) { - return ggml_map_binary_impl_f32(ctx, a, b, fun, true); -} - -//////////////////////////////////////////////////////////////////////////////// - -void ggml_set_param( - struct ggml_context * ctx, - struct ggml_tensor * tensor) { - tensor->is_param = true; - - GGML_ASSERT(tensor->grad == NULL); - tensor->grad = ggml_dup_tensor(ctx, tensor); -} - -// ggml_compute_forward_dup - -static void ggml_compute_forward_dup_f16( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; - - const int ith = params->ith; // thread index - const int nth = params->nth; // number of threads - - if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { - // parallelize by elements - const int ne = ggml_nelements(dst); - const int dr = (ne + nth - 1) / nth; - const int ie0 = dr * ith; - const int ie1 = MIN(ie0 + dr, ne); - - memcpy( - ((char *) dst->data + ie0*nb0), - ((char *) src0->data + ie0*nb00), - (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]); - - 
return; - } - - // parallelize by rows - const int nr = ne01; - // number of rows per thread - const int dr = (nr + nth - 1) / nth; - // row range for this thread - const int ir0 = dr * ith; - const int ir1 = MIN(ir0 + dr, nr); - - if (src0->type == dst->type && - ne00 == ne0 && - nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) { - // copy by rows - const size_t rs = ne00*nb00; - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = ir0; i01 < ir1; i01++) { - memcpy( - ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), - ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), - rs); - } - } - } - return; - } - - // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy - - if (ggml_is_contiguous(dst)) { - if (nb00 == sizeof(ggml_fp16_t)) { - if (dst->type == GGML_TYPE_F16) { - size_t id = 0; - const size_t rs = ne00 * nb00; - char * dst_ptr = (char *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += rs * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; - memcpy(dst_ptr + id, src0_ptr, rs); - id += rs; - } - id += rs * (ne01 - ir1); - } - } - } else if (dst->type == GGML_TYPE_F32) { - size_t id = 0; - float * dst_ptr = (float *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += ne00 * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - for (int i00 = 0; i00 < ne00; i00++) { - dst_ptr[id] = GGML_FP16_TO_FP32(src0_ptr[i00]); - id++; - } - } - id += ne00 * (ne01 - ir1); - } - } - } else if (ggml_is_quantized(dst->type)) { - quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q; - float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; - - size_t id = 0; - size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]); - char * dst_ptr = (char *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += rs * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - - for (int i00 = 0; i00 < ne00; i00++) { - src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]); - } - - quantize_row_q(src0_f32, dst_ptr + id, ne00); - id += rs; - } - id += rs * (ne01 - ir1); - } - } - } else { - GGML_ASSERT(false); // TODO: implement - } - } else { - //printf("%s: this is not optimal - fix me\n", __func__); - - if (dst->type == GGML_TYPE_F32) { - size_t id = 0; - float * dst_ptr = (float *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += ne00 * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - for (int i00 = 0; i00 < ne00; i00++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr); - id++; - } - } - id += ne00 * (ne01 - ir1); - } - } - } else if (dst->type == GGML_TYPE_F16) { - size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += ne00 * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - for (int i00 = 0; i00 < ne00; i00++) { - const ggml_fp16_t * src0_ptr = (ggml_fp16_t 
*) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = *src0_ptr; - id++; - } - } - id += ne00 * (ne01 - ir1); - } - } - } else { - GGML_ASSERT(false); // TODO: implement - } - } - return; - } - - // dst counters - int64_t i10 = 0; - int64_t i11 = 0; - int64_t i12 = 0; - int64_t i13 = 0; - - if (dst->type == GGML_TYPE_F16) { - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - i10 += ne00 * ir0; - while (i10 >= ne0) { - i10 -= ne0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - for (int64_t i01 = ir0; i01 < ir1; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - - memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t)); - - if (++i10 == ne00) { - i10 = 0; - if (++i11 == ne01) { - i11 = 0; - if (++i12 == ne02) { - i12 = 0; - if (++i13 == ne03) { - i13 = 0; - } - } - } - } - } - } - i10 += ne00 * (ne01 - ir1); - while (i10 >= ne0) { - i10 -= ne0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - } - } - } else if (dst->type == GGML_TYPE_F32) { - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - i10 += ne00 * ir0; - while (i10 >= ne0) { - i10 -= ne0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - for (int64_t i01 = ir0; i01 < ir1; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - - *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr); - - if (++i10 == ne0) { - i10 = 0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - } - } - i10 += ne00 * (ne01 - ir1); - while (i10 >= ne0) { - i10 -= ne0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - } - } - } else { - GGML_ASSERT(false); // TODO: implement - } -} - -static void ggml_compute_forward_dup_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; - - const int ith = params->ith; // thread index - const int nth = params->nth; // number of threads - - if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { - // parallelize by elements - const int ne = ggml_nelements(dst); - const int dr = (ne + nth - 1) / nth; - const int ie0 = dr * ith; - const int 
ie1 = MIN(ie0 + dr, ne); - - memcpy( - ((char *) dst->data + ie0*nb0), - ((char *) src0->data + ie0*nb00), - (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]); - - return; - } - - // parallelize by rows - const int nr = ne01; - // number of rows per thread - const int dr = (nr + nth - 1) / nth; - // row range for this thread - const int ir0 = dr * ith; - const int ir1 = MIN(ir0 + dr, nr); - - if (src0->type == dst->type && - ne00 == ne0 && - nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) { - // copy by rows - const size_t rs = ne00*nb00; - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = ir0; i01 < ir1; i01++) { - memcpy( - ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), - ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), - rs); - } - } - } - return; - } - - if (ggml_is_contiguous(dst)) { - // TODO: simplify - if (nb00 == sizeof(float)) { - if (dst->type == GGML_TYPE_F32) { - size_t id = 0; - const size_t rs = ne00 * nb00; - char * dst_ptr = (char *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += rs * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; - memcpy(dst_ptr + id, src0_ptr, rs); - id += rs; - } - id += rs * (ne01 - ir1); - } - } - } else if (dst->type == GGML_TYPE_F16) { - size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += ne00 * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - for (int i00 = 0; i00 < ne00; i00++) { - const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); - id++; - } - } - id += ne00 * (ne01 - ir1); - } - } - } else if (ggml_is_quantized(dst->type)) { - quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q; - - size_t id = 0; - size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]); - char * dst_ptr = (char *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += rs * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - quantize_row_q(src0_ptr, dst_ptr + id, ne00); - id += rs; - } - id += rs * (ne01 - ir1); - } - } - } else { - GGML_ASSERT(false); // TODO: implement - } - } else { - //printf("%s: this is not optimal - fix me\n", __func__); - - if (dst->type == GGML_TYPE_F32) { - size_t id = 0; - float * dst_ptr = (float *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += ne00 * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - for (int i00 = 0; i00 < ne00; i00++) { - const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = *src0_ptr; - id++; - } - } - id += ne00 * (ne01 - ir1); - } - } - } else if (dst->type == GGML_TYPE_F16) { - size_t id = 0; - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; - - for (int i03 = 0; i03 < ne03; i03++) { - for (int i02 = 0; i02 < ne02; i02++) { - id += ne00 * ir0; - for (int i01 = ir0; i01 < ir1; i01++) { - for (int i00 = 0; i00 < ne00; i00++) { - const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - - dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); - id++; - } - } - id += ne00 * (ne01 - ir1); - } 
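The contiguous fast paths share one subtle piece of bookkeeping: each thread writes only its own rows [ir0, ir1), yet it still advances the destination offset past everyone else's rows (id += rs * ir0 before its range, id += rs * (ne01 - ir1) after it), so all threads agree on where each row lands in the packed output. A stripped-down sketch of that idea, with an illustrative helper name:

#include <string.h>

/* Copy rows [ir0, ir1) of a strided float matrix into a contiguous dst,
 * starting at the offset those rows would have if every thread had already
 * written its share -- the same role played by the `id += ...` lines above. */
static void pack_rows_f32(const float *src, size_t src_row_stride /* in floats */,
                          float *dst, int ncols, int ir0, int ir1) {
    size_t id = (size_t) ncols * (size_t) ir0;   /* skip earlier threads' rows */
    for (int r = ir0; r < ir1; r++) {
        memcpy(dst + id, src + (size_t) r * src_row_stride,
               (size_t) ncols * sizeof(float));
        id += (size_t) ncols;
    }
    /* rows >= ir1 are written by later threads starting at dst + ncols * ir1 */
}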
- } - } else { - GGML_ASSERT(false); // TODO: implement - } - } - - return; - } - - // dst counters - - int64_t i10 = 0; - int64_t i11 = 0; - int64_t i12 = 0; - int64_t i13 = 0; - - if (dst->type == GGML_TYPE_F32) { - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - i10 += ne00 * ir0; - while (i10 >= ne0) { - i10 -= ne0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - for (int64_t i01 = ir0; i01 < ir1; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - - memcpy(dst_ptr, src0_ptr, sizeof(float)); - - if (++i10 == ne0) { - i10 = 0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - } - } - i10 += ne00 * (ne01 - ir1); - while (i10 >= ne0) { - i10 -= ne0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - } - } - } else if (dst->type == GGML_TYPE_F16) { - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - i10 += ne00 * ir0; - while (i10 >= ne0) { - i10 -= ne0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - for (int64_t i01 = ir0; i01 < ir1; i01++) { - for (int64_t i00 = 0; i00 < ne00; i00++) { - const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); - - *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr); - - if (++i10 == ne0) { - i10 = 0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - } - } - i10 += ne00 * (ne01 - ir1); - while (i10 >= ne0) { - i10 -= ne0; - if (++i11 == ne1) { - i11 = 0; - if (++i12 == ne2) { - i12 = 0; - if (++i13 == ne3) { - i13 = 0; - } - } - } - } - } - } - } else { - GGML_ASSERT(false); // TODO: implement - } -} - -static void ggml_compute_forward_dup( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_dup_f16(params, src0, dst); - } break; - case GGML_TYPE_F32: - { - ggml_compute_forward_dup_f32(params, src0, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -// ggml_compute_forward_add - -static void ggml_compute_forward_add_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - const int ith = params->ith; - const int nth = params->nth; - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - - GGML_ASSERT( nb0 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(float)); - - if (nb10 == sizeof(float)) { - for (int j = ith; j < n; j += nth) { -#ifdef GGML_USE_ACCELERATE - vDSP_vadd( - (float *) ((char *) 
src0->data + j*nb01), 1, - (float *) ((char *) src1->data + j*nb11), 1, - (float *) ((char *) dst->data + j*nb1), 1, nc); -#else - ggml_vec_add_f32(nc, - (float *) ((char *) dst->data + j*nb1), - (float *) ((char *) src0->data + j*nb01), - (float *) ((char *) src1->data + j*nb11)); -#endif - } - } else { - // src1 is not contiguous - for (int j = ith; j < n; j += nth) { - float * dst_ptr = (float *) ((char *) dst->data + j*nb1); - float * src0_ptr = (float *) ((char *) src0->data + j*nb01); - for (int i = 0; i < nc; i++) { - float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10); - - dst_ptr[i] = src0_ptr[i] + *src1_ptr; - } - } - } -} - -static void ggml_compute_forward_add_f16_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - const int ith = params->ith; - const int nth = params->nth; - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F16); - - GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - - if (nb10 == sizeof(float)) { - for (int j = ith; j < n; j += nth) { - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1); - ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01); - for (int i = 0; i < nc; i++) { - float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10); - dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + *src1_ptr); - } - } - } - else { - // src1 is not contiguous - GGML_ASSERT(false); - } -} - -static void ggml_compute_forward_add_f16_f16( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - const int ith = params->ith; - const int nth = params->nth; - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - const size_t nb00 = src0->nb[0]; - const size_t nb01 = src0->nb[1]; - - const size_t nb10 = src1->nb[0]; - const size_t nb11 = src1->nb[1]; - - const size_t nb0 = dst->nb[0]; - const size_t nb1 = dst->nb[1]; - - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F16); - - GGML_ASSERT( nb0 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - - if (nb10 == sizeof(ggml_fp16_t)) { - for (int j = ith; j < n; j += nth) { - ggml_fp16_t * dst_ptr = (ggml_fp16_t *) ((char *) dst->data + j*nb1); - ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01); - for (int i = 0; i < nc; i++) { - ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + j*nb11 + i*nb10); - dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(*src1_ptr)); - } - } - } - else { - // src1 is not contiguous - GGML_ASSERT(false); - } -} - -static void 
ggml_compute_forward_add_q_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - //const int64_t ne10 = src1->ne[0]; - //const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - //const int64_t ne0 = dst->ne[0]; - //const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - const int nb12 = src1->nb[2]; - const int nb13 = src1->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - - const int ith = params->ith; - const int nth = params->nth; - - GGML_ASSERT(ne02 == ne12); - GGML_ASSERT(ne03 == ne13); - GGML_ASSERT(ne2 == ne12); - GGML_ASSERT(ne3 == ne13); - - const enum ggml_type type = src0->type; - dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; - quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q; - - // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]); - GGML_ASSERT(nb10 == sizeof(float)); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); - - GGML_ASSERT(ggml_is_quantized(src0->type)); - GGML_ASSERT(dst->type == src0->type); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - - // total rows in src0 - const int nr = ne01*ne02*ne03; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - float * wdata = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; - - for (int ir = ir0; ir < ir1; ++ir) { - // src0 indices - const int i03 = ir/(ne02*ne01); - const int i02 = (ir - i03*ne02*ne01)/ne01; - const int i01 = (ir - i03*ne02*ne01 - i02*ne01); - - // src1 and dst are same shape as src0 => same indices - const int i13 = i03; - const int i12 = i02; - const int i11 = i01; - - const int i3 = i03; - const int i2 = i02; - const int i1 = i01; - - void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); - float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13)); - void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb0)); - - assert(ne00 % 32 == 0); - - // unquantize row from src0 to temp buffer - dequantize_row_q(src0_row, wdata, ne00); - // add src1 - ggml_vec_acc_f32(ne00, wdata, src1_row); - // quantize row to dst - quantize_row_q(wdata, dst_row, ne00); - } -} - -static void ggml_compute_forward_add( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_add_f32(params, src0, src1, dst); - } break; - case GGML_TYPE_F16: - { - if (src1->type == GGML_TYPE_F16) { - ggml_compute_forward_add_f16_f16(params, src0, 
src1, dst); - } - else if (src1->type == GGML_TYPE_F32) { - ggml_compute_forward_add_f16_f32(params, src0, src1, dst); - } - else { - GGML_ASSERT(false); - } - } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_2: - case GGML_TYPE_Q4_3: - { - ggml_compute_forward_add_q_f32(params, src0, src1, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -// ggml_compute_forward_sub - -static void ggml_compute_forward_sub_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - assert(params->ith == 0); - assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - assert( dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); - assert(src1->nb[0] == sizeof(float)); - - for (int i = 0; i < n; i++) { - ggml_vec_sub_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1])), - (float *) ((char *) src1->data + i*(src1->nb[1]))); - } -} - -static void ggml_compute_forward_sub( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_sub_f32(params, src0, src1, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -// ggml_compute_forward_mul - -static void ggml_compute_forward_mul_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - assert(params->ith == 0); - assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - assert( dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); - assert(src1->nb[0] == sizeof(float)); - - for (int i = 0; i < n; i++) { - ggml_vec_mul_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1])), - (float *) ((char *) src1->data + i*(src1->nb[1]))); - } -} - -static void ggml_compute_forward_mul( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_mul_f32(params, src0, src1, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -// ggml_compute_forward_div - -static void ggml_compute_forward_div_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - assert(params->ith == 0); - assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - assert( dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); - assert(src1->nb[0] == sizeof(float)); - - for (int i = 0; i < n; i++) { - ggml_vec_div_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1])), - (float *) 
((char *) src1->data + i*(src1->nb[1]))); - } -} - -static void ggml_compute_forward_div( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_div_f32(params, src0, src1, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -// ggml_compute_forward_sqr - -static void ggml_compute_forward_sqr_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - assert(params->ith == 0); - assert(ggml_are_same_shape(src0, dst)); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - assert( dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); - - for (int i = 0; i < n; i++) { - ggml_vec_sqr_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_sqr( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_sqr_f32(params, src0, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -// ggml_compute_forward_sqrt - -static void ggml_compute_forward_sqrt_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - assert(params->ith == 0); - assert(ggml_are_same_shape(src0, dst)); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - assert( dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); - - for (int i = 0; i < n; i++) { - ggml_vec_sqrt_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_sqrt( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_sqrt_f32(params, src0, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -// ggml_compute_forward_sum - -static void ggml_compute_forward_sum_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - assert(params->ith == 0); - assert(ggml_is_scalar(dst)); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - assert(ggml_is_scalar(dst)); - assert(src0->nb[0] == sizeof(float)); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - ggml_vec_sum_f32(ne00, - (float *) (dst->data), - (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); - } - } - } -} - -static void ggml_compute_forward_sum( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - 
ggml_compute_forward_sum_f32(params, src0, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -// ggml_compute_forward_mean - -static void ggml_compute_forward_mean_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - assert(params->ith == 0); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - assert(src0->nb[0] == sizeof(float)); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - assert(ne0 == 1); - assert(ne1 == ne01); - assert(ne2 == ne02); - assert(ne3 == ne03); - - UNUSED(ne0); - UNUSED(ne1); - UNUSED(ne2); - UNUSED(ne3); - - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - ggml_vec_sum_f32(ne00, - (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), - (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03)); - - *(float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3) /= (float) ne00; - } - } - } -} - -static void ggml_compute_forward_mean( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_mean_f32(params, src0, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -// ggml_compute_forward_repeat - -static void ggml_compute_forward_repeat_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - assert(params->ith == 0); - assert(ggml_can_repeat(src0, dst)); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - // TODO: implement support for rank > 2 tensors - assert(src0->ne[2] == 1); - assert(src0->ne[3] == 1); - assert( dst->ne[2] == 1); - assert( dst->ne[3] == 1); - - const int nc = dst->ne[0]; - const int nr = dst->ne[1]; - const int nc0 = src0->ne[0]; - const int nr0 = src0->ne[1]; - const int ncr = nc/nc0; // guaranteed to be an integer due to the check in ggml_can_repeat - const int nrr = nr/nr0; // guaranteed to be an integer due to the check in ggml_can_repeat - - // TODO: support for transposed / permuted tensors - assert( dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); - - // TODO: maybe this is not optimal? 
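The loop that follows implements ggml's repeat (broadcast) for the 2-D case: the source matrix is tiled nrr times down the rows and ncr times across the columns, which is how, for example, a single-row weight tensor gets broadcast against a whole activation matrix. A plain-C sketch of the same indexing, outside the ggml tensor machinery:

#include <stddef.h>

/* dst is (nrr*nr0) x (ncr*nc0), src is nr0 x nc0, both dense row-major. */
static void repeat_2d_f32(const float *src, int nr0, int nc0,
                          float *dst, int nrr, int ncr) {
    const int ncols = ncr * nc0;                   /* dst row length             */
    for (int i = 0; i < nrr; i++) {                /* tile index down the rows   */
        for (int j = 0; j < ncr; j++) {            /* tile index across the cols */
            for (int k = 0; k < nr0; k++) {        /* row inside the source      */
                for (int c = 0; c < nc0; c++) {
                    dst[(size_t) (i * nr0 + k) * ncols + (size_t) j * nc0 + c] =
                        src[(size_t) k * nc0 + c];
                }
            }
        }
    }
}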
- for (int i = 0; i < nrr; i++) { - for (int j = 0; j < ncr; j++) { - for (int k = 0; k < nr0; k++) { - ggml_vec_cpy_f32(nc0, - (float *) ((char *) dst->data + (i*nr0 + k)*( dst->nb[1]) + j*nc0*( dst->nb[0])), - (float *) ((char *) src0->data + ( k)*(src0->nb[1]))); - } - } - } -} - -static void ggml_compute_forward_repeat( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_repeat_f32(params, src0, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -// ggml_compute_forward_abs - -static void ggml_compute_forward_abs_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - assert(params->ith == 0); - assert(ggml_are_same_shape(src0, dst)); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - assert(dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); - - for (int i = 0; i < n; i++) { - ggml_vec_abs_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_abs( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_abs_f32(params, src0, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -// ggml_compute_forward_sgn - -static void ggml_compute_forward_sgn_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - assert(params->ith == 0); - assert(ggml_are_same_shape(src0, dst)); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - assert(dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); - - for (int i = 0; i < n; i++) { - ggml_vec_sgn_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_sgn( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_sgn_f32(params, src0, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -// ggml_compute_forward_neg - -static void ggml_compute_forward_neg_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - assert(params->ith == 0); - assert(ggml_are_same_shape(src0, dst)); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - assert(dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); - - for (int i = 0; i < n; i++) { - ggml_vec_neg_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_neg( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_neg_f32(params, src0, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; 
- } -} - -// ggml_compute_forward_step - -static void ggml_compute_forward_step_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - assert(params->ith == 0); - assert(ggml_are_same_shape(src0, dst)); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - assert(dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); - - for (int i = 0; i < n; i++) { - ggml_vec_step_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_step( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_step_f32(params, src0, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -// ggml_compute_forward_relu - -static void ggml_compute_forward_relu_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - assert(params->ith == 0); - assert(ggml_are_same_shape(src0, dst)); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - assert(dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); - - for (int i = 0; i < n; i++) { - ggml_vec_relu_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - -static void ggml_compute_forward_relu( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_relu_f32(params, src0, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -// ggml_compute_forward_gelu - -static void ggml_compute_forward_gelu_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); - GGML_ASSERT(ggml_is_contiguous(dst)); - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - const int ith = params->ith; - const int nth = params->nth; - - const int nc = src0->ne[0]; - const int nr = ggml_nrows(src0); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int i1 = ir0; i1 < ir1; i1++) { - ggml_vec_gelu_f32(nc, - (float *) ((char *) dst->data + i1*( dst->nb[1])), - (float *) ((char *) src0->data + i1*(src0->nb[1]))); - -#ifndef NDEBUG - for (int k = 0; k < nc; k++) { - const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; - UNUSED(x); - assert(!isnan(x)); - assert(!isinf(x)); - } -#endif - } -} - -static void ggml_compute_forward_gelu( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_gelu_f32(params, src0, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } - - //printf("XXXXXXXX gelu\n"); -} - -// ggml_compute_forward_silu - -static void ggml_compute_forward_silu_f32( - const struct ggml_compute_params * params, - const struct 
ggml_tensor * src0, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); - GGML_ASSERT(ggml_is_contiguous(dst)); - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - const int ith = params->ith; - const int nth = params->nth; - - const int nc = src0->ne[0]; - const int nr = ggml_nrows(src0); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int i1 = ir0; i1 < ir1; i1++) { - ggml_vec_silu_f32(nc, - (float *) ((char *) dst->data + i1*( dst->nb[1])), - (float *) ((char *) src0->data + i1*(src0->nb[1]))); - -#ifndef NDEBUG - for (int k = 0; k < nc; k++) { - const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; - UNUSED(x); - assert(!isnan(x)); - assert(!isinf(x)); - } -#endif - } -} - -static void ggml_compute_forward_silu( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_silu_f32(params, src0, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - - -// ggml_compute_forward_norm - -static void ggml_compute_forward_norm_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - GGML_ASSERT(src0->nb[0] == sizeof(float)); - - const int ith = params->ith; - const int nth = params->nth; - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; - - const float eps = 1e-5f; // TODO: make this a parameter - - // TODO: optimize - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = ith; i01 < ne01; i01 += nth) { - const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - - ggml_float sum = 0.0; - for (int64_t i00 = 0; i00 < ne00; i00++) { - sum += (ggml_float)x[i00]; - } - - float mean = sum/ne00; - - float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); - - ggml_float sum2 = 0.0; - for (int64_t i00 = 0; i00 < ne00; i00++) { - float v = x[i00] - mean; - y[i00] = v; - sum2 += (ggml_float)(v*v); - } - - float variance = sum2/ne00; - const float scale = 1.0f/sqrtf(variance + eps); - - ggml_vec_scale_f32(ne00, y, scale); - } - } - } -} - -static void ggml_compute_forward_norm( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_norm_f32(params, src0, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -static void ggml_compute_forward_rms_norm_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - GGML_ASSERT(src0->nb[0] == sizeof(float)); - - const int ith = params->ith; - 
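For orientation, the two row-wise normalizations in this file reduce to the following plain-C forms (eps values taken from the kernels themselves; the learned scale is applied as a separate multiply in the model graph). This is an illustrative sketch, not the ggml API:

#include <math.h>

/* "norm": subtract the row mean, then scale by 1/sqrt(variance + eps). */
static void norm_row_f32(const float *x, float *y, int n) {
    const float eps = 1e-5f;                 /* same value as the kernel above */
    float mean = 0.0f, var = 0.0f;
    for (int i = 0; i < n; i++) mean += x[i];
    mean /= (float) n;
    for (int i = 0; i < n; i++) { y[i] = x[i] - mean; var += y[i] * y[i]; }
    var /= (float) n;
    const float scale = 1.0f / sqrtf(var + eps);
    for (int i = 0; i < n; i++) y[i] *= scale;
}

/* "rms_norm" (the variant LLaMA uses): same scaling, no mean subtraction. */
static void rms_norm_row_f32(const float *x, float *y, int n) {
    const float eps = 1e-6f;                 /* same value as the kernel below */
    float ms = 0.0f;
    for (int i = 0; i < n; i++) ms += x[i] * x[i];
    ms /= (float) n;
    const float scale = 1.0f / sqrtf(ms + eps);
    for (int i = 0; i < n; i++) y[i] = x[i] * scale;
}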
const int nth = params->nth; - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const size_t nb01 = src0->nb[1]; - const size_t nb02 = src0->nb[2]; - const size_t nb03 = src0->nb[3]; - - const size_t nb1 = dst->nb[1]; - const size_t nb2 = dst->nb[2]; - const size_t nb3 = dst->nb[3]; - - const float eps = 1e-6f; // TODO: make this a parameter - - // TODO: optimize - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = ith; i01 < ne01; i01 += nth) { - const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - - ggml_float sum = 0.0; - for (int64_t i00 = 0; i00 < ne00; i00++) { - sum += (ggml_float)(x[i00] * x[i00]); - } - - float mean = sum/ne00; - - float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); - - memcpy(y, x, ne00 * sizeof(float)); - // for (int i00 = 0; i00 < ne00; i00++) { - // y[i00] = x[i00]; - // } - - const float scale = 1.0f/sqrtf(mean + eps); - - ggml_vec_scale_f32(ne00, y, scale); - } - } - } -} - -static void ggml_compute_forward_rms_norm( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_rms_norm_f32(params, src0, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - - -// ggml_compute_forward_mul_mat - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) -// helper function to determine if it is better to use BLAS or not -// for large matrices, BLAS is faster -static bool ggml_compute_forward_mul_mat_use_blas( - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - //const int64_t ne00 = src0->ne[0]; - //const int64_t ne01 = src0->ne[1]; - - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - - // TODO: find the optimal values for these - if (ggml_is_contiguous(src0) && - ggml_is_contiguous(src1) && ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) { - - /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ - return true; - } - - return false; -} -#endif - -static void ggml_compute_forward_mul_mat_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) - const int64_t ne10 = src1->ne[0]; -#endif - const int64_t ne11 = src1->ne[1]; -#ifndef NDEBUG - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const int nb00 = src0->nb[0]; -#endif - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - const int nb03 = src0->nb[3]; - -#ifndef NDEBUG - const int nb10 = src1->nb[0]; -#endif - const int nb11 = src1->nb[1]; - const int nb12 = src1->nb[2]; - const int nb13 = src1->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - - const int ith = params->ith; - const int 
nth = params->nth; - - assert(ne02 == ne12); - assert(ne03 == ne13); - assert(ne2 == ne12); - assert(ne3 == ne13); - - // we don't support permuted src0 or src1 - assert(nb00 == sizeof(float)); - assert(nb10 == sizeof(float)); - - // dst cannot be transposed or permuted - assert(nb0 == sizeof(float)); - assert(nb0 <= nb1); - assert(nb1 <= nb2); - assert(nb2 <= nb3); - - assert(ne0 == ne01); - assert(ne1 == ne11); - assert(ne2 == ne02); - assert(ne3 == ne03); - - // nb01 >= nb00 - src0 is not transposed - // compute by src0 rows - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) - if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { - if (params->ith != 0) { - return; - } - - if (params->type == GGML_TASK_INIT) { - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - -#if defined(GGML_USE_CUBLAS) - const float alpha = 1.0f; - const float beta = 0.0f; - const int x_ne = ne01 * ne10; - const int y_ne = ne11 * ne10; - const int d_ne = ne11 * ne01; - - size_t x_size, y_size, d_size; - float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size); - float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size); - float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size); -#endif - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - -#if defined(GGML_USE_CUBLAS) - // copy data to device - CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(float) * x_ne, cudaMemcpyHostToDevice, g_cudaStream)); - CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, g_cudaStream)); - - // compute - CUBLAS_CHECK( - cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N, - ne01, ne11, ne10, - &alpha, d_X, ne00, - d_Y, ne10, - &beta, d_D, ne01)); - - // copy data to host - CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream)); -#else - // zT = y * xT - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); -#endif - } - } -#if defined(GGML_USE_CUBLAS) - CUDA_CHECK(cudaStreamSynchronize(g_cudaStream)); - ggml_cuda_pool_free(d_X, x_size); - ggml_cuda_pool_free(d_Y, y_size); - ggml_cuda_pool_free(d_D, d_size); -#endif - //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); - - return; - } -#endif - - if (params->type == GGML_TASK_INIT) { - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - // parallelize by src0 rows using ggml_vec_dot_f32 - - // total rows in src0 - const int nr = ne01*ne02*ne03; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int ir = ir0; ir < ir1; ++ir) { - // src0 indices - const int i03 = ir/(ne02*ne01); - const int i02 = (ir - i03*ne02*ne01)/ne01; - const int i01 = (ir - i03*ne02*ne01 - i02*ne01); - - for (int64_t ic = 0; ic < ne11; ++ic) { - // src1 indices - const int i13 = i03; - const int i12 = i02; - const int i11 = ic; - - // dst indices - const int i0 = i01; - const int i1 = i11; - const int i2 = i02; - const int i3 = i03; - - ggml_vec_dot_f32(ne00, - (float *) ((char *) dst->data + (i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (float *) ((char *) 
src0->data + (i01*nb01 + i02*nb02 + i03*nb03)), - (float *) ((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13))); - } - } - - //int64_t t1 = ggml_perf_time_us(); - //static int64_t acc = 0; - //acc += t1 - t0; - //if (t1 - t0 > 10) { - // printf("\n"); - // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); - // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); - // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); - // printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13); - - // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); - //} -} - -static void ggml_compute_forward_mul_mat_f16_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - //const int64_t ne = ne0*ne1*ne2*ne3; - - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - const int nb12 = src1->nb[2]; - const int nb13 = src1->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - - const int ith = params->ith; - const int nth = params->nth; - - GGML_ASSERT(ne02 == ne12); - GGML_ASSERT(ne03 == ne13); - GGML_ASSERT(ne2 == ne12); - GGML_ASSERT(ne3 == ne13); - - // TODO: we don't support permuted src0 - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); - - GGML_ASSERT(ne0 == ne01); - GGML_ASSERT(ne1 == ne11); - GGML_ASSERT(ne2 == ne02); - GGML_ASSERT(ne3 == ne03); - - // nb01 >= nb00 - src0 is not transposed - // compute by src0 rows - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) - if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { - GGML_ASSERT(nb10 == sizeof(float)); - - if (params->ith != 0) { - return; - } - - if (params->type == GGML_TASK_INIT) { - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - -#if defined(GGML_USE_CUBLAS) - ggml_fp16_t * const wdata = params->wdata; - - const float alpha = 1.0f; - const float beta = 0.0f; - const int x_ne = ne01 * ne10; - const int y_ne = ne11 * ne10; - const int d_ne = ne11 * ne01; - - size_t x_size, y_size, d_size; - float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size); - float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size); - float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size); -#else - float * const wdata = params->wdata; -#endif - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { -#if defined(GGML_USE_CUBLAS) - // with cuBlAS, instead of converting src0 to fp32, we convert src1 to fp16 
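The BLAS branch here leans on ggml's mul_mat convention: dst row i1 holds the dot products of src1 row i1 with every row of src0, i.e. dst = src1 * src0^T, which is why the sgemm call can pass both operands row-major with CblasNoTrans/CblasTrans ("zT = y * xT") and no explicit transpose or copy. A naive triple-loop sketch of the same result, for reference only:

#include <stddef.h>

/* dst[i1][i0] = dot(src0 row i0, src1 row i1); all three matrices row-major. */
static void mul_mat_rows_f32(const float *x,   /* src0: ne01 rows of length ne00 */
                             const float *y,   /* src1: ne11 rows of length ne00 */
                             float *d,         /* dst : ne11 rows of length ne01 */
                             int ne00, int ne01, int ne11) {
    for (int i1 = 0; i1 < ne11; i1++) {
        for (int i0 = 0; i0 < ne01; i0++) {
            float sum = 0.0f;
            for (int k = 0; k < ne00; k++) {
                sum += x[(size_t) i0 * ne00 + k] * y[(size_t) i1 * ne00 + k];
            }
            d[(size_t) i1 * ne01 + i0] = sum;
        }
    }
}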
- { - size_t id = 0; - for (int64_t i01 = 0; i01 < ne11; ++i01) { - for (int64_t i00 = 0; i00 < ne10; ++i00) { - wdata[id++] = GGML_FP32_TO_FP16(*(float *) ((char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10)); - } - } - } -#else - { - size_t id = 0; - for (int64_t i01 = 0; i01 < ne01; ++i01) { - for (int64_t i00 = 0; i00 < ne00; ++i00) { - wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00)); - } - } - } -#endif - -#if defined(GGML_USE_CUBLAS) - const ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + i02*nb02 + i03*nb03); - const ggml_fp16_t * y = (ggml_fp16_t *) wdata; - - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - // copy data to device - CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(ggml_fp16_t) * x_ne, cudaMemcpyHostToDevice, g_cudaStream)); - CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(ggml_fp16_t) * y_ne, cudaMemcpyHostToDevice, g_cudaStream)); - - // compute - CUBLAS_CHECK( - cublasGemmEx(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N, - ne01, ne11, ne10, - &alpha, d_X, CUDA_R_16F, ne00, - d_Y, CUDA_R_16F, ne10, - &beta, d_D, CUDA_R_32F, ne01, - CUBLAS_COMPUTE_32F, - CUBLAS_GEMM_DEFAULT)); - - // copy data to host - CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream)); -#else - const float * x = wdata; - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - // zT = y * xT - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); -#endif - } - } - -#if defined(GGML_USE_CUBLAS) - CUDA_CHECK(cudaStreamSynchronize(g_cudaStream)); - ggml_cuda_pool_free(d_X, x_size); - ggml_cuda_pool_free(d_Y, y_size); - ggml_cuda_pool_free(d_D, d_size); -#endif - /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/ - - return; - } -#endif - - if (params->type == GGML_TASK_INIT) { - ggml_fp16_t * const wdata = params->wdata; - - size_t id = 0; - for (int64_t i13 = 0; i13 < ne13; ++i13) { - for (int64_t i12 = 0; i12 < ne12; ++i12) { - for (int64_t i11 = 0; i11 < ne11; ++i11) { - for (int64_t i10 = 0; i10 < ne10; ++i10) { - wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10)); - } - } - } - } - - GGML_ASSERT(id*sizeof(ggml_fp16_t) <= params->wsize); - - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - // fp16 -> half the size, so divide by 2 - // TODO: do not support transposed src1 - assert(nb10/2 == sizeof(ggml_fp16_t)); - - // parallelize by src0 rows using ggml_vec_dot_f16 - - // total rows in src0 - const int nr = ne01*ne02*ne03; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - ggml_fp16_t * wdata = params->wdata; - - for (int ir = ir0; ir < ir1; ++ir) { - // src0 indices - const int i03 = ir/(ne02*ne01); - const int i02 = (ir - i03*ne02*ne01)/ne01; - const int i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int i13 = i03; - const int i12 = i02; - - const int i0 = i01; - const int i2 = i02; - const int i3 = i03; - - ggml_fp16_t * src0_row = (ggml_fp16_t *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); - ggml_fp16_t * src1_col = wdata + ( 0 + i12*ne11 + i13*ne12*ne11)*ne00; - - float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + 
i3*nb3)); - - for (int64_t ic = 0; ic < ne11; ++ic) { - ggml_vec_dot_f16(ne00, &dst_col[ic*ne0], src0_row, src1_col + ic*ne00); - } - } - - //int64_t t1 = ggml_time_us(); - //static int64_t acc = 0; - //acc += t1 - t0; - //if (t1 - t0 > 10) { - // printf("\n"); - // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); - // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); - // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); - - // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); - //} -} - -static void ggml_compute_forward_mul_mat_q_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - const int nb12 = src1->nb[2]; - const int nb13 = src1->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - - const int ith = params->ith; - const int nth = params->nth; - - GGML_ASSERT(ne02 == ne12); - GGML_ASSERT(ne03 == ne13); - GGML_ASSERT(ne2 == ne12); - GGML_ASSERT(ne3 == ne13); - - const enum ggml_type type = src0->type; - quantize_row_q_t const quantize_row_q_dot = quantize_fns[type].quantize_row_q_dot; - vec_dot_q_t const vec_dot_q = quantize_fns[type].vec_dot_q; - - // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]); - GGML_ASSERT(nb10 == sizeof(float)); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); - - GGML_ASSERT(ne0 == ne01); - GGML_ASSERT(ne1 == ne11); - GGML_ASSERT(ne2 == ne02); - GGML_ASSERT(ne3 == ne03); - - // nb01 >= nb00 - src0 is not transposed - // compute by src0 rows - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) - if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { - if (params->ith != 0) { - return; - } - - if (params->type == GGML_TASK_INIT) { - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - -#if defined(GGML_USE_CUBLAS) - const float alpha = 1.0f; - const float beta = 0.0f; - const int x_ne = ne01 * ne10; - const int y_ne = ne11 * ne10; - const int d_ne = ne11 * ne01; - - size_t x_size, y_size, d_size, q_size; - float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size); - float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size); - float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size); - float *d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size); - - void (*dequantize_row_q_cuda)(const void * x, float * y, int k, cudaStream_t stream) = NULL; - if (type == 
GGML_TYPE_Q4_0) { - dequantize_row_q_cuda = dequantize_row_q4_0_cuda; - } - else if (type == GGML_TYPE_Q4_1) { - dequantize_row_q_cuda = dequantize_row_q4_1_cuda; - } - else if (type == GGML_TYPE_Q4_2) { - dequantize_row_q_cuda = dequantize_row_q4_2_cuda; - } - else if (type == GGML_TYPE_Q4_3) { - dequantize_row_q_cuda = dequantize_row_q4_3_cuda; - } - else { - GGML_ASSERT(false); - } -#else - float * const wdata = params->wdata; - dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; -#endif - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - -#if defined(GGML_USE_CUBLAS) - // copy and dequantize on device - CUDA_CHECK( - cudaMemcpyAsync(d_Q, (char *) src0->data + i03*nb03 + i02*nb02, - GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], cudaMemcpyHostToDevice, g_cudaStream)); - - dequantize_row_q_cuda(d_Q, d_X, ne01 * ne00, g_cudaStream); - CUDA_CHECK(cudaGetLastError()); -#else - { - size_t id = 0; - for (int64_t i01 = 0; i01 < ne01; ++i01) { - dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00); - id += ne00; - } - } - const float * x = wdata; -#endif - - -#if defined(GGML_USE_CUBLAS) - // copy data to device - CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, g_cudaStream)); - - // compute - CUBLAS_CHECK( - cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N, - ne01, ne11, ne10, - &alpha, d_X, ne00, - d_Y, ne10, - &beta, d_D, ne01)); - - // copy data to host - CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream)); -#else - // zT = y * xT - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); -#endif - } - } - -#if defined(GGML_USE_CUBLAS) - CUDA_CHECK(cudaStreamSynchronize(g_cudaStream)); - ggml_cuda_pool_free(d_X, x_size); - ggml_cuda_pool_free(d_Y, y_size); - ggml_cuda_pool_free(d_D, d_size); - ggml_cuda_pool_free(d_Q, q_size); -#endif - //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); - - return; - } -#endif - - if (params->type == GGML_TASK_INIT) { - char * wdata = params->wdata; - const size_t row_size = ne10*GGML_TYPE_SIZE[GGML_TYPE_Q8_0]/GGML_BLCK_SIZE[GGML_TYPE_Q8_0]; - - for (int64_t i13 = 0; i13 < ne13; ++i13) { - for (int64_t i12 = 0; i12 < ne12; ++i12) { - for (int64_t i11 = 0; i11 < ne11; ++i11) { - quantize_row_q_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); - wdata += row_size; - } - } - } - - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - // parallelize by src0 rows using ggml_vec_dot_q - - // total rows in src0 - const int nr = ne01*ne02*ne03; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - void * wdata = params->wdata; - const size_t row_size = ne00*GGML_TYPE_SIZE[GGML_TYPE_Q8_0]/GGML_BLCK_SIZE[GGML_TYPE_Q8_0]; - - for (int ir = ir0; ir < ir1; ++ir) { - // src0 indices - const int i03 = ir/(ne02*ne01); - const int i02 = (ir - i03*ne02*ne01)/ne01; - const int i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int i13 = i03; - const int i12 = i02; - - const int i0 = i01; - const int i2 = i02; - const int i3 = i03; - - void * src0_row = (void *) ((char *) 
src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); - char * src1_col = ((char *) wdata + ( (0 + i12*ne11 + i13*ne12*ne11)*row_size)); - - float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); - - assert(ne00 % 32 == 0); - - for (int64_t ic = 0; ic < ne11; ++ic) { - vec_dot_q(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size)); - } - } - - //int64_t t1 = ggml_time_us(); - //static int64_t acc = 0; - //acc += t1 - t0; - //if (t1 - t0 > 10) { - // printf("\n"); - // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); - // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); - // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); - - // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); - //} -} - -static void ggml_compute_forward_mul_mat( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_2: - case GGML_TYPE_Q4_3: - case GGML_TYPE_Q8_0: - { - ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst); - } break; - case GGML_TYPE_F16: - { - ggml_compute_forward_mul_mat_f16_f32(params, src0, src1, dst); - } break; - case GGML_TYPE_F32: - { - ggml_compute_forward_mul_mat_f32(params, src0, src1, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -// ggml_compute_forward_scale - -static void ggml_compute_forward_scale_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); - GGML_ASSERT(ggml_is_contiguous(dst)); - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(ggml_is_scalar(src1)); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - // scale factor - const float v = *(float *) src1->data; - - const int ith = params->ith; - const int nth = params->nth; - - const int nc = src0->ne[0]; - const int nr = ggml_nrows(src0); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int i1 = ir0; i1 < ir1; i1++) { - ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), v); - } -} - -static void ggml_compute_forward_scale( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_scale_f32(params, src0, src1, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -// ggml_compute_forward_cpy - -static void ggml_compute_forward_cpy( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - ggml_compute_forward_dup(params, src0, dst); -} - -// ggml_compute_forward_cont - -static void ggml_compute_forward_cont( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - ggml_compute_forward_dup(params, src0, dst); -} - -// ggml_compute_forward_reshape - -static void ggml_compute_forward_reshape( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct 
ggml_tensor * dst) { - // NOP - UNUSED(params); - UNUSED(src0); - UNUSED(dst); -} - -// ggml_compute_forward_view - -static void ggml_compute_forward_view( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0) { - // NOP - UNUSED(params); - UNUSED(src0); -} - -// ggml_compute_forward_permute - -static void ggml_compute_forward_permute( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0) { - // NOP - UNUSED(params); - UNUSED(src0); -} - -// ggml_compute_forward_transpose - -static void ggml_compute_forward_transpose( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0) { - // NOP - UNUSED(params); - UNUSED(src0); -} - -// ggml_compute_forward_get_rows - -static void ggml_compute_forward_get_rows_q( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - assert(params->ith == 0); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - const int nc = src0->ne[0]; - const int nr = ggml_nelements(src1); - const enum ggml_type type = src0->type; - dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; - - assert( dst->ne[0] == nc); - assert( dst->ne[1] == nr); - assert(src0->nb[0] == GGML_TYPE_SIZE[type]); - - for (int i = 0; i < nr; ++i) { - const int r = ((int32_t *) src1->data)[i]; - - dequantize_row_q( - (const void *) ((char *) src0->data + r*src0->nb[1]), - (float *) ((char *) dst->data + i*dst->nb[1]), nc); - } -} - -static void ggml_compute_forward_get_rows_f16( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - assert(params->ith == 0); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - const int nc = src0->ne[0]; - const int nr = ggml_nelements(src1); - - assert( dst->ne[0] == nc); - assert( dst->ne[1] == nr); - assert(src0->nb[0] == sizeof(ggml_fp16_t)); - - for (int i = 0; i < nr; ++i) { - const int r = ((int32_t *) src1->data)[i]; - - for (int j = 0; j < nc; ++j) { - ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j]; - ((float *) ((char *) dst->data + i*dst->nb[1]))[j] = GGML_FP16_TO_FP32(v); - } - } -} - -static void ggml_compute_forward_get_rows_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - assert(params->ith == 0); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - const int nc = src0->ne[0]; - const int nr = ggml_nelements(src1); - - assert( dst->ne[0] == nc); - assert( dst->ne[1] == nr); - assert(src0->nb[0] == sizeof(float)); - - for (int i = 0; i < nr; ++i) { - const int r = ((int32_t *) src1->data)[i]; - - ggml_vec_cpy_f32(nc, - (float *) ((char *) dst->data + i*dst->nb[1]), - (float *) ((char *) src0->data + r*src0->nb[1])); - } -} - -static void ggml_compute_forward_get_rows( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_2: - case GGML_TYPE_Q4_3: - case GGML_TYPE_Q8_0: - { - ggml_compute_forward_get_rows_q(params, src0, src1, dst); - } break; - case GGML_TYPE_F16: - { - ggml_compute_forward_get_rows_f16(params, src0, src1, dst); - } 
break; - case GGML_TYPE_F32: - { - ggml_compute_forward_get_rows_f32(params, src0, src1, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } - - //static bool first = true; - //printf("ne0 = %d, ne1 = %d, ne2 = %d\n", dst->ne[0], dst->ne[1], dst->ne[2]); - //if (first) { - // first = false; - //} else { - // for (int k = 0; k < dst->ne[1]; ++k) { - // for (int j = 0; j < dst->ne[0]/16; ++j) { - // for (int i = 0; i < 16; ++i) { - // printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]); - // } - // printf("\n"); - // } - // printf("\n"); - // } - // printf("\n"); - // exit(0); - //} -} - -// ggml_compute_forward_diag_mask_inf - -static void ggml_compute_forward_diag_mask_inf_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - assert(params->ith == 0); - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 1); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - const int n_past = ((int32_t *) src1->data)[0]; - - // TODO: handle transposed/permuted matrices - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - const int nr = src0->ne[1]; - const int nz = n/nr; - - assert( dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); - - for (int k = 0; k < nz; k++) { - for (int j = 0; j < nr; j++) { - for (int i = n_past; i < nc; i++) { - if (i > n_past + j) { - *(float *)((char *) dst->data + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = -INFINITY; - } - } - } - } -} - -static void ggml_compute_forward_diag_mask_inf( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_diag_mask_inf_f32(params, src0, src1, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -// ggml_compute_forward_soft_max - -static void ggml_compute_forward_soft_max_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); - GGML_ASSERT(ggml_is_contiguous(dst)); - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - // TODO: handle transposed/permuted matrices - - const int ith = params->ith; - const int nth = params->nth; - - const int nc = src0->ne[0]; - const int nr = ggml_nrows(src0); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int i1 = ir0; i1 < ir1; i1++) { - float *p = (float *)((char *) dst->data + i1*dst->nb[1]); - -#ifndef NDEBUG - for (int i = 0; i < nc; ++i) { - //printf("p[%d] = %f\n", i, p[i]); - assert(!isnan(p[i])); - } -#endif - - float max = -INFINITY; - ggml_vec_max_f32(nc, &max, p); - - ggml_float sum = 0.0; - - uint16_t scvt; - for (int i = 0; i < nc; i++) { - if (p[i] == -INFINITY) { - p[i] = 0.0f; - } else { - //const float val = (p[i] == -INFINITY) ? 
0.0 : exp(p[i] - max); - ggml_fp16_t s = GGML_FP32_TO_FP16(p[i] - max); - memcpy(&scvt, &s, sizeof(scvt)); - const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); - sum += (ggml_float)val; - p[i] = val; - } - } - - assert(sum > 0.0); - - sum = 1.0/sum; - ggml_vec_scale_f32(nc, p, sum); - -#ifndef NDEBUG - for (int i = 0; i < nc; ++i) { - assert(!isnan(p[i])); - assert(!isinf(p[i])); - } -#endif - } -} - -static void ggml_compute_forward_soft_max( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_soft_max_f32(params, src0, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -// ggml_compute_forward_alibi - -static void ggml_compute_forward_alibi_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - assert(params->ith == 0); - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 3); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - const int n_past = ((int32_t *) src1->data)[0]; - const int n_head = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; - - const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 - const int ne1 = src0->ne[1]; // seq_len_without_past - const int ne2 = src0->ne[2]; // n_head -> this is k - const int ne3 = src0->ne[3]; // 1 -> bsz - - const int n = ggml_nrows(src0); - const int ne2_ne3 = n/ne1; // ne2*ne3 - - const int nb0 = src0->nb[0]; - const int nb1 = src0->nb[1]; - const int nb2 = src0->nb[2]; - const int nb3 = src0->nb[3]; - - - // printf("\nne0: %d, ne1: %d, ne2: %d, ne3: %d", ne0, ne1, ne2, ne3); - // printf("\nn_past = %d, ne2 = %d", n_past, ne2); - - assert(nb0 == sizeof(float)); - assert(ne1+n_past == ne0); - - // add alibi to src0 (KQ_scaled) - const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); - const float m0 = pow(2.0, -8.0 / n_heads_log2_floor); - const float m1 = pow(2.0, -4.0 / n_heads_log2_floor); - - for (int i = 0; i < ne0; i++) { - for (int j = 0; j < ne1; j++) { - for (int k = 0; k < ne2_ne3; k++) { - float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2); - float * dst_data = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2); - - // TODO: k*nb2 or k*nb3 - - float m_k; - if (k < n_heads_log2_floor) { - m_k = pow(m0, k + 1); - } else { - m_k = pow(m1, 2 * (k - n_heads_log2_floor) + 1); - } - //TODO: optimize - dst_data[0] = (j+1) * m_k + src[0]; - } - } - } - -} - - -static void ggml_compute_forward_alibi_f16( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - assert(params->ith == 0); - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 3); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - const int n_past = ((int32_t *) src1->data)[0]; - const int n_head = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; - - const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 - const int ne1 = src0->ne[1]; // seq_len_without_past - const int ne2 = src0->ne[2]; // n_head -> this is k - const int ne3 = src0->ne[3]; // 1 -> bsz - - const int n = ggml_nrows(src0); - const int ne2_ne3 = n/ne1; // ne2*ne3 - - const int nb0 = src0->nb[0]; - const int nb1 = src0->nb[1]; - const int nb2 = 
src0->nb[2]; - const int nb3 = src0->nb[3]; - - - // printf("\nne0: %d, ne1: %d, ne2: %d, ne3: %d", ne0, ne1, ne2, ne3); - // printf("\nn_past = %d, ne2 = %d", n_past, ne2); - - assert(nb0 == sizeof(float)); - assert(ne1+n_past == ne0); - - // add alibi to src0 (KQ_scaled) - const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); - const ggml_fp16_t m0 = pow(2.0, -8.0 / n_heads_log2_floor); - const ggml_fp16_t m1 = pow(2.0, -4.0 / n_heads_log2_floor); - - for (int i = 0; i < ne0; i++) { - for (int j = 0; j < ne1; j++) { - for (int k = 0; k < ne2_ne3; k++) { - ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2); - ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2); - - // TODO: k*nb2 or k*nb3 - - ggml_fp16_t m_k; - if (k < n_heads_log2_floor) { - m_k = pow(m0, k + 1); - } else { - m_k = pow(m1, 2 * (k - n_heads_log2_floor) + 1); - } - //TODO: optimize - dst_data[0] = (j+1) * m_k + src[0]; - } - } - } - -} - -static void ggml_compute_forward_alibi( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_alibi_f16(params, src0, src1, dst); - } break; - case GGML_TYPE_F32: - { - ggml_compute_forward_alibi_f32(params, src0, src1, dst); - } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_2: - case GGML_TYPE_Q4_3: - case GGML_TYPE_Q8_0: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_COUNT: - { - GGML_ASSERT(false); - } break; - } -} - -// ggml_compute_forward_rope - -static void ggml_compute_forward_rope_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 3); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; - - //const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - const int64_t ne3 = src0->ne[3]; - - const int nb0 = src0->nb[0]; - const int nb1 = src0->nb[1]; - const int nb2 = src0->nb[2]; - const int nb3 = src0->nb[3]; - - //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); - //printf("n_past = %d, ne2 = %d\n", n_past, ne2); - - assert(nb0 == sizeof(float)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = ggml_nrows(src0); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - // row index used to determine which thread to use - int ir = 0; - - const float theta_scale = powf(10000.0, -2.0f/n_dims); - - const bool is_neox = mode & 2; - - for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { - const int p = ((mode & 1) == 0 ? 
n_past + i2 : i2); - for (int64_t i1 = 0; i1 < ne1; i1++) { - if (ir++ < ir0) continue; - if (ir > ir1) break; - - float theta = (float)p; - - for (int i0 = 0; i0 < n_dims; i0 += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); - - theta *= theta_scale; - - if (!is_neox) { - const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - const float x0 = src[0]; - const float x1 = src[1]; - - dst_data[0] = x0*cos_theta - x1*sin_theta; - dst_data[1] = x0*sin_theta + x1*cos_theta; - } else { - const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0); - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0); - - const float x0 = src[0]; - const float x1 = src[n_dims/2]; - - dst_data[0] = x0*cos_theta - x1*sin_theta; - dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; - } - } - } - } - } -} - -static void ggml_compute_forward_rope_f16( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 3); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; - - //const int64_t ne0 = src0->ne[0]; - const int64_t ne1 = src0->ne[1]; - const int64_t ne2 = src0->ne[2]; - const int64_t ne3 = src0->ne[3]; - - const int nb0 = src0->nb[0]; - const int nb1 = src0->nb[1]; - const int nb2 = src0->nb[2]; - const int nb3 = src0->nb[3]; - - //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3); - //printf("n_past = %d, ne2 = %d\n", n_past, ne2); - - assert(nb0 == sizeof(ggml_fp16_t)); - - const int ith = params->ith; - const int nth = params->nth; - - const int nr = ggml_nrows(src0); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - // row index used to determine which thread to use - int ir = 0; - - const float theta_scale = powf(10000.0, -2.0f/n_dims); - - const bool is_neox = mode & 2; - - for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) { - const int p = ((mode & 1) == 0 ? 
n_past + i2 : i2); - for (int64_t i1 = 0; i1 < ne1; i1++) { - if (ir++ < ir0) continue; - if (ir > ir1) break; - - float theta = (float)p; - - for (int i0 = 0; i0 < n_dims; i0 += 2) { - const float cos_theta = cosf(theta); - const float sin_theta = sinf(theta); - - theta *= theta_scale; - - if (!is_neox) { - const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - const float x0 = GGML_FP16_TO_FP32(src[0]); - const float x1 = GGML_FP16_TO_FP32(src[1]); - - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); - } else { - const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0); - ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0); - - const float x0 = GGML_FP16_TO_FP32(src[0]); - const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]); - - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); - } - } - } - } - } -} - -static void ggml_compute_forward_rope( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_rope_f16(params, src0, src1, dst); - } break; - case GGML_TYPE_F32: - { - ggml_compute_forward_rope_f32(params, src0, src1, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -// ggml_compute_forward_conv_1d_1s - -static void ggml_compute_forward_conv_1d_1s_f16_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - //const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - //const int64_t ne12 = src1->ne[2]; - //const int64_t ne13 = src1->ne[3]; - - //const int64_t ne0 = dst->ne[0]; - //const int64_t ne1 = dst->ne[1]; - //const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; - //const int64_t ne = ne0*ne1*ne2*ne3; - - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - //const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - //const int nb12 = src1->nb[2]; - //const int nb13 = src1->nb[3]; - - //const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - //const int nb2 = dst->nb[2]; - //const int nb3 = dst->nb[3]; - - const int ith = params->ith; - const int nth = params->nth; - - const int nk = ne00; - const int nh = nk/2; - - const int ew0 = ggml_up32(ne01); - - GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb10 == sizeof(float)); - - if (params->type == GGML_TASK_INIT) { - // TODO: fix this memset (wsize is overestimated) - memset(params->wdata, 0, params->wsize); - - // prepare kernel data (src0) - { - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - - for (int64_t i02 = 0; i02 < 
ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); - ggml_fp16_t * dst_data = wdata + i02*ew0*ne00; - for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i00*ew0 + i01] = src[i00]; - } - } - } - } - - // prepare source data (src1) - { - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00; - - for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); - ggml_fp16_t * dst_data = wdata; - for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]); - } - } - } - - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - // total rows in dst - const int nr = ne02; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i1*nb1); - for (int64_t i0 = 0; i0 < ne10; ++i0) { - dst_data[i0] = 0; - for (int k = -nh; k <= nh; k++) { - float v = 0.0f; - ggml_vec_dot_f16(ew0, &v, - (ggml_fp16_t *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, - (ggml_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); - - dst_data[i0] += v; - } - } - } -} - -static void ggml_compute_forward_conv_1d_1s_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - //const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - //const int64_t ne12 = src1->ne[2]; - //const int64_t ne13 = src1->ne[3]; - - //const int64_t ne0 = dst->ne[0]; - //const int64_t ne1 = dst->ne[1]; - //const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; - //const int64_t ne = ne0*ne1*ne2*ne3; - - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - //const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - //const int nb12 = src1->nb[2]; - //const int nb13 = src1->nb[3]; - - //const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - //const int nb2 = dst->nb[2]; - //const int nb3 = dst->nb[3]; - - const int ith = params->ith; - const int nth = params->nth; - - const int nk = ne00; - const int nh = nk/2; - - const int ew0 = ggml_up32(ne01); - - GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes - GGML_ASSERT(nb00 == sizeof(float)); - GGML_ASSERT(nb10 == sizeof(float)); - - if (params->type == GGML_TASK_INIT) { - // TODO: fix this memset (wsize is overestimated) - memset(params->wdata, 0, params->wsize); - - // prepare kernel data (src0) - { - float * const wdata = (float *) params->wdata + 0; - - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); - float * dst_data = wdata + i02*ew0*ne00; - for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i00*ew0 + i01] = src[i00]; - } - } - } - } - - // prepare source data (src1) - { - float * const wdata = (float *) 
params->wdata + ne02*ew0*ne00; - - for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); - float * dst_data = wdata; - for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[(i10 + nh)*ew0 + i11] = src[i10]; - } - } - } - - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - // total rows in dst - const int nr = ne02; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i1*nb1); - for (int64_t i0 = 0; i0 < ne10; ++i0) { - dst_data[i0] = 0; - for (int k = -nh; k <= nh; k++) { - float v = 0.0f; - ggml_vec_dot_f32(ew0, &v, - (float *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, - (float *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); - - dst_data[i0] += v; - } - } - } -} - -static void ggml_compute_forward_conv_1d_1s( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_conv_1d_1s_f16_f32(params, src0, src1, dst); - } break; - case GGML_TYPE_F32: - { - ggml_compute_forward_conv_1d_1s_f32(params, src0, src1, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -// ggml_compute_forward_conv_1d_2s - -static void ggml_compute_forward_conv_1d_2s_f16_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - //const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - //const int64_t ne12 = src1->ne[2]; - //const int64_t ne13 = src1->ne[3]; - - //const int64_t ne0 = dst->ne[0]; - //const int64_t ne1 = dst->ne[1]; - //const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; - //const int64_t ne = ne0*ne1*ne2*ne3; - - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - //const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - //const int nb12 = src1->nb[2]; - //const int nb13 = src1->nb[3]; - - //const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - //const int nb2 = dst->nb[2]; - //const int nb3 = dst->nb[3]; - - const int ith = params->ith; - const int nth = params->nth; - - const int nk = ne00; - const int nh = nk/2; - - const int ew0 = ggml_up32(ne01); - - GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes - GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nb10 == sizeof(float)); - - if (params->type == GGML_TASK_INIT) { - // TODO: fix this memset (wsize is overestimated) - memset(params->wdata, 0, params->wsize); - - // prepare kernel data (src0) - { - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01); - ggml_fp16_t * dst_data = wdata + i02*ew0*ne00; - for (int64_t i00 = 0; i00 < ne00; 
i00++) { - dst_data[i00*ew0 + i01] = src[i00]; - } - } - } - } - - // prepare source data (src1) - { - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00; - - for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); - ggml_fp16_t * dst_data = wdata; - for (int64_t i10 = 0; i10 < ne10; i10++) { - dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]); - } - } - } - - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - // total rows in dst - const int nr = ne02; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i1*nb1); - for (int64_t i0 = 0; i0 < ne10; i0 += 2) { - dst_data[i0/2] = 0; - for (int k = -nh; k <= nh; k++) { - float v = 0.0f; - ggml_vec_dot_f16(ew0, &v, - (ggml_fp16_t *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, - (ggml_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); - - dst_data[i0/2] += v; - } - } - } -} - -static void ggml_compute_forward_conv_1d_2s_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - //const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - //const int64_t ne12 = src1->ne[2]; - //const int64_t ne13 = src1->ne[3]; - - //const int64_t ne0 = dst->ne[0]; - //const int64_t ne1 = dst->ne[1]; - //const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; - //const int64_t ne = ne0*ne1*ne2*ne3; - - const int nb00 = src0->nb[0]; - const int nb01 = src0->nb[1]; - const int nb02 = src0->nb[2]; - //const int nb03 = src0->nb[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - //const int nb12 = src1->nb[2]; - //const int nb13 = src1->nb[3]; - - //const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - //const int nb2 = dst->nb[2]; - //const int nb3 = dst->nb[3]; - - const int ith = params->ith; - const int nth = params->nth; - - const int nk = ne00; - const int nh = nk/2; - - const int ew0 = ggml_up32(ne01); - - GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes - GGML_ASSERT(nb00 == sizeof(float)); - GGML_ASSERT(nb10 == sizeof(float)); - - if (params->type == GGML_TASK_INIT) { - // TODO: fix this memset (wsize is overestimated) - memset(params->wdata, 0, params->wsize); - - // prepare kernel data (src0) - { - float * const wdata = (float *) params->wdata + 0; - - for (int64_t i02 = 0; i02 < ne02; i02++) { - for (int64_t i01 = 0; i01 < ne01; i01++) { - const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); - float * dst_data = wdata + i02*ew0*ne00; - for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i00*ew0 + i01] = src[i00]; - } - } - } - } - - // prepare source data (src1) - { - float * const wdata = (float *) params->wdata + ne02*ew0*ne00; - - for (int64_t i11 = 0; i11 < ne11; i11++) { - const float * const src = (float *)((char *) src1->data + i11*nb11); - float * dst_data = wdata; - for (int64_t i10 = 0; i10 < ne10; i10++) { - 
dst_data[(i10 + nh)*ew0 + i11] = src[i10]; - } - } - } - - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - // total rows in dst - const int nr = ne02; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int i1 = ir0; i1 < ir1; i1++) { - float * dst_data = (float *)((char *) dst->data + i1*nb1); - for (int64_t i0 = 0; i0 < ne10; i0 += 2) { - dst_data[i0/2] = 0; - for (int k = -nh; k <= nh; k++) { - float v = 0.0f; - ggml_vec_dot_f32(ew0, &v, - (float *) params->wdata + i1*ew0*ne00 + (nh + k)*ew0, - (float *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0); - - dst_data[i0/2] += v; - } - } - } -} - -static void ggml_compute_forward_conv_1d_2s( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_conv_1d_2s_f16_f32(params, src0, src1, dst); - } break; - case GGML_TYPE_F32: - { - ggml_compute_forward_conv_1d_2s_f32(params, src0, src1, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -// ggml_compute_forward_flash_attn - -static void ggml_compute_forward_flash_attn_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * q, - const struct ggml_tensor * k, - const struct ggml_tensor * v, - const bool masked, - struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - const int64_t neq0 = q->ne[0]; - const int64_t neq1 = q->ne[1]; - const int64_t neq2 = q->ne[2]; - const int64_t neq3 = q->ne[3]; - - const int64_t nek0 = k->ne[0]; - const int64_t nek1 = k->ne[1]; - //const int64_t nek2 = k->ne[2]; - //const int64_t nek3 = k->ne[3]; - - //const int64_t nev0 = v->ne[0]; - const int64_t nev1 = v->ne[1]; - //const int64_t nev2 = v->ne[2]; - //const int64_t nev3 = v->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - //const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; - - const int nbk0 = k->nb[0]; - const int nbk1 = k->nb[1]; - const int nbk2 = k->nb[2]; - const int nbk3 = k->nb[3]; - - const int nbq0 = q->nb[0]; - const int nbq1 = q->nb[1]; - const int nbq2 = q->nb[2]; - const int nbq3 = q->nb[3]; - - const int nbv0 = v->nb[0]; - const int nbv1 = v->nb[1]; - const int nbv2 = v->nb[2]; - const int nbv3 = v->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - - const int ith = params->ith; - const int nth = params->nth; - - const int64_t D = neq0; - const int64_t N = neq1; - const int64_t P = nek1 - N; - const int64_t M = P + N; - - const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL); - - GGML_ASSERT(ne0 == D); - GGML_ASSERT(ne1 == N); - GGML_ASSERT(P >= 0); - - GGML_ASSERT(nbq0 == sizeof(float)); - GGML_ASSERT(nbk0 == sizeof(float)); - GGML_ASSERT(nbv0 == sizeof(float)); - - GGML_ASSERT(neq0 == D); - GGML_ASSERT(nek0 == D); - GGML_ASSERT(nev1 == D); - - GGML_ASSERT(neq1 == N); - GGML_ASSERT(nek1 == N + P); - GGML_ASSERT(nev1 == D); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); - - if (params->type == GGML_TASK_INIT) { - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - // parallelize by q rows using ggml_vec_dot_f32 - - // total rows in q - const int nr = 
neq1*neq2*neq3; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - const float scale = 1.0f/sqrtf(D); - - //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); - - for (int ir = ir0; ir < ir1; ++ir) { - // q indices - const int iq3 = ir/(neq2*neq1); - const int iq2 = (ir - iq3*neq2*neq1)/neq1; - const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1); - - float * S = (float *) params->wdata + ith*(Mup + CACHE_LINE_SIZE_F32); - - for (int i = M; i < Mup; ++i) { - S[i] = -INFINITY; - } - - for (int64_t ic = 0; ic < nek1; ++ic) { - // k indices - const int ik3 = iq3; - const int ik2 = iq2; - const int ik1 = ic; - - // S indices - const int i1 = ik1; - - ggml_vec_dot_f32(neq0, - S + i1, - (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), - (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); - } - - // scale - ggml_vec_scale_f32(nek1, S, scale); - - if (masked) { - for (int64_t i = P; i < M; i++) { - if (i > P + iq1) { - S[i] = -INFINITY; - } - } - } - - // softmax - { - float max = -INFINITY; - ggml_vec_max_f32(M, &max, S); - - ggml_float sum = 0.0; - { -#ifdef GGML_SOFT_MAX_ACCELERATE - max = -max; - vDSP_vsadd(S, 1, &max, S, 1, Mup); - vvexpf(S, S, &Mup); - ggml_vec_sum_f32(Mup, &sum, S); -#else - uint16_t scvt[GGML_SOFT_MAX_UNROLL]; - ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; - - for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { - float * SS = S + i; - - for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { - if (SS[j] == -INFINITY) { - SS[j] = 0.0f; - } else { - ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max); - memcpy(&scvt[j], &s, sizeof(uint16_t)); - const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); - sump[j] += (ggml_float)val; - SS[j] = val; - } - } - } - - for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) { - sum += sump[i]; - } -#endif - } - - assert(sum > 0.0); - - sum = 1.0/sum; - ggml_vec_scale_f32(M, S, sum); - -#ifndef NDEBUG - for (int i = 0; i < M; ++i) { - assert(!isnan(S[i])); - assert(!isinf(S[i])); - } -#endif - } - - for (int64_t ic = 0; ic < nev1; ++ic) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; - - ggml_vec_dot_f32(nek1, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (float *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), - S); - } - } -} - -static void ggml_compute_forward_flash_attn_f16( - const struct ggml_compute_params * params, - const struct ggml_tensor * q, - const struct ggml_tensor * k, - const struct ggml_tensor * v, - const bool masked, - struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - const int64_t neq0 = q->ne[0]; - const int64_t neq1 = q->ne[1]; - const int64_t neq2 = q->ne[2]; - const int64_t neq3 = q->ne[3]; - - const int64_t nek0 = k->ne[0]; - const int64_t nek1 = k->ne[1]; - //const int64_t nek2 = k->ne[2]; - //const int64_t nek3 = k->ne[3]; - - //const int64_t nev0 = v->ne[0]; - const int64_t nev1 = v->ne[1]; - //const int64_t nev2 = v->ne[2]; - //const int64_t nev3 = v->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - //const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; - - const int nbk0 = k->nb[0]; - const int nbk1 = k->nb[1]; - const int nbk2 = k->nb[2]; - const int nbk3 = k->nb[3]; - - const int nbq0 = q->nb[0]; - const int nbq1 = q->nb[1]; - const int nbq2 = q->nb[2]; - const int nbq3 = q->nb[3]; - - const int nbv0 = 
v->nb[0]; - const int nbv1 = v->nb[1]; - const int nbv2 = v->nb[2]; - const int nbv3 = v->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - - const int ith = params->ith; - const int nth = params->nth; - - const int64_t D = neq0; - const int64_t N = neq1; - const int64_t P = nek1 - N; - const int64_t M = P + N; - - const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL); - - GGML_ASSERT(ne0 == D); - GGML_ASSERT(ne1 == N); - GGML_ASSERT(P >= 0); - - GGML_ASSERT(nbq0 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nbk0 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nbv0 == sizeof(ggml_fp16_t)); - - GGML_ASSERT(neq0 == D); - GGML_ASSERT(nek0 == D); - GGML_ASSERT(nev1 == D); - - GGML_ASSERT(neq1 == N); - GGML_ASSERT(nek1 == N + P); - GGML_ASSERT(nev1 == D); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); - - if (params->type == GGML_TASK_INIT) { - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - // parallelize by q rows using ggml_vec_dot_f32 - - // total rows in q - const int nr = neq1*neq2*neq3; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - const float scale = 1.0f/sqrtf(D); - - //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); - - for (int ir = ir0; ir < ir1; ++ir) { - // q indices - const int iq3 = ir/(neq2*neq1); - const int iq2 = (ir - iq3*neq2*neq1)/neq1; - const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1); - - float * S = (float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32); - - for (int i = M; i < Mup; ++i) { - S[i] = -INFINITY; - } - - if (GGML_VEC_DOT_UNROLL > 2 || nek1 % GGML_VEC_DOT_UNROLL != 0) { - for (int64_t ic = 0; ic < nek1; ++ic) { - // k indices - const int ik3 = iq3; - const int ik2 = iq2; - const int ik1 = ic; - - // S indices - const int i1 = ik1; - - ggml_vec_dot_f16(neq0, - S + i1, - (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), - (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); - } - } else { - for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) { - // k indices - const int ik3 = iq3; - const int ik2 = iq2; - const int ik1 = ic; - - // S indices - const int i1 = ik1; - - ggml_vec_dot_f16_unroll(neq0, nbk1, - S + i1, - ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)), - (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3))); - } - } - - // scale - ggml_vec_scale_f32(nek1, S, scale); - - if (masked) { - for (int64_t i = P; i < M; i++) { - if (i > P + iq1) { - S[i] = -INFINITY; - } - } - } - - // softmax - { - float max = -INFINITY; - ggml_vec_max_f32(M, &max, S); - - ggml_float sum = 0.0; - { -#ifdef GGML_SOFT_MAX_ACCELERATE - max = -max; - vDSP_vsadd(S, 1, &max, S, 1, Mup); - vvexpf(S, S, &Mup); - ggml_vec_sum_f32(Mup, &sum, S); -#else - uint16_t scvt[GGML_SOFT_MAX_UNROLL]; - ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 }; - - for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) { - float * SS = S + i; - - for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) { - if (SS[j] == -INFINITY) { - SS[j] = 0.0f; - } else { - ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max); - memcpy(&scvt[j], &s, sizeof(uint16_t)); - const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); - sump[j] += (ggml_float)val; - SS[j] = val; - } - } - } - - for (int i = 0; i < 
GGML_SOFT_MAX_UNROLL; i++) { - sum += sump[i]; - } -#endif - } - - assert(sum > 0.0); - - sum = 1.0/sum; - ggml_vec_scale_f32(M, S, sum); - -#ifndef NDEBUG - for (int i = 0; i < M; ++i) { - assert(!isnan(S[i])); - assert(!isinf(S[i])); - } -#endif - } - - ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32) + Mup); - - for (int64_t i = 0; i < M; i++) { - S16[i] = GGML_FP32_TO_FP16(S[i]); - } - - if (GGML_VEC_DOT_UNROLL == 1 || (nev1 % GGML_VEC_DOT_UNROLL != 0)) { - for (int64_t ic = 0; ic < nev1; ++ic) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; - - ggml_vec_dot_f16(nek1, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), - S16); - } - } else { - for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) { - // dst indices - const int i1 = iq1; - const int i2 = iq2; - const int i3 = iq3; - - ggml_vec_dot_f16_unroll(nek1, nbv1, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)), - S16); - } - } - } -} - -static void ggml_compute_forward_flash_attn( - const struct ggml_compute_params * params, - const struct ggml_tensor * q, - const struct ggml_tensor * k, - const struct ggml_tensor * v, - const bool masked, - struct ggml_tensor * dst) { - switch (q->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_flash_attn_f16(params, q, k, v, masked, dst); - } break; - case GGML_TYPE_F32: - { - ggml_compute_forward_flash_attn_f32(params, q, k, v, masked, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -// ggml_compute_forward_flash_ff - -static void ggml_compute_forward_flash_ff_f16( - const struct ggml_compute_params * params, - const struct ggml_tensor * a, // F16 - const struct ggml_tensor * b0, // F16 fc_w - const struct ggml_tensor * b1, // F32 fc_b - const struct ggml_tensor * c0, // F16 proj_w - const struct ggml_tensor * c1, // F32 proj_b - struct ggml_tensor * dst) { - int64_t t0 = ggml_perf_time_us(); - UNUSED(t0); - - const int64_t nea0 = a->ne[0]; - const int64_t nea1 = a->ne[1]; - const int64_t nea2 = a->ne[2]; - const int64_t nea3 = a->ne[3]; - - const int64_t neb00 = b0->ne[0]; - const int64_t neb01 = b0->ne[1]; - //const int64_t neb02 = b0->ne[2]; - //const int64_t neb03 = b0->ne[3]; - - const int64_t neb10 = b1->ne[0]; - const int64_t neb11 = b1->ne[1]; - //const int64_t neb12 = b1->ne[2]; - //const int64_t neb13 = b1->ne[3]; - - const int64_t nec00 = c0->ne[0]; - const int64_t nec01 = c0->ne[1]; - //const int64_t nec02 = c0->ne[2]; - //const int64_t nec03 = c0->ne[3]; - - const int64_t nec10 = c1->ne[0]; - const int64_t nec11 = c1->ne[1]; - //const int64_t nec12 = c1->ne[2]; - //const int64_t nec13 = c1->ne[3]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - //const int64_t ne3 = dst->ne[3]; - - const int nba0 = a->nb[0]; - const int nba1 = a->nb[1]; - const int nba2 = a->nb[2]; - const int nba3 = a->nb[3]; - - const int nbb00 = b0->nb[0]; - const int nbb01 = b0->nb[1]; - const int nbb02 = b0->nb[2]; - const int nbb03 = b0->nb[3]; - - const int nbb10 = b1->nb[0]; - //const int nbb11 = b1->nb[1]; - //const int nbb12 = b1->nb[2]; - //const int nbb13 = b1->nb[3]; - - const int nbc00 = c0->nb[0]; - const int nbc01 = c0->nb[1]; - const int nbc02 = c0->nb[2]; - const int nbc03 = c0->nb[3]; - - const int nbc10 = c1->nb[0]; - //const int nbc11 = c1->nb[1]; 
- //const int nbc12 = c1->nb[2]; - //const int nbc13 = c1->nb[3]; - - const int nb0 = dst->nb[0]; - const int nb1 = dst->nb[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - - const int ith = params->ith; - const int nth = params->nth; - - const int64_t D = nea0; - //const int64_t N = nea1; - const int64_t M = neb01; - - GGML_ASSERT(ne0 == nea0); - GGML_ASSERT(ne1 == nea1); - GGML_ASSERT(ne2 == nea2); - - GGML_ASSERT(nba0 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nbb00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nbb10 == sizeof(float)); - GGML_ASSERT(nbc00 == sizeof(ggml_fp16_t)); - GGML_ASSERT(nbc10 == sizeof(float)); - - GGML_ASSERT(neb00 == D); - GGML_ASSERT(neb01 == M); - GGML_ASSERT(neb10 == M); - GGML_ASSERT(neb11 == 1); - - GGML_ASSERT(nec00 == M); - GGML_ASSERT(nec01 == D); - GGML_ASSERT(nec10 == D); - GGML_ASSERT(nec11 == 1); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); - - if (params->type == GGML_TASK_INIT) { - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } - - // parallelize by a rows using ggml_vec_dot_f32 - - // total rows in a - const int nr = nea1*nea2*nea3; - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int ir = ir0; ir < ir1; ++ir) { - // a indices - const int ia3 = ir/(nea2*nea1); - const int ia2 = (ir - ia3*nea2*nea1)/nea1; - const int ia1 = (ir - ia3*nea2*nea1 - ia2*nea1); - - float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32); - - for (int64_t ic = 0; ic < neb01; ++ic) { - // b0 indices - const int ib03 = ia3; - const int ib02 = ia2; - const int ib01 = ic; - - // S indices - const int i1 = ib01; - - ggml_vec_dot_f16(nea0, - S + i1, - (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)), - (ggml_fp16_t *) ((char *) a->data + ( ia1*nba1 + ia2*nba2 + ia3*nba3))); - } - - ggml_vec_add_f32(neb01, S, S, (float *) b1->data); - //ggml_vec_gelu_f32(neb01, S, S); - - ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M); - - for (int64_t i = 0; i < M; i++) { - S16[i] = GGML_FP32_TO_FP16(S[i]); - } - - ggml_vec_gelu_f16(neb01, S16, S16); - - { - // dst indices - const int i1 = ia1; - const int i2 = ia2; - const int i3 = ia3; - - for (int64_t ic = 0; ic < nec01; ++ic) { - - ggml_vec_dot_f16(neb01, - (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)), - (ggml_fp16_t *) ((char *) c0->data + ( ic*nbc01 + i2*nbc02 + i3*nbc03)), - S16); - } - - ggml_vec_add_f32(nec01, - (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)), - (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)), - (float *) c1->data); - } - } -} - -static void ggml_compute_forward_flash_ff( - const struct ggml_compute_params * params, - const struct ggml_tensor * a, - const struct ggml_tensor * b0, - const struct ggml_tensor * b1, - const struct ggml_tensor * c0, - const struct ggml_tensor * c1, - struct ggml_tensor * dst) { - switch (b0->type) { - case GGML_TYPE_F16: - { - ggml_compute_forward_flash_ff_f16(params, a, b0, b1, c0, c1, dst); - } break; - case GGML_TYPE_F32: - { - GGML_ASSERT(false); // TODO - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -// ggml_compute_forward_map_unary - -static void ggml_compute_forward_map_unary_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - 
struct ggml_tensor * dst, - const ggml_unary_op_f32_t fun) { - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - assert( dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); - - for (int i = 0; i < n; i++) { - fun(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); - } -} - - -static void ggml_compute_forward_map_unary( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst, - const ggml_unary_op_f32_t fun) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_map_unary_f32(params, src0, dst, fun); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -// ggml_compute_forward_map_binary - -static void ggml_compute_forward_map_binary_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst, - const ggml_binary_op_f32_t fun) { - assert(params->ith == 0); - assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { - return; - } - - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - - assert( dst->nb[0] == sizeof(float)); - assert(src0->nb[0] == sizeof(float)); - assert(src1->nb[0] == sizeof(float)); - - for (int i = 0; i < n; i++) { - fun(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1])), - (float *) ((char *) src1->data + i*(src1->nb[1]))); - } -} - - -static void ggml_compute_forward_map_binary( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst, - const ggml_binary_op_f32_t fun) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_map_binary_f32(params, src0, src1, dst, fun); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - -///////////////////////////////// - -static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { - GGML_ASSERT(params); - - switch (tensor->op) { - case GGML_OP_DUP: - { - ggml_compute_forward_dup(params, tensor->src0, tensor); - } break; - case GGML_OP_ADD: - { - ggml_compute_forward_add(params, tensor->src0, tensor->src1, tensor); - } break; - case GGML_OP_SUB: - { - ggml_compute_forward_sub(params, tensor->src0, tensor->src1, tensor); - } break; - case GGML_OP_MUL: - { - ggml_compute_forward_mul(params, tensor->src0, tensor->src1, tensor); - } break; - case GGML_OP_DIV: - { - ggml_compute_forward_div(params, tensor->src0, tensor->src1, tensor); - } break; - case GGML_OP_SQR: - { - ggml_compute_forward_sqr(params, tensor->src0, tensor); - } break; - case GGML_OP_SQRT: - { - ggml_compute_forward_sqrt(params, tensor->src0, tensor); - } break; - case GGML_OP_SUM: - { - ggml_compute_forward_sum(params, tensor->src0, tensor); - } break; - case GGML_OP_MEAN: - { - ggml_compute_forward_mean(params, tensor->src0, tensor); - } break; - case GGML_OP_REPEAT: - { - ggml_compute_forward_repeat(params, tensor->src0, tensor); - } break; - case GGML_OP_ABS: - { - ggml_compute_forward_abs(params, tensor->src0, tensor); - } break; - case GGML_OP_SGN: - { - ggml_compute_forward_sgn(params, tensor->src0, tensor); - } break; - case GGML_OP_NEG: - { - 
ggml_compute_forward_neg(params, tensor->src0, tensor); - } break; - case GGML_OP_STEP: - { - ggml_compute_forward_step(params, tensor->src0, tensor); - } break; - case GGML_OP_RELU: - { - ggml_compute_forward_relu(params, tensor->src0, tensor); - } break; - case GGML_OP_GELU: - { - ggml_compute_forward_gelu(params, tensor->src0, tensor); - } break; - case GGML_OP_SILU: - { - ggml_compute_forward_silu(params, tensor->src0, tensor); - } break; - case GGML_OP_NORM: - { - ggml_compute_forward_norm(params, tensor->src0, tensor); - } break; - case GGML_OP_RMS_NORM: - { - ggml_compute_forward_rms_norm(params, tensor->src0, tensor); - } break; - case GGML_OP_MUL_MAT: - { - ggml_compute_forward_mul_mat(params, tensor->src0, tensor->src1, tensor); - } break; - case GGML_OP_SCALE: - { - ggml_compute_forward_scale(params, tensor->src0, tensor->src1, tensor); - } break; - case GGML_OP_CPY: - { - ggml_compute_forward_cpy(params, tensor->src0, tensor); - } break; - case GGML_OP_CONT: - { - ggml_compute_forward_cont(params, tensor->src0, tensor); - } break; - case GGML_OP_RESHAPE: - { - ggml_compute_forward_reshape(params, tensor->src0, tensor); - } break; - case GGML_OP_VIEW: - { - ggml_compute_forward_view(params, tensor->src0); - } break; - case GGML_OP_PERMUTE: - { - ggml_compute_forward_permute(params, tensor->src0); - } break; - case GGML_OP_TRANSPOSE: - { - ggml_compute_forward_transpose(params, tensor->src0); - } break; - case GGML_OP_GET_ROWS: - { - ggml_compute_forward_get_rows(params, tensor->src0, tensor->src1, tensor); - } break; - case GGML_OP_DIAG_MASK_INF: - { - ggml_compute_forward_diag_mask_inf(params, tensor->src0, tensor->src1, tensor); - } break; - case GGML_OP_SOFT_MAX: - { - ggml_compute_forward_soft_max(params, tensor->src0, tensor); - } break; - case GGML_OP_ROPE: - { - ggml_compute_forward_rope(params, tensor->src0, tensor->src1, tensor); - } break; - case GGML_OP_ALIBI: - { - ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor); - } break; - case GGML_OP_CONV_1D_1S: - { - ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor); - } break; - case GGML_OP_CONV_1D_2S: - { - ggml_compute_forward_conv_1d_2s(params, tensor->src0, tensor->src1, tensor); - } break; - case GGML_OP_FLASH_ATTN: - { - int32_t t = ggml_get_i32_1d(tensor->opt[1], 0); - GGML_ASSERT(t == 0 || t == 1); - bool masked = t != 0; - ggml_compute_forward_flash_attn(params, tensor->src0, tensor->src1, tensor->opt[0], masked, tensor); - } break; - case GGML_OP_FLASH_FF: - { - ggml_compute_forward_flash_ff(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor->opt[2], tensor); - } break; - case GGML_OP_MAP_UNARY: - { - const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data); - ggml_compute_forward_map_unary(params, tensor->src0, tensor, fun); - } - break; - case GGML_OP_MAP_BINARY: - { - const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->opt[0]->data); - ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun); - } - break; - case GGML_OP_NONE: - { - // nop - } break; - case GGML_OP_COUNT: - { - GGML_ASSERT(false); - } break; - } -} - -//////////////////////////////////////////////////////////////////////////////// - -static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) { - struct ggml_tensor * src0 = tensor->src0; - struct ggml_tensor * src1 = tensor->src1; - - switch (tensor->op) { - case GGML_OP_DUP: - { - if (src0->grad) { - src0->grad = 
ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); - } - } break; - case GGML_OP_ADD: - { - if (src0->grad) { - src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); - } - if (src1->grad) { - src1->grad = ggml_add_impl(ctx, src1->grad, tensor->grad, inplace); - } - } break; - case GGML_OP_SUB: - { - if (src0->grad) { - src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); - } - if (src1->grad) { - src1->grad = ggml_sub_impl(ctx, src1->grad, tensor->grad, inplace); - } - } break; - case GGML_OP_MUL: - { - if (src0->grad) { - src0->grad = - ggml_add_impl(ctx, - src0->grad, - ggml_mul(ctx, src1, tensor->grad), - inplace); - } - if (src1->grad) { - src1->grad = - ggml_add_impl(ctx, - src1->grad, - ggml_mul(ctx, src0, tensor->grad), - inplace); - } - } break; - case GGML_OP_DIV: - { - if (src0->grad) { - src0->grad = - ggml_add_impl(ctx, - src0->grad, - ggml_div(ctx, tensor->grad, src1), - inplace); - } - if (src1->grad) { - src1->grad = - ggml_sub_impl(ctx, - src1->grad, - ggml_mul(ctx, - tensor->grad, - ggml_div(ctx, tensor, src1)), - inplace); - } - } break; - case GGML_OP_SQR: - { - if (src0->grad) { - src0->grad = - ggml_add_impl(ctx, - src0->grad, - ggml_mul(ctx, - ggml_mul(ctx, src0, tensor->grad), - ggml_repeat(ctx, ggml_new_f32(ctx, 2.0f), src0)), - inplace); - } - } break; - case GGML_OP_SQRT: - { - if (src0->grad) { - src0->grad = - ggml_add_impl(ctx, - src0->grad, - ggml_div(ctx, - ggml_repeat(ctx, ggml_new_f32(ctx, 0.5f), tensor), - tensor), - inplace); - } - } break; - case GGML_OP_SUM: - { - if (src0->grad) { - src0->grad = - ggml_add_impl(ctx, - src0->grad, - ggml_repeat(ctx, tensor->grad, src0->grad), - inplace); - } - } break; - case GGML_OP_MEAN: - { - GGML_ASSERT(false); // TODO: implement - } break; - case GGML_OP_REPEAT: - { - if (src0->grad) { - src0->grad = - ggml_add_impl(ctx, - src0->grad, - ggml_sum(ctx, tensor->grad), - inplace); - } - } break; - case GGML_OP_ABS: - { - if (src0->grad) { - src0->grad = - ggml_add_impl(ctx, - src0->grad, - ggml_mul(ctx, - ggml_sgn(ctx, src0), - tensor->grad), - inplace); - } - } break; - case GGML_OP_SGN: - { - if (src0->grad) { - // noop - } - } break; - case GGML_OP_NEG: - { - if (src0->grad) { - src0->grad = ggml_sub_impl(ctx, src0->grad, tensor->grad, inplace); - } - } break; - case GGML_OP_STEP: - { - if (src0->grad) { - // noop - } - } break; - case GGML_OP_RELU: - { - if (src0->grad) { - src0->grad = ggml_sub_impl(ctx, - src0->grad, - ggml_mul(ctx, - ggml_step(ctx, src0), - tensor->grad), - inplace); - } - } break; - case GGML_OP_GELU: - { - GGML_ASSERT(false); // TODO: not implemented - } break; - case GGML_OP_ALIBI: - { - GGML_ASSERT(false); // TODO: not implemented - } break; - case GGML_OP_SILU: - { - GGML_ASSERT(false); // TODO: not implemented - } break; - case GGML_OP_NORM: - { - GGML_ASSERT(false); // TODO: not implemented - } break; - case GGML_OP_RMS_NORM: - { - GGML_ASSERT(false); // TODO: not implemented - } break; - case GGML_OP_MUL_MAT: - { - if (src0->grad) { - // TODO: this requires outer product - ggml_out_prod(ctx, src1, tensor->grad); - GGML_ASSERT(false); - } - if (src1->grad) { - src1->grad = - ggml_add_impl(ctx, - src1->grad, - ggml_mul_mat(ctx, - ggml_cont(ctx, ggml_transpose(ctx, src0)), - tensor->grad), - inplace); - } - } break; - case GGML_OP_SCALE: - { - GGML_ASSERT(false); // TODO: not implemented - } break; - case GGML_OP_CPY: - { - GGML_ASSERT(false); // TODO: not implemented - } break; - case GGML_OP_CONT: - { - GGML_ASSERT(false); // TODO: not implemented - } 
break; - case GGML_OP_RESHAPE: - { - GGML_ASSERT(false); // TODO: not implemented - } break; - case GGML_OP_VIEW: - { - GGML_ASSERT(false); // not supported - } break; - case GGML_OP_PERMUTE: - { - GGML_ASSERT(false); // TODO: not implemented - } break; - case GGML_OP_TRANSPOSE: - { - GGML_ASSERT(false); // TODO: not implemented - } break; - case GGML_OP_GET_ROWS: - { - GGML_ASSERT(false); // TODO: not implemented - } break; - case GGML_OP_DIAG_MASK_INF: - { - GGML_ASSERT(false); // TODO: not implemented - } break; - case GGML_OP_SOFT_MAX: - { - GGML_ASSERT(false); // TODO: not implemented - } break; - case GGML_OP_ROPE: - { - GGML_ASSERT(false); // TODO: not implemented - } break; - case GGML_OP_CONV_1D_1S: - { - GGML_ASSERT(false); // TODO: not implemented - } break; - case GGML_OP_CONV_1D_2S: - { - GGML_ASSERT(false); // TODO: not implemented - } break; - case GGML_OP_FLASH_ATTN: - { - GGML_ASSERT(false); // not supported - } break; - case GGML_OP_FLASH_FF: - { - GGML_ASSERT(false); // not supported - } break; - case GGML_OP_MAP_UNARY: - case GGML_OP_MAP_BINARY: - { - GGML_ASSERT(false); // not supported - } break; - case GGML_OP_NONE: - { - // nop - } break; - case GGML_OP_COUNT: - { - GGML_ASSERT(false); - } break; - } -} - -static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) { - if (node->grad == NULL) { - // this usually happens when we generate intermediate nodes from constants in the backward pass - // it can also happen during forward pass, if the user performs computations with constants - if (node->op != GGML_OP_NONE) { - //GGML_PRINT_DEBUG("%s: warning: node %p has no grad, but op %d\n", __func__, (void *) node, node->op); - } - } - - // check if already visited - for (int i = 0; i < cgraph->n_nodes; i++) { - if (cgraph->nodes[i] == node) { - return; - } - } - - for (int i = 0; i < cgraph->n_leafs; i++) { - if (cgraph->leafs[i] == node) { - return; - } - } - - if (node->src0) { - ggml_visit_parents(cgraph, node->src0); - } - - if (node->src1) { - ggml_visit_parents(cgraph, node->src1); - } - - for (int i = 0; i < GGML_MAX_OPT; ++i) { - if (node->opt[i]) { - ggml_visit_parents(cgraph, node->opt[i]); - } - } - - if (node->op == GGML_OP_NONE && node->grad == NULL) { - // reached a leaf node, not part of the gradient graph (e.g. 
a constant) - GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES); - - cgraph->leafs[cgraph->n_leafs] = node; - cgraph->n_leafs++; - } else { - GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES); - - cgraph->nodes[cgraph->n_nodes] = node; - cgraph->grads[cgraph->n_nodes] = node->grad; - cgraph->n_nodes++; - } -} - -static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) { - if (!expand) { - cgraph->n_nodes = 0; - cgraph->n_leafs = 0; - } - - const int n0 = cgraph->n_nodes; - UNUSED(n0); - - ggml_visit_parents(cgraph, tensor); - - const int n_new = cgraph->n_nodes - n0; - GGML_PRINT_DEBUG("%s: visited %d new nodes\n", __func__, n_new); - - if (n_new > 0) { - // the last added node should always be starting point - GGML_ASSERT(cgraph->nodes[cgraph->n_nodes - 1] == tensor); - } -} - -void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) { - ggml_build_forward_impl(cgraph, tensor, true); -} - -struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { - struct ggml_cgraph result = { - /*.n_nodes =*/ 0, - /*.n_leafs =*/ 0, - /*.n_threads =*/ GGML_DEFAULT_N_THREADS, - /*.work_size =*/ 0, - /*.work =*/ NULL, - /*.nodes =*/ { NULL }, - /*.grads =*/ { NULL }, - /*.leafs =*/ { NULL }, - /*.perf_runs =*/ 0, - /*.perf_cycles =*/ 0, - /*.perf_time_us =*/ 0, - }; - - ggml_build_forward_impl(&result, tensor, false); - - return result; -} - -struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) { - struct ggml_cgraph result = *gf; - - GGML_ASSERT(gf->n_nodes > 0); - - // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph - if (keep) { - for (int i = 0; i < gf->n_nodes; i++) { - struct ggml_tensor * node = gf->nodes[i]; - - if (node->grad) { - node->grad = ggml_dup_tensor(ctx, node); - gf->grads[i] = node->grad; - } - } - } - - for (int i = gf->n_nodes - 1; i >= 0; i--) { - struct ggml_tensor * node = gf->nodes[i]; - - // because we detached the grad nodes from the original graph, we can afford inplace operations - if (node->grad) { - ggml_compute_backward(ctx, node, keep); - } - } - - for (int i = gf->n_nodes - 1; i >= 0; i--) { - struct ggml_tensor * node = gf->nodes[i]; - - if (node->is_param) { - GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node); - ggml_build_forward_impl(&result, node->grad, true); - } - } - - return result; -} - -// -// thread data -// -// synchronization is done via busy loops -// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops -// - -#ifdef __APPLE__ - -//#include -// -//typedef os_unfair_lock ggml_lock_t; -// -//#define ggml_lock_init(x) UNUSED(x) -//#define ggml_lock_destroy(x) UNUSED(x) -//#define ggml_lock_lock os_unfair_lock_lock -//#define ggml_lock_unlock os_unfair_lock_unlock -// -//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT - -typedef int ggml_lock_t; - -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) -#define ggml_lock_lock(x) UNUSED(x) -#define ggml_lock_unlock(x) UNUSED(x) - -#define GGML_LOCK_INITIALIZER 0 - -typedef pthread_t ggml_thread_t; - -#define ggml_thread_create pthread_create -#define ggml_thread_join pthread_join - -#else - -//typedef pthread_spinlock_t ggml_lock_t; - -//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE) -//#define ggml_lock_destroy pthread_spin_destroy -//#define ggml_lock_lock pthread_spin_lock -//#define ggml_lock_unlock 
pthread_spin_unlock - -typedef int ggml_lock_t; - -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) -#define ggml_lock_lock(x) UNUSED(x) -#define ggml_lock_unlock(x) UNUSED(x) - -#define GGML_LOCK_INITIALIZER 0 - -typedef pthread_t ggml_thread_t; - -#define ggml_thread_create pthread_create -#define ggml_thread_join pthread_join - -#endif - -struct ggml_compute_state_shared { - ggml_lock_t spin; - - int n_threads; - - // synchronization primitives - atomic_int n_ready; - atomic_bool has_work; - atomic_bool stop; // stop all threads -}; - -struct ggml_compute_state { - ggml_thread_t thrd; - - struct ggml_compute_params params; - struct ggml_tensor * node; - - struct ggml_compute_state_shared * shared; -}; - -static thread_ret_t ggml_graph_compute_thread(void * data) { - struct ggml_compute_state * state = (struct ggml_compute_state *) data; - - const int n_threads = state->shared->n_threads; - - while (true) { - if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) { - atomic_store(&state->shared->has_work, false); - } else { - while (atomic_load(&state->shared->has_work)) { - if (atomic_load(&state->shared->stop)) { - return 0; - } - ggml_lock_lock (&state->shared->spin); - ggml_lock_unlock(&state->shared->spin); - } - } - - atomic_fetch_sub(&state->shared->n_ready, 1); - - // wait for work - while (!atomic_load(&state->shared->has_work)) { - if (atomic_load(&state->shared->stop)) { - return 0; - } - ggml_lock_lock (&state->shared->spin); - ggml_lock_unlock(&state->shared->spin); - } - - // check if we should stop - if (atomic_load(&state->shared->stop)) { - break; - } - - if (state->node) { - if (state->params.ith < state->params.nth) { - ggml_compute_forward(&state->params, state->node); - } - - state->node = NULL; - } else { - break; - } - } - - return 0; -} - -void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { - const int n_threads = cgraph->n_threads; - - struct ggml_compute_state_shared state_shared = { - /*.spin =*/ GGML_LOCK_INITIALIZER, - /*.n_threads =*/ n_threads, - /*.n_ready =*/ 0, - /*.has_work =*/ false, - /*.stop =*/ false, - }; - struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL; - - // create thread pool - if (n_threads > 1) { - ggml_lock_init(&state_shared.spin); - - atomic_store(&state_shared.has_work, true); - - for (int j = 0; j < n_threads - 1; j++) { - workers[j] = (struct ggml_compute_state) { - .thrd = 0, - .params = { - .type = GGML_TASK_COMPUTE, - .ith = j + 1, - .nth = n_threads, - .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0, - .wdata = cgraph->work ? 
cgraph->work->data : NULL, - }, - .node = NULL, - .shared = &state_shared, - }; - - int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); - GGML_ASSERT(rc == 0); - UNUSED(rc); - } - } - - // initialize tasks + work buffer - { - size_t work_size = 0; - - // thread scheduling for the different operations - for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * node = cgraph->nodes[i]; - - switch (node->op) { - case GGML_OP_CPY: - case GGML_OP_DUP: - { - node->n_tasks = n_threads; - - size_t cur = 0; - if (ggml_is_quantized(node->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads; - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_ADD: - { - node->n_tasks = n_threads; - - size_t cur = 0; - - if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads; - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_SUB: - case GGML_OP_MUL: - case GGML_OP_DIV: - case GGML_OP_SQR: - case GGML_OP_SQRT: - case GGML_OP_SUM: - case GGML_OP_MEAN: - case GGML_OP_REPEAT: - case GGML_OP_ABS: - case GGML_OP_SGN: - case GGML_OP_NEG: - case GGML_OP_STEP: - case GGML_OP_RELU: - { - node->n_tasks = 1; - } break; - case GGML_OP_GELU: - { - node->n_tasks = n_threads; - } break; - case GGML_OP_SILU: - { - node->n_tasks = n_threads; - } break; - case GGML_OP_NORM: - case GGML_OP_RMS_NORM: - { - node->n_tasks = n_threads; - } break; - case GGML_OP_MUL_MAT: - { - node->n_tasks = n_threads; - - // TODO: use different scheduling for different matrix sizes - //const int nr0 = ggml_nrows(node->src0); - //const int nr1 = ggml_nrows(node->src1); - - //node->n_tasks = MIN(n_threads, MAX(1, nr0/128)); - //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, node->n_tasks); - - size_t cur = 0; - - if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - //printf("src0: ne0 = %d, ne1 = %d, ne = %d\n", node->src0->ne[0], node->src0->ne[1], node->src0->ne[0]*node->src0->ne[1]); - //printf("src1: ne0 = %d, ne1 = %d, ne = %d\n", node->src1->ne[0], node->src1->ne[1], node->src1->ne[0]*node->src1->ne[1]); - //printf("cur = %zu\n", cur); - } else { - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); - } -#else - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); -#endif - } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) { - cur = 0; - } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } else -#endif - { - cur = GGML_TYPE_SIZE[GGML_TYPE_Q8_0]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[GGML_TYPE_Q8_0]; - } - } else { - GGML_ASSERT(false); - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_SCALE: - { - node->n_tasks = n_threads; - } break; - case GGML_OP_CONT: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - 
case GGML_OP_TRANSPOSE: - case GGML_OP_GET_ROWS: - case GGML_OP_DIAG_MASK_INF: - { - node->n_tasks = 1; - } break; - case GGML_OP_SOFT_MAX: - { - node->n_tasks = n_threads; - } break; - case GGML_OP_ROPE: - { - node->n_tasks = n_threads; - } break; - case GGML_OP_ALIBI: - { - node->n_tasks = 1; //TODO - } break; - case GGML_OP_CONV_1D_1S: - case GGML_OP_CONV_1D_2S: - { - node->n_tasks = n_threads; - - GGML_ASSERT(node->src0->ne[3] == 1); - GGML_ASSERT(node->src1->ne[2] == 1); - GGML_ASSERT(node->src1->ne[3] == 1); - - size_t cur = 0; - const int nk = node->src0->ne[0]; - - if (node->src0->type == GGML_TYPE_F16 && - node->src1->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*( - nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + - ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] - ); - } else if (node->src0->type == GGML_TYPE_F32 && - node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*( - nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + - ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] - ); - } else { - GGML_ASSERT(false); - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_FLASH_ATTN: - { - node->n_tasks = n_threads; - - size_t cur = 0; - - const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); - - if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 - } - - if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_FLASH_FF: - { - node->n_tasks = n_threads; - - size_t cur = 0; - - if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 - } - - if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 - } - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_MAP_UNARY: - case GGML_OP_MAP_BINARY: - { - node->n_tasks = 1; - } break; - case GGML_OP_NONE: - { - node->n_tasks = 1; - } break; - case GGML_OP_COUNT: - { - GGML_ASSERT(false); - } break; - } - } - - if (cgraph->work != NULL && work_size > cgraph->work_size) { - GGML_ASSERT(false); // TODO: better handling - } - - if (work_size > 0 && cgraph->work == NULL) { - cgraph->work_size = work_size + CACHE_LINE_SIZE*(n_threads - 1); - - GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size); - cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size); - } - } - - const int64_t perf_start_cycles = ggml_perf_cycles(); - const int64_t perf_start_time_us = ggml_perf_time_us(); - - for (int i = 0; i < cgraph->n_nodes; i++) { - GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes); - - struct ggml_tensor * node = cgraph->nodes[i]; - - // TODO: this could be used to avoid unnecessary computations, but it needs to be improved - //if (node->grad == NULL && node->perf_runs > 0) { - // continue; - //} - - const int64_t perf_node_start_cycles = ggml_perf_cycles(); - const int64_t perf_node_start_time_us = ggml_perf_time_us(); - - // INIT - struct ggml_compute_params 
params = { - /*.type =*/ GGML_TASK_INIT, - /*.ith =*/ 0, - /*.nth =*/ node->n_tasks, - /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, - /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, - }; - - ggml_compute_forward(&params, node); - - // COMPUTE - if (node->n_tasks > 1) { - if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { - atomic_store(&state_shared.has_work, false); - } - - while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - // launch thread pool - for (int j = 0; j < n_threads - 1; j++) { - workers[j].params = (struct ggml_compute_params) { - .type = GGML_TASK_COMPUTE, - .ith = j + 1, - .nth = node->n_tasks, - .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0, - .wdata = cgraph->work ? cgraph->work->data : NULL, - }; - workers[j].node = node; - } - - atomic_fetch_sub(&state_shared.n_ready, 1); - - while (atomic_load(&state_shared.n_ready) > 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - atomic_store(&state_shared.has_work, true); - } - - params.type = GGML_TASK_COMPUTE; - ggml_compute_forward(&params, node); - - // wait for thread pool - if (node->n_tasks > 1) { - if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { - atomic_store(&state_shared.has_work, false); - } - - while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - atomic_fetch_sub(&state_shared.n_ready, 1); - - while (atomic_load(&state_shared.n_ready) != 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - } - - // FINALIZE - if (node->n_tasks > 1) { - if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { - atomic_store(&state_shared.has_work, false); - } - - while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - // launch thread pool - for (int j = 0; j < n_threads - 1; j++) { - workers[j].params = (struct ggml_compute_params) { - .type = GGML_TASK_FINALIZE, - .ith = j + 1, - .nth = node->n_tasks, - .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0, - .wdata = cgraph->work ? 
cgraph->work->data : NULL, - }; - workers[j].node = node; - } - - atomic_fetch_sub(&state_shared.n_ready, 1); - - while (atomic_load(&state_shared.n_ready) > 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - atomic_store(&state_shared.has_work, true); - } - - params.type = GGML_TASK_FINALIZE; - ggml_compute_forward(&params, node); - - // wait for thread pool - if (node->n_tasks > 1) { - if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { - atomic_store(&state_shared.has_work, false); - } - - while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - atomic_fetch_sub(&state_shared.n_ready, 1); - - while (atomic_load(&state_shared.n_ready) != 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - } - - // performance stats (node) - { - int64_t perf_cycles_cur = ggml_perf_cycles() - perf_node_start_cycles; - int64_t perf_time_us_cur = ggml_perf_time_us() - perf_node_start_time_us; - - node->perf_runs++; - node->perf_cycles += perf_cycles_cur; - node->perf_time_us += perf_time_us_cur; - } - } - - // join thread pool - if (n_threads > 1) { - atomic_store(&state_shared.stop, true); - atomic_store(&state_shared.has_work, true); - - for (int j = 0; j < n_threads - 1; j++) { - int rc = ggml_thread_join(workers[j].thrd, NULL); - GGML_ASSERT(rc == 0); - UNUSED(rc); - } - - ggml_lock_destroy(&state_shared.spin); - } - - // performance stats (graph) - { - int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles; - int64_t perf_time_us_cur = ggml_perf_time_us() - perf_start_time_us; - - cgraph->perf_runs++; - cgraph->perf_cycles += perf_cycles_cur; - cgraph->perf_time_us += perf_time_us_cur; - - GGML_PRINT_DEBUG("%s: perf (%d) - cpu = %.3f / %.3f ms, wall = %.3f / %.3f ms\n", - __func__, cgraph->perf_runs, - (double) perf_cycles_cur / (double) ggml_cycles_per_ms(), - (double) cgraph->perf_cycles / (double) ggml_cycles_per_ms() / (double) cgraph->perf_runs, - (double) perf_time_us_cur / 1000.0, - (double) cgraph->perf_time_us / 1000.0 / cgraph->perf_runs); - } -} - -void ggml_graph_reset(struct ggml_cgraph * cgraph) { - for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * grad = cgraph->grads[i]; - - if (grad) { - ggml_set_zero(grad); - } - } -} - -void ggml_graph_print(const struct ggml_cgraph * cgraph) { - int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0}; - - GGML_PRINT("=== GRAPH ===\n"); - - GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads); - GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size); - - GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes); - for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * node = cgraph->nodes[i]; - - perf_total_per_op_us[node->op] += node->perf_time_us; - - GGML_PRINT(" - %3d: [ %" PRId64 ", %" PRId64 ", %" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n", - i, - node->ne[0], node->ne[1], node->ne[2], - GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? 
"g" : " ", node->perf_runs, - (double) node->perf_cycles / (double) ggml_cycles_per_ms(), - (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs, - (double) node->perf_time_us / 1000.0, - (double) node->perf_time_us / 1000.0 / node->perf_runs); - } - - GGML_PRINT("n_leafs = %d\n", cgraph->n_leafs); - for (int i = 0; i < cgraph->n_leafs; i++) { - struct ggml_tensor * node = cgraph->leafs[i]; - - GGML_PRINT(" - %3d: [ %" PRId64 ", %" PRId64 "] %8s\n", - i, - node->ne[0], node->ne[1], - GGML_OP_LABEL[node->op]); - } - - for (int i = 0; i < GGML_OP_COUNT; i++) { - GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_LABEL[i], (double) perf_total_per_op_us[i] / 1000.0); - } - - GGML_PRINT("========================================\n"); -} - -// check if node is part of the graph -static bool ggml_graph_find(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) { - if (cgraph == NULL) { - return true; - } - - for (int i = 0; i < cgraph->n_nodes; i++) { - if (cgraph->nodes[i] == node) { - return true; - } - } - - return false; -} - -static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) { - for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * parent = cgraph->nodes[i]; - - if (parent->grad == node) { - return parent; - } - } - - return NULL; -} - -void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) { - char color[16]; - - FILE * fp = fopen(filename, "w"); - GGML_ASSERT(fp); - - fprintf(fp, "digraph G {\n"); - fprintf(fp, " newrank = true;\n"); - fprintf(fp, " rankdir = LR;\n"); - - for (int i = 0; i < gb->n_nodes; i++) { - struct ggml_tensor * node = gb->nodes[i]; - - if (ggml_graph_get_parent(gb, node) != NULL) { - continue; - } - - if (node->is_param) { - snprintf(color, sizeof(color), "yellow"); - } else if (node->grad) { - if (ggml_graph_find(gf, node)) { - snprintf(color, sizeof(color), "green"); - } else { - snprintf(color, sizeof(color), "lightblue"); - } - } else { - snprintf(color, sizeof(color), "white"); - } - - fprintf(fp, " \"%p\" [ \ -style = filled; fillcolor = %s; shape = record; \ -label=\"%d [%" PRId64 ", %" PRId64 "] | %s", - (void *) node, color, - i, node->ne[0], node->ne[1], - GGML_OP_SYMBOL[node->op]); - - if (node->grad) { - fprintf(fp, " | %s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]); - } else { - fprintf(fp, "\"; ]\n"); - } - } - - for (int i = 0; i < gb->n_leafs; i++) { - struct ggml_tensor * node = gb->leafs[i]; - - snprintf(color, sizeof(color), "pink"); - - if (ggml_nelements(node) == 1) { - fprintf(fp, " \"%p\" [ \ -style = filled; fillcolor = %s; shape = record; \ -label=\"%.1e\"; ]\n", - (void *) node, color, (double)ggml_get_f32_1d(node, 0)); - } else { - fprintf(fp, " \"%p\" [ \ -style = filled; fillcolor = %s; shape = record; \ -label=\"CONST %d [%" PRId64 ", %" PRId64 "]\"; ]\n", - (void *) node, color, - i, node->ne[0], node->ne[1]); - } - } - - for (int i = 0; i < gb->n_nodes; i++) { - struct ggml_tensor * node = gb->nodes[i]; - - struct ggml_tensor * parent = ggml_graph_get_parent(gb, node); - - if (node->src0) { - struct ggml_tensor * parent0 = ggml_graph_get_parent(gb, node->src0); - - fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"x\"; ]\n", - parent0 ? (void *) parent0 : (void *) node->src0, - parent0 ? "g" : "x", - parent ? (void *) parent : (void *) node, - parent ? "g" : "x", - parent ? "empty" : "vee", - parent ? 
"dashed" : "solid"); - } - - if (node->src1) { - struct ggml_tensor * parent1 = ggml_graph_get_parent(gb, node->src1); - - fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"y\"; ]\n", - parent1 ? (void *) parent1 : (void *) node->src1, - parent1 ? "g" : "x", - parent ? (void *) parent : (void *) node, - parent ? "g" : "x", - parent ? "empty" : "vee", - parent ? "dashed" : "solid"); - } - } - - for (int i = 0; i < gb->n_leafs; i++) { - struct ggml_tensor * node = gb->leafs[i]; - - if (node->src0) { - fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"x\"; ]\n", - (void *) node->src0, "x", - (void *) node, "x"); - } - - if (node->src1) { - fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"y\"; ]\n", - (void *) node->src1, "x", - (void *) node, "x"); - } - } - - fprintf(fp, "}\n"); - - fclose(fp); - - GGML_PRINT("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename); -} - -//////////////////////////////////////////////////////////////////////////////// - -static void ggml_opt_set_params(int np, struct ggml_tensor * const ps[], const float * x) { - int i = 0; - for (int p = 0; p < np; ++p) { - const int64_t ne = ggml_nelements(ps[p]) ; - // TODO: add function to set tensor from array - for (int64_t j = 0; j < ne; ++j) { - ggml_set_f32_1d(ps[p], j, x[i++]); - } - } -} - -static void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float * x) { - int i = 0; - for (int p = 0; p < np; ++p) { - const int64_t ne = ggml_nelements(ps[p]) ; - // TODO: add function to get all elements at once - for (int64_t j = 0; j < ne; ++j) { - x[i++] = ggml_get_f32_1d(ps[p], j); - } - } -} - -static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g) { - int i = 0; - for (int p = 0; p < np; ++p) { - const int64_t ne = ggml_nelements(ps[p]) ; - // TODO: add function to get all elements at once - for (int64_t j = 0; j < ne; ++j) { - g[i++] = ggml_get_f32_1d(ps[p]->grad, j); - } - } -} - -// -// ADAM -// -// ref: https://arxiv.org/pdf/1412.6980.pdf -// - -static enum ggml_opt_result ggml_opt_adam( - struct ggml_context * ctx, - struct ggml_opt_params params, - struct ggml_tensor * f, - struct ggml_cgraph * gf, - struct ggml_cgraph * gb) { - GGML_ASSERT(ggml_is_scalar(f)); - - gf->n_threads = params.n_threads; - gb->n_threads = params.n_threads; - - // these will store the parameters we want to optimize - struct ggml_tensor * ps[GGML_MAX_PARAMS]; - - int np = 0; - int nx = 0; - for (int i = 0; i < gf->n_nodes; ++i) { - if (gf->nodes[i]->is_param) { - GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); - - GGML_ASSERT(np < GGML_MAX_PARAMS); - - ps[np++] = gf->nodes[i]; - nx += ggml_nelements(gf->nodes[i]); - } - } - - // constants - const float alpha = params.adam.alpha; - const float beta1 = params.adam.beta1; - const float beta2 = params.adam.beta2; - const float eps = params.adam.eps; - - float * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // view of the parameters - float * g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // gradient - float * g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // gradient squared - float * m = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // first moment - float * v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // second moment - float * mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // first moment hat - float * vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // second moment hat - - float * pf = params.past > 0 ? 
ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)->data : NULL; // past function values - - // initialize - ggml_vec_set_f32(nx, m, 0.0f); - ggml_vec_set_f32(nx, v, 0.0f); - - // update view - ggml_opt_get_params(np, ps, x); - - // compute the function value - ggml_graph_reset (gf); - ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); - - float fx_prev = ggml_get_f32_1d(f, 0); - if (pf) { - pf[0] = fx_prev; - } - - int n_no_improvement = 0; - float fx_best = fx_prev; - - // run the optimizer - for (int t = 0; t < params.adam.n_iter; ++t) { - GGML_PRINT_DEBUG ("=== iter %d ===\n", t); - - GGML_PRINT_DEBUG ("f = %10.6f\n", ggml_get_f32_1d(f, 0)); - GGML_PRINT_DEBUG_5("df/dx0 = %10.6f\n", ggml_get_f32_1d(ps[0]->grad, 0)); - GGML_PRINT_DEBUG_5("df/dx1 = %10.6f\n", ggml_get_f32_1d(ps[1]->grad, 0)); - - for (int i = 0; i < np; ++i) { - GGML_PRINT_DEBUG("param %d: %10.6f, g = %10.6f\n", i, - ggml_get_f32_1d(ps[i], 0), ggml_get_f32_1d(ps[i]->grad, 0)); - } - - const int64_t t_start_wall = ggml_time_us(); - const int64_t t_start_cpu = ggml_cycles(); - UNUSED(t_start_wall); - UNUSED(t_start_cpu); - - { - // update the gradient - ggml_opt_get_grad(np, ps, g1); - - // m_t = beta1*m_t-1 + (1 - beta1)*g_t - ggml_vec_scale_f32(nx, m, beta1); - ggml_vec_mad_f32 (nx, m, g1, 1.0f - beta1); - - // g2 = g1^2 - ggml_vec_sqr_f32 (nx, g2, g1); - - // v_t = beta2*v_t-1 + (1 - beta2)*g_t^2 - ggml_vec_scale_f32(nx, v, beta2); - ggml_vec_mad_f32 (nx, v, g2, 1.0f - beta2); - - // m^hat = m_t / (1 - beta1^t) - // v^hat = v_t / (1 - beta2^t) - // x_t = x_t-1 - alpha*m^hat/(sqrt(v^hat) + eps) - ggml_vec_cpy_f32 (nx, mh, m); - ggml_vec_cpy_f32 (nx, vh, v); - - ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, t + 1))); - ggml_vec_scale_f32(nx, vh, 1.0f/(1.0f - powf(beta2, t + 1))); - - ggml_vec_sqrt_f32 (nx, vh, vh); - ggml_vec_acc1_f32 (nx, vh, eps); - - ggml_vec_div_f32 (nx, mh, mh, vh); - ggml_vec_sub_f32 (nx, x, x, mh); - - // update the parameters - ggml_opt_set_params(np, ps, x); - } - - ggml_graph_reset (gf); - ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); - - const float fx = ggml_get_f32_1d(f, 0); - - // check convergence - if (fabsf(fx - fx_prev)/fx < params.adam.eps_f) { - GGML_PRINT_DEBUG("converged\n"); - - return GGML_OPT_OK; - } - - // delta-based convergence test - if (pf != NULL) { - // need at least params.past iterations to start checking for convergence - if (params.past <= t) { - const float rate = (pf[t%params.past] - fx)/fx; - - if (fabsf(rate) < params.delta) { - return GGML_OPT_OK; - } - } - - pf[t%params.past] = fx; - } - - // check for improvement - if (params.max_no_improvement > 0) { - if (fx_best > fx) { - fx_best = fx; - n_no_improvement = 0; - } else { - ++n_no_improvement; - - if (n_no_improvement >= params.max_no_improvement) { - return GGML_OPT_OK; - } - } - } - - fx_prev = fx; - - { - const int64_t t_end_cpu = ggml_cycles(); - GGML_PRINT_DEBUG("time iter: %5.3f s\n", ((float)(t_end_cpu - t_start_cpu))/CLOCKS_PER_SEC); - UNUSED(t_end_cpu); - - const int64_t t_end_wall = ggml_time_us(); - GGML_PRINT_DEBUG("wall time iter: %5.3f s\n", (t_end_wall - t_start_wall)/1e6); - UNUSED(t_end_wall); - } - } - - return GGML_OPT_DID_NOT_CONVERGE; -} - -// -// L-BFGS -// -// the L-BFGS implementation below is based on the following implementation: -// -// https://github.com/chokkan/liblbfgs -// - -struct ggml_lbfgs_iteration_data { - float alpha; - float ys; - float * s; - float * y; -}; - -static enum ggml_opt_result linesearch_backtracking( - struct ggml_context * ctx, 
- const struct ggml_opt_params * params, - int nx, - float * x, - float * fx, - float * g, - float * d, - float * step, - const float * xp, - struct ggml_tensor * f, - struct ggml_cgraph * gf, - struct ggml_cgraph * gb, - const int np, - struct ggml_tensor * ps[]) { - int count = 0; - - float width = 0.0f; - float dg = 0.0f; - float finit = 0.0f; - float dginit = 0.0f; - float dgtest = 0.0f; - - const float dec = 0.5f; - const float inc = 2.1f; - - if (*step <= 0.f) { - return GGML_LINESEARCH_INVALID_PARAMETERS; - } - - // compute the initial gradient in the search direction - ggml_vec_dot_f32(nx, &dginit, g, d); - - // make sure that d points to a descent direction - if (0 < dginit) { - return GGML_LINESEARCH_FAIL; - } - - // initialize local variables - finit = *fx; - dgtest = params->lbfgs.ftol*dginit; - - while (true) { - ggml_vec_cpy_f32(nx, x, xp); - ggml_vec_mad_f32(nx, x, d, *step); - - // evaluate the function and gradient values - { - ggml_opt_set_params(np, ps, x); - - ggml_graph_reset (gf); - ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); - - ggml_opt_get_grad(np, ps, g); - - *fx = ggml_get_f32_1d(f, 0); - } - - ++count; - - if (*fx > finit + (*step)*dgtest) { - width = dec; - } else { - // Armijo condition is satisfied - if (params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_ARMIJO) { - return count; - } - - ggml_vec_dot_f32(nx, &dg, g, d); - - // check the Wolfe condition - if (dg < params->lbfgs.wolfe * dginit) { - width = inc; - } else { - if(params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE) { - // regular Wolfe conditions - return count; - } - - if(dg > -params->lbfgs.wolfe*dginit) { - width = dec; - } else { - // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) - return count; - } - return count; - } - } - - if (*step < params->lbfgs.min_step) { - return GGML_LINESEARCH_MINIMUM_STEP; - } - if (*step > params->lbfgs.max_step) { - return GGML_LINESEARCH_MAXIMUM_STEP; - } - if (params->lbfgs.max_linesearch <= count) { - return GGML_LINESEARCH_MAXIMUM_ITERATIONS; - } - - (*step) *= width; - } - - return GGML_LINESEARCH_FAIL; -} - -static enum ggml_opt_result ggml_opt_lbfgs( - struct ggml_context * ctx, - struct ggml_opt_params params, - struct ggml_tensor * f, - struct ggml_cgraph * gf, - struct ggml_cgraph * gb) { - if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE || - params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) { - if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) { - return GGML_OPT_INVALID_WOLFE; - } - } - - gf->n_threads = params.n_threads; - gb->n_threads = params.n_threads; - - const int m = params.lbfgs.m; - - // these will store the parameters we want to optimize - struct ggml_tensor * ps[GGML_MAX_PARAMS]; - - int np = 0; - int nx = 0; - for (int i = 0; i < gf->n_nodes; ++i) { - if (gf->nodes[i]->is_param) { - GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); - - GGML_ASSERT(np < GGML_MAX_PARAMS); - - ps[np++] = gf->nodes[i]; - nx += ggml_nelements(gf->nodes[i]); - } - } - - float * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // current parameters - float * xp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // previous parameters - float * g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // current gradient - float * gp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // previous gradient - float * d = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // search direction - - float * pf = 
params.past > 0 ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)->data : NULL; // past function values - - float fx = 0.0f; // cost function value - float xnorm = 0.0f; // ||x|| - float gnorm = 0.0f; // ||g|| - float step = 0.0f; - - // initialize x from the graph nodes - ggml_opt_get_params(np, ps, x); - - // the L-BFGS memory - struct ggml_lbfgs_iteration_data * lm = alloca(sizeof(struct ggml_lbfgs_iteration_data)*m); - - for (int i = 0; i < m; ++i) { - lm[i].alpha = 0.0f; - lm[i].ys = 0.0f; - lm[i].s = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; - lm[i].y = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; - } - - // evaluate the function value and its gradient - { - ggml_opt_set_params(np, ps, x); - - ggml_graph_reset (gf); - ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); - - ggml_opt_get_grad(np, ps, g); - - fx = ggml_get_f32_1d(f, 0); - } - - if (pf) { - pf[0] = fx; - } - - float fx_best = fx; - - // search direction = -gradient - ggml_vec_neg_f32(nx, d, g); - - // ||x||, ||g|| - ggml_vec_norm_f32(nx, &xnorm, x); - ggml_vec_norm_f32(nx, &gnorm, g); - - if (xnorm < 1.0f) { - xnorm = 1.0f; - } - - // already optimized - if (gnorm/xnorm <= params.lbfgs.eps) { - return GGML_OPT_OK; - } - - // initial step - ggml_vec_norm_inv_f32(nx, &step, d); - - int j = 0; - int k = 1; - int ls = 0; - int end = 0; - int bound = 0; - int n_no_improvement = 0; - - float ys = 0.0f; - float yy = 0.0f; - float beta = 0.0f; - - while (true) { - // store the current position and gradient vectors - ggml_vec_cpy_f32(nx, xp, x); - ggml_vec_cpy_f32(nx, gp, g); - - ls = linesearch_backtracking(ctx, &params, nx, x, &fx, g, d, &step, xp, f, gf, gb, np, ps); - - if (ls < 0) { - // linesearch failed - go back to the previous point and return - ggml_vec_cpy_f32(nx, x, xp); - ggml_vec_cpy_f32(nx, g, gp); - - return ls; - } - - ggml_vec_norm_f32(nx, &xnorm, x); - ggml_vec_norm_f32(nx, &gnorm, g); - - GGML_PRINT_DEBUG("f = %10.6f\n", ggml_get_f32_1d(f, 0)); - - if (xnorm < 1.0f) { - xnorm = 1.0f; - } - if (gnorm/xnorm <= params.lbfgs.eps) { - // converged - return GGML_OPT_OK; - } - - // delta-based convergence test - if (pf != NULL) { - // need at least params.past iterations to start checking for convergence - if (params.past <= k) { - const float rate = (pf[k%params.past] - fx)/fx; - - if (fabsf(rate) < params.delta) { - return GGML_OPT_OK; - } - } - - pf[k%params.past] = fx; - } - - // check for improvement - if (params.max_no_improvement > 0) { - if (fx < fx_best) { - fx_best = fx; - n_no_improvement = 0; - } else { - n_no_improvement++; - - if (n_no_improvement >= params.max_no_improvement) { - return GGML_OPT_OK; - } - } - } - - if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < k + 1) { - // reached the maximum number of iterations - return GGML_OPT_DID_NOT_CONVERGE; - } - - // update vectors s and y: - // s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}. - // y_{k+1} = g_{k+1} - g_{k}. - // - ggml_vec_sub_f32(nx, lm[end].s, x, xp); - ggml_vec_sub_f32(nx, lm[end].y, g, gp); - - // compute scalars ys and yy: - // ys = y^t \cdot s -> 1 / \rho. - // yy = y^t \cdot y. - // - ggml_vec_dot_f32(nx, &ys, lm[end].y, lm[end].s); - ggml_vec_dot_f32(nx, &yy, lm[end].y, lm[end].y); - - lm[end].ys = ys; - - // find new search direction - // ref: https://en.wikipedia.org/wiki/Limited-memory_BFGS - - bound = (m <= k) ? 
m : k; - k++; - end = (end + 1)%m; - - // initialize search direction with -g - ggml_vec_neg_f32(nx, d, g); - - j = end; - for (int i = 0; i < bound; ++i) { - j = (j + m - 1) % m; - // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1} - ggml_vec_dot_f32(nx, &lm[j].alpha, lm[j].s, d); - lm[j].alpha /= lm[j].ys; - // q_{i} = q_{i+1} - \alpha_{i} y_{i} - ggml_vec_mad_f32(nx, d, lm[j].y, -lm[j].alpha); - } - - ggml_vec_scale_f32(nx, d, ys/yy); - - for (int i = 0; i < bound; ++i) { - // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i} - ggml_vec_dot_f32(nx, &beta, lm[j].y, d); - beta /= lm[j].ys; - // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j} - ggml_vec_mad_f32(nx, d, lm[j].s, lm[j].alpha - beta); - j = (j + 1)%m; - } - - step = 1.0; - } - - return GGML_OPT_DID_NOT_CONVERGE; -} - -struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { - struct ggml_opt_params result; - - switch (type) { - case GGML_OPT_ADAM: - { - result = (struct ggml_opt_params) { - .type = GGML_OPT_ADAM, - .n_threads = 1, - .past = 0, - .delta = 1e-5f, - - .max_no_improvement = 100, - - .print_forward_graph = true, - .print_backward_graph = true, - - .adam = { - .n_iter = 10000, - .alpha = 0.001f, - .beta1 = 0.9f, - .beta2 = 0.999f, - .eps = 1e-8f, - .eps_f = 1e-5f, - .eps_g = 1e-3f, - }, - }; - } break; - case GGML_OPT_LBFGS: - { - result = (struct ggml_opt_params) { - .type = GGML_OPT_LBFGS, - .n_threads = 1, - .past = 0, - .delta = 1e-5f, - - .max_no_improvement = 0, - - .print_forward_graph = true, - .print_backward_graph = true, - - .lbfgs = { - .m = 6, - .n_iter = 100, - .max_linesearch = 20, - - .eps = 1e-5f, - .ftol = 1e-4f, - .wolfe = 0.9f, - .min_step = 1e-20f, - .max_step = 1e+20f, - - .linesearch = GGML_LINESEARCH_DEFAULT, - }, - }; - } break; - } - - return result; -} - -enum ggml_opt_result ggml_opt( - struct ggml_context * ctx, - struct ggml_opt_params params, - struct ggml_tensor * f) { - bool free_ctx = false; - if (ctx == NULL) { - struct ggml_init_params params_ctx = { - .mem_size = 16*1024*1024, - .mem_buffer = NULL, - .no_alloc = false, - }; - - ctx = ggml_init(params_ctx); - if (ctx == NULL) { - return GGML_OPT_NO_CONTEXT; - } - - free_ctx = true; - } - - enum ggml_opt_result result = GGML_OPT_OK; - - // build forward + backward compute graphs - struct ggml_cgraph gf = ggml_build_forward (f); - struct ggml_cgraph gb = ggml_build_backward(ctx, &gf, false); - - switch (params.type) { - case GGML_OPT_ADAM: - { - result = ggml_opt_adam(ctx, params, f, &gf, &gb); - } break; - case GGML_OPT_LBFGS: - { - result = ggml_opt_lbfgs(ctx, params, f, &gf, &gb); - } break; - } - - if (params.print_forward_graph) { - ggml_graph_print (&gf); - ggml_graph_dump_dot(&gf, NULL, "opt-forward.dot"); - } - - if (params.print_backward_graph) { - ggml_graph_print (&gb); - ggml_graph_dump_dot(&gb, &gf, "opt-backward.dot"); - } - - if (free_ctx) { - ggml_free(ctx); - } - - return result; -} - -//////////////////////////////////////////////////////////////////////////////// - -size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) { - assert(k % QK4_0 == 0); - const int nb = k / QK4_0; - - for (int j = 0; j < n; j += k) { - block_q4_0 * restrict y = (block_q4_0 *)dst + j/QK4_0; - - quantize_row_q4_0_reference(src + j, y, k); - - for (int i = 0; i < nb; i++) { - for (int l = 0; l < QK4_0; l += 2) { - const uint8_t vi0 = y[i].qs[l/2] & 0xF; - const uint8_t vi1 = y[i].qs[l/2] >> 4; - - hist[vi0]++; - hist[vi1]++; - } - } - } - - return (n/QK4_0*sizeof(block_q4_0)); -} - 
-size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) { - assert(k % QK4_1 == 0); - const int nb = k / QK4_1; - - for (int j = 0; j < n; j += k) { - block_q4_1 * restrict y = (block_q4_1 *)dst + j/QK4_1; - - quantize_row_q4_1_reference(src + j, y, k); - - for (int i = 0; i < nb; i++) { - for (int l = 0; l < QK4_1; l += 2) { - const uint8_t vi0 = y[i].qs[l/2] & 0xF; - const uint8_t vi1 = y[i].qs[l/2] >> 4; - - hist[vi0]++; - hist[vi1]++; - } - } - } - - return (n/QK4_1*sizeof(block_q4_1)); -} - -size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist) { - assert(k % QK4_2 == 0); - const int nb = k / QK4_2; - - for (int j = 0; j < n; j += k) { - block_q4_2 * restrict y = (block_q4_2 *)dst + j/QK4_2; - - //quantize_row_q4_2_reference(src + j, y, k); - quantize_row_q4_2_rmse(src + j, y, k); - - for (int i = 0; i < nb; i++) { - for (int l = 0; l < QK4_2; l += 2) { - const uint8_t vi0 = y[i].qs[l/2] & 0xF; - const uint8_t vi1 = y[i].qs[l/2] >> 4; - - hist[vi0]++; - hist[vi1]++; - } - } - } - - return (n/QK4_2*sizeof(block_q4_2)); -} - -size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist) { - assert(k % QK4_3 == 0); - const int nb = k / QK4_3; - - for (int j = 0; j < n; j += k) { - block_q4_3 * restrict y = (block_q4_3 *)dst + j/QK4_3; - - quantize_row_q4_3_reference(src + j, y, k); - - for (int i = 0; i < nb; i++) { - for (int l = 0; l < QK4_3; l += 2) { - const uint8_t vi0 = y[i].qs[l/2] & 0xF; - const uint8_t vi1 = y[i].qs[l/2] >> 4; - - hist[vi0]++; - hist[vi1]++; - } - } - } - - return (n/QK4_3*sizeof(block_q4_3)); -} - -size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) { - size_t result = 0; - switch (type) { - case GGML_TYPE_Q4_0: - { - GGML_ASSERT(start % QK4_0 == 0); - block_q4_0 * block = (block_q4_0*)dst + start / QK4_0; - result = ggml_quantize_q4_0(src + start, block, n, n, hist); - } break; - case GGML_TYPE_Q4_1: - { - GGML_ASSERT(start % QK4_1 == 0); - block_q4_1 * block = (block_q4_1*)dst + start / QK4_1; - result = ggml_quantize_q4_1(src + start, block, n, n, hist); - } break; - case GGML_TYPE_Q4_2: - { - GGML_ASSERT(start % QK4_2 == 0); - block_q4_2 * block = (block_q4_2*)dst + start / QK4_2; - result = ggml_quantize_q4_2(src + start, block, n, n, hist); - } break; - case GGML_TYPE_Q4_3: - { - GGML_ASSERT(start % QK4_3 == 0); - block_q4_3 * block = (block_q4_3*)dst + start / QK4_3; - result = ggml_quantize_q4_3(src + start, block, n, n, hist); - } break; - default: - assert(false); - } - return result; -} - -//////////////////////////////////////////////////////////////////////////////// - -int ggml_cpu_has_avx(void) { -#if defined(__AVX__) - return 1; -#else - return 0; -#endif -} - -int ggml_cpu_has_avx2(void) { -#if defined(__AVX2__) - return 1; -#else - return 0; -#endif -} - -int ggml_cpu_has_avx512(void) { -#if defined(__AVX512F__) - return 1; -#else - return 0; -#endif -} - -int ggml_cpu_has_avx512_vbmi(void) { -#if defined(__AVX512VBMI__) - return 1; -#else - return 0; -#endif -} - -int ggml_cpu_has_avx512_vnni(void) { -#if defined(__AVX512VNNI__) - return 1; -#else - return 0; -#endif -} - -int ggml_cpu_has_fma(void) { -#if defined(__FMA__) - return 1; -#else - return 0; -#endif -} - -int ggml_cpu_has_neon(void) { -#if defined(__ARM_NEON) - return 1; -#else - return 0; -#endif -} - -int ggml_cpu_has_arm_fma(void) { -#if defined(__ARM_FEATURE_FMA) - return 1; -#else - return 0; -#endif -} - -int 
ggml_cpu_has_f16c(void) { -#if defined(__F16C__) - return 1; -#else - return 0; -#endif -} - -int ggml_cpu_has_fp16_va(void) { -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - return 1; -#else - return 0; -#endif -} - -int ggml_cpu_has_wasm_simd(void) { -#if defined(__wasm_simd128__) - return 1; -#else - return 0; -#endif -} - -int ggml_cpu_has_blas(void) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) - return 1; -#else - return 0; -#endif -} - -int ggml_cpu_has_cublas(void) { -#if defined(GGML_USE_CUBLAS) - return 1; -#else - return 0; -#endif -} - -int ggml_cpu_has_sse3(void) { -#if defined(__SSE3__) - return 1; -#else - return 0; -#endif -} - -int ggml_cpu_has_vsx(void) { -#if defined(__POWER9_VECTOR__) - return 1; -#else - return 0; -#endif -} - -//////////////////////////////////////////////////////////////////////////////// diff --git a/ggml-sys/ggml/ggml.h b/ggml-sys/ggml/ggml.h deleted file mode 100644 index dfba33b7..00000000 --- a/ggml-sys/ggml/ggml.h +++ /dev/null @@ -1,875 +0,0 @@ -#pragma once - -// -// GGML Tensor Library -// -// This documentation is still a work in progress. -// If you wish some specific topics to be covered, feel free to drop a comment: -// -// https://github.com/ggerganov/whisper.cpp/issues/40 -// -// ## Overview -// -// This library implements: -// -// - a set of tensor operations -// - automatic differentiation -// - basic optimization algorithms -// -// The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes, -// but is not limited to, the following: -// -// - linear regression -// - support vector machines -// - neural networks -// -// The library allows the user to define a certain function using the available tensor operations. This function -// definition is represented internally via a computation graph. Each tensor operation in the function definition -// corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the -// function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized -// using one of the available optimization algorithms. -// -// For example, here we define the function: f(x) = a*x^2 + b -// -// { -// struct ggml_init_params params = { -// .mem_size = 16*1024*1024, -// .mem_buffer = NULL, -// }; -// -// // memory allocation happens here -// struct ggml_context * ctx = ggml_init(params); -// -// struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); -// -// ggml_set_param(ctx, x); // x is an input variable -// -// struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); -// struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); -// struct ggml_tensor * x2 = ggml_mul(ctx, x, x); -// struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x2), b); -// -// ... -// } -// -// Notice that the function definition above does not involve any actual computation. The computation is performed only -// when the user explicitly requests it. For example, to compute the function's value at x = 2.0: -// -// { -// ... -// -// struct ggml_cgraph gf = ggml_build_forward(f); -// -// // set the input variable and parameter values -// ggml_set_f32(x, 2.0f); -// ggml_set_f32(a, 3.0f); -// ggml_set_f32(b, 4.0f); -// -// ggml_graph_compute(ctx0, &gf); -// -// printf("f = %f\n", ggml_get_f32_1d(f, 0)); -// -// ... -// } -// -// The actual computation is performed in the ggml_graph_compute() function. 
-// -// The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the -// ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know -// in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory -// and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was -// actually needed. -// -// The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic -// differentiation and optimization algorithms. -// -// The described approach allows to define the function graph once and then compute its forward or backward graphs -// multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way -// the user can avoid the memory allocation overhead at runtime. -// -// The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class -// citizens, but in theory the library can be extended to support FP8 and integer data types. -// -// Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary -// and binary operations. Most of the available operations fall into one of these two categories. With time, it became -// clear that the library needs to support more complex operations. The way to support these operations is not clear -// yet, but a few examples are demonstrated in the following operations: -// -// - ggml_permute() -// - ggml_conv_1d_1s() -// - ggml_conv_1d_2s() -// -// For each tensor operator, the library implements a forward and backward computation function. The forward function -// computes the output tensor value given the input tensor values. The backward function computes the adjoint of the -// input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a -// calculus class, or watch the following video: -// -// What is Automatic Differentiation? -// https://www.youtube.com/watch?v=wG_nF1awSSY -// -// -// ## Tensor data (struct ggml_tensor) -// -// The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of -// the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains -// pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example: -// -// { -// struct ggml_tensor * c = ggml_add(ctx, a, b); -// -// assert(c->src[0] == a); -// assert(c->src[1] == b); -// } -// -// The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the -// number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows -// to store tensors that are not contiguous in memory, which is useful for operations such as transposition and -// permutation. All tensor operations have to take the stride into account and not assume that the tensor is -// contiguous in memory. -// -// The data of the tensor is accessed via the "data" pointer. For example: -// -// { -// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); -// -// // a[1, 2] = 1.0f; -// *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f; -// -// // a[2, 0] = 2.0f; -// *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f; -// -// ... 
-// } -// -// Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used. -// -// ## The matrix multiplication operator (ggml_mul_mat) -// -// TODO -// -// -// ## Multi-threading -// -// TODO -// -// -// ## Overview of ggml.c -// -// TODO -// -// -// ## SIMD optimizations -// -// TODO -// -// -// ## Debugging ggml -// -// TODO -// -// - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include -#include - -#define GGML_MAX_DIMS 4 -#define GGML_MAX_NODES 4096 -#define GGML_MAX_PARAMS 16 -#define GGML_MAX_CONTEXTS 64 -#define GGML_MAX_OPT 4 -#define GGML_DEFAULT_N_THREADS 4 - -#ifdef __ARM_NEON -// we use the built-in 16-bit float type -typedef __fp16 ggml_fp16_t; -#else -typedef uint16_t ggml_fp16_t; -#endif - -// convert FP16 <-> FP32 -float ggml_fp16_to_fp32(ggml_fp16_t x); -ggml_fp16_t ggml_fp32_to_fp16(float x); - -struct ggml_object; -struct ggml_context; - -enum ggml_type { - // explicitly numbered values are used in llama.cpp files - GGML_TYPE_F32 = 0, - GGML_TYPE_F16 = 1, - GGML_TYPE_Q4_0 = 2, - GGML_TYPE_Q4_1 = 3, - GGML_TYPE_Q4_2 = 4, - GGML_TYPE_Q4_3 = 5, - GGML_TYPE_Q8_0 = 6, - GGML_TYPE_I8, - GGML_TYPE_I16, - GGML_TYPE_I32, - GGML_TYPE_COUNT, -}; - -// available tensor operations: -enum ggml_op { - GGML_OP_NONE = 0, - - GGML_OP_DUP, - GGML_OP_ADD, - GGML_OP_SUB, - GGML_OP_MUL, - GGML_OP_DIV, - GGML_OP_SQR, - GGML_OP_SQRT, - GGML_OP_SUM, - GGML_OP_MEAN, - GGML_OP_REPEAT, - GGML_OP_ABS, - GGML_OP_SGN, - GGML_OP_NEG, - GGML_OP_STEP, - GGML_OP_RELU, - GGML_OP_GELU, - GGML_OP_SILU, - GGML_OP_NORM, // normalize - GGML_OP_RMS_NORM, - - GGML_OP_MUL_MAT, - - GGML_OP_SCALE, - GGML_OP_CPY, - GGML_OP_CONT, - GGML_OP_RESHAPE, - GGML_OP_VIEW, - GGML_OP_PERMUTE, - GGML_OP_TRANSPOSE, - GGML_OP_GET_ROWS, - GGML_OP_DIAG_MASK_INF, - GGML_OP_SOFT_MAX, - GGML_OP_ROPE, - GGML_OP_ALIBI, - GGML_OP_CONV_1D_1S, - GGML_OP_CONV_1D_2S, - - GGML_OP_FLASH_ATTN, - GGML_OP_FLASH_FF, - - GGML_OP_MAP_UNARY, - GGML_OP_MAP_BINARY, - - GGML_OP_COUNT, -}; - - -// ggml object -struct ggml_object { - size_t offs; - size_t size; - - struct ggml_object * next; - - char padding[8]; -}; - -static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); - -// n-dimensional tensor -struct ggml_tensor { - enum ggml_type type; - - int n_dims; - int64_t ne[GGML_MAX_DIMS]; // number of elements - size_t nb[GGML_MAX_DIMS]; // stride in bytes: - // nb[0] = sizeof(type) - // nb[1] = nb[0] * ne[0] + padding - // nb[i] = nb[i-1] * ne[i-1] - - // compute data - enum ggml_op op; - - bool is_param; - - struct ggml_tensor * grad; - struct ggml_tensor * src0; - struct ggml_tensor * src1; - struct ggml_tensor * opt[GGML_MAX_OPT]; - - // thread scheduling - int n_tasks; - - // performance - int perf_runs; - int64_t perf_cycles; - int64_t perf_time_us; - - void * data; - char padding[8]; -}; - -// computation graph -struct ggml_cgraph { - int n_nodes; - int n_leafs; - int n_threads; - - size_t work_size; - struct ggml_tensor * work; - - struct ggml_tensor * nodes[GGML_MAX_NODES]; - struct ggml_tensor * grads[GGML_MAX_NODES]; - struct ggml_tensor * leafs[GGML_MAX_NODES]; - - // performance - int perf_runs; - int64_t perf_cycles; - int64_t perf_time_us; -}; - -// scratch buffer -struct ggml_scratch { - size_t offs; - size_t size; - void * data; -}; - -struct ggml_init_params { - // memory pool - size_t mem_size; // bytes - void * mem_buffer; // if NULL, memory will be allocated internally - bool no_alloc; // don't allocate memory for the tensor data -}; - -void ggml_time_init(void); // 
call this once at the beginning of the program -int64_t ggml_time_ms(void); -int64_t ggml_time_us(void); -int64_t ggml_cycles(void); -int64_t ggml_cycles_per_ms(void); - -void ggml_print_object (const struct ggml_object * obj); -void ggml_print_objects(const struct ggml_context * ctx); - -int64_t ggml_nelements(const struct ggml_tensor * tensor); -size_t ggml_nbytes (const struct ggml_tensor * tensor); - -int ggml_blck_size (enum ggml_type type); -size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block -float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float - -const char * ggml_type_name(enum ggml_type type); - -size_t ggml_element_size(const struct ggml_tensor * tensor); - -bool ggml_is_quantized(enum ggml_type type); - -struct ggml_context * ggml_init(struct ggml_init_params params); -void ggml_free(struct ggml_context * ctx); - -size_t ggml_used_mem(const struct ggml_context * ctx); - -size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch); - -struct ggml_tensor * ggml_new_tensor( - struct ggml_context * ctx, - enum ggml_type type, - int n_dims, - const int64_t *ne); - -struct ggml_tensor * ggml_new_tensor_1d( - struct ggml_context * ctx, - enum ggml_type type, - int64_t ne0); - -struct ggml_tensor * ggml_new_tensor_2d( - struct ggml_context * ctx, - enum ggml_type type, - int64_t ne0, - int64_t ne1); - -struct ggml_tensor * ggml_new_tensor_3d( - struct ggml_context * ctx, - enum ggml_type type, - int64_t ne0, - int64_t ne1, - int64_t ne2); - -struct ggml_tensor * ggml_new_tensor_4d( - struct ggml_context * ctx, - enum ggml_type type, - int64_t ne0, - int64_t ne1, - int64_t ne2, - int64_t ne3); - -struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value); -struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value); - -struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); -struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src); - -struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); -struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); -struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); - -int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i); -void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value); - -float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i); -void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value); - - void * ggml_get_data (const struct ggml_tensor * tensor); -float * ggml_get_data_f32(const struct ggml_tensor * tensor); - -// -// operations on tensors with backpropagation -// - -struct ggml_tensor * ggml_dup( - struct ggml_context * ctx, - struct ggml_tensor * a); - -struct ggml_tensor * ggml_add( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - - -struct ggml_tensor * ggml_add_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - -struct ggml_tensor * ggml_sub( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - -struct ggml_tensor * ggml_mul( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - -struct ggml_tensor * ggml_div( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - -struct ggml_tensor * ggml_sqr( - struct ggml_context * ctx, - struct ggml_tensor * a); - 
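To make the row-major ne/nb layout from the overview above concrete, the following sketch writes one element of a 2x3 F32 tensor by computing its byte offset from the strides, again through the raw ggml-sys bindings. It is illustrative only and assumes the default contiguous allocation (no_alloc = false), in which case nb[0] should be 4 and nb[1] should be 8 for this tensor, and the flat index used by ggml_get_f32_1d is row * ne[0] + column:

    use ggml_sys::*;

    fn main() {
        unsafe {
            let ctx = ggml_init(ggml_init_params {
                mem_size: 1024 * 1024,
                mem_buffer: std::ptr::null_mut(),
                no_alloc: false,
            });

            // ne[0] = 2 (fastest-varying dimension), ne[1] = 3
            let a = ggml_new_tensor_2d(ctx, ggml_type_GGML_TYPE_F32, 2, 3);

            // a[1, 2] = 1.0, using the stride-based addressing from the header
            let offset = 2 * (*a).nb[1] + 1 * (*a).nb[0];
            *(((*a).data as *mut u8).add(offset) as *mut f32) = 1.0;

            // the same element read back through the 1D helper (flat index 5)
            assert_eq!(ggml_get_f32_1d(a, 2 * 2 + 1), 1.0);

            println!("elements: {}, bytes: {}", ggml_nelements(a), ggml_nbytes(a));
            ggml_free(ctx);
        }
    }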
-struct ggml_tensor * ggml_sqrt( - struct ggml_context * ctx, - struct ggml_tensor * a); - -// return scalar -// TODO: compute sum along rows -struct ggml_tensor * ggml_sum( - struct ggml_context * ctx, - struct ggml_tensor * a); - -// mean along rows -struct ggml_tensor * ggml_mean( - struct ggml_context * ctx, - struct ggml_tensor * a); - -// if a is the same shape as b, and a is not parameter, return a -// otherwise, return a new tensor: repeat(a) to fit in b -struct ggml_tensor * ggml_repeat( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - -struct ggml_tensor * ggml_abs( - struct ggml_context * ctx, - struct ggml_tensor * a); - -struct ggml_tensor * ggml_sgn( - struct ggml_context * ctx, - struct ggml_tensor * a); - -struct ggml_tensor * ggml_neg( - struct ggml_context * ctx, - struct ggml_tensor * a); - -struct ggml_tensor * ggml_step( - struct ggml_context * ctx, - struct ggml_tensor * a); - -struct ggml_tensor * ggml_relu( - struct ggml_context * ctx, - struct ggml_tensor * a); - -// TODO: double-check this computation is correct -struct ggml_tensor * ggml_gelu( - struct ggml_context * ctx, - struct ggml_tensor * a); - -struct ggml_tensor * ggml_silu( - struct ggml_context * ctx, - struct ggml_tensor * a); - -// normalize along rows -// TODO: eps is hardcoded to 1e-5 for now -struct ggml_tensor * ggml_norm( - struct ggml_context * ctx, - struct ggml_tensor * a); - -struct ggml_tensor * ggml_rms_norm( - struct ggml_context * ctx, - struct ggml_tensor * a); - -// A: m rows, n columns -// B: p rows, n columns (i.e. we transpose it internally) -// result is m columns, p rows -struct ggml_tensor * ggml_mul_mat( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - -// -// operations on tensors without backpropagation -// - -// in-place, returns view(a) -struct ggml_tensor * ggml_scale( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - -// a -> b, return view(b) -struct ggml_tensor * ggml_cpy( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - -// make contiguous -struct ggml_tensor * ggml_cont( - struct ggml_context * ctx, - struct ggml_tensor * a); - -// return view(a), b specifies the new shape -// TODO: when we start computing gradient, make a copy instead of view -struct ggml_tensor * ggml_reshape( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - -// return view(a) -// TODO: when we start computing gradient, make a copy instead of view -struct ggml_tensor * ggml_reshape_2d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int64_t ne0, - int64_t ne1); - -// return view(a) -// TODO: when we start computing gradient, make a copy instead of view -struct ggml_tensor * ggml_reshape_3d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int64_t ne0, - int64_t ne1, - int64_t ne2); - -// offset in bytes -struct ggml_tensor * ggml_view_1d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int64_t ne0, - size_t offset); - -struct ggml_tensor * ggml_view_2d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int64_t ne0, - int64_t ne1, - size_t nb1, // row stride in bytes - size_t offset); - -struct ggml_tensor * ggml_view_3d( - struct ggml_context * ctx, - struct ggml_tensor * a, - int64_t ne0, - int64_t ne1, - int64_t ne2, - size_t nb1, // row stride in bytes - size_t nb2, // slice stride in bytes - size_t offset); - -struct ggml_tensor * ggml_permute( - struct ggml_context * ctx, - struct ggml_tensor 
* a, - int axis0, - int axis1, - int axis2, - int axis3); - -// alias for ggml_permute(ctx, a, 1, 0, 2, 3) -struct ggml_tensor * ggml_transpose( - struct ggml_context * ctx, - struct ggml_tensor * a); - -struct ggml_tensor * ggml_get_rows( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - -// set elements above the diagonal to -INF -// in-place, returns view(a) -struct ggml_tensor * ggml_diag_mask_inf( - struct ggml_context * ctx, - struct ggml_tensor * a, - int n_past); - -// in-place, returns view(a) -struct ggml_tensor * ggml_soft_max( - struct ggml_context * ctx, - struct ggml_tensor * a); - -// rotary position embedding -// in-place, returns view(a) -// if mode & 1 == 1, skip n_past elements -// if mode & 2 == 1, GPT-NeoX style -// TODO: avoid creating a new tensor every time -struct ggml_tensor * ggml_rope( - struct ggml_context * ctx, - struct ggml_tensor * a, - int n_past, - int n_dims, - int mode); - -// alibi position embedding -// in-place, returns view(a) -struct ggml_tensor * ggml_alibi( - struct ggml_context * ctx, - struct ggml_tensor * a, - int n_past, - int n_head); - -// padding = 1 -// TODO: we don't support extra parameters for now -// that's why we are hard-coding the stride, padding, and dilation -// not great .. -struct ggml_tensor * ggml_conv_1d_1s( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - -struct ggml_tensor * ggml_conv_1d_2s( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b); - -struct ggml_tensor * ggml_flash_attn( - struct ggml_context * ctx, - struct ggml_tensor * q, - struct ggml_tensor * k, - struct ggml_tensor * v, - bool masked); - -struct ggml_tensor * ggml_flash_ff( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b0, - struct ggml_tensor * b1, - struct ggml_tensor * c0, - struct ggml_tensor * c1); - -// Mapping operations -typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *); -typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *); - -struct ggml_tensor * ggml_map_unary_f32( - struct ggml_context * ctx, - struct ggml_tensor * a, - const ggml_unary_op_f32_t fun); - -struct ggml_tensor * ggml_map_binary_f32( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - const ggml_binary_op_f32_t fun); - -// -// automatic differentiation -// - -void ggml_set_param( - struct ggml_context * ctx, - struct ggml_tensor * tensor); - -void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); - -struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); -struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); - -void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); -void ggml_graph_reset (struct ggml_cgraph * cgraph); - -// print info and performance information for the graph -void ggml_graph_print(const struct ggml_cgraph * cgraph); - -// dump the graph into a file using the dot format -void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename); - -// -// optimization -// - -// optimization methods -enum ggml_opt_type { - GGML_OPT_ADAM, - GGML_OPT_LBFGS, -}; - -// linesearch methods -enum ggml_linesearch { - GGML_LINESEARCH_DEFAULT = 1, - - GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0, - GGML_LINESEARCH_BACKTRACKING_WOLFE = 1, - GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2, -}; - -// optimization 
return values -enum ggml_opt_result { - GGML_OPT_OK = 0, - GGML_OPT_DID_NOT_CONVERGE, - GGML_OPT_NO_CONTEXT, - GGML_OPT_INVALID_WOLFE, - GGML_OPT_FAIL, - - GGML_LINESEARCH_FAIL = -128, - GGML_LINESEARCH_MINIMUM_STEP, - GGML_LINESEARCH_MAXIMUM_STEP, - GGML_LINESEARCH_MAXIMUM_ITERATIONS, - GGML_LINESEARCH_INVALID_PARAMETERS, -}; - -// optimization parameters -// -// see ggml.c (ggml_opt_default_params) for default values -// -struct ggml_opt_params { - enum ggml_opt_type type; - - int n_threads; - - // delta-based convergence test - // - // if past == 0 - disabled - // if past > 0: - // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|) - // - int past; - float delta; - - // maximum number of iterations without improvement - // - // if 0 - disabled - // if > 0: - // assume convergence if no cost improvement in this number of iterations - // - int max_no_improvement; - - bool print_forward_graph; - bool print_backward_graph; - - // ADAM parameters - struct { - int n_iter; - - float alpha; // learning rate - float beta1; - float beta2; - float eps; // epsilon for numerical stability - float eps_f; // epsilon for convergence test - float eps_g; // epsilon for convergence test - } adam; - - // LBFGS parameters - struct { - int m; // number of corrections to approximate the inv. Hessian - int n_iter; - int max_linesearch; - - float eps; // convergence tolerance - float ftol; // line search tolerance - float wolfe; - float min_step; - float max_step; - - enum ggml_linesearch linesearch; - } lbfgs; -}; - -struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type); - -// optimize the function defined by the tensor f -enum ggml_opt_result ggml_opt( - struct ggml_context * ctx, - struct ggml_opt_params params, - struct ggml_tensor * f); - -// -// quantization -// - -size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist); -size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist); - -size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); - -// -// system info -// - -int ggml_cpu_has_avx(void); -int ggml_cpu_has_avx2(void); -int ggml_cpu_has_avx512(void); -int ggml_cpu_has_avx512_vbmi(void); -int ggml_cpu_has_avx512_vnni(void); -int ggml_cpu_has_fma(void); -int ggml_cpu_has_neon(void); -int ggml_cpu_has_arm_fma(void); -int ggml_cpu_has_f16c(void); -int ggml_cpu_has_fp16_va(void); -int ggml_cpu_has_wasm_simd(void); -int ggml_cpu_has_blas(void); -int ggml_cpu_has_cublas(void); -int ggml_cpu_has_sse3(void); -int ggml_cpu_has_vsx(void); - - -// -// Internal types and functions exposed for tests and benchmarks -// - -#ifdef __cplusplus -// restrict not standard in C++ -#define GGML_RESTRICT -#else -#define GGML_RESTRICT restrict -#endif -typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); -typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); -typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); - -typedef struct { - dequantize_row_q_t dequantize_row_q; - quantize_row_q_t quantize_row_q; - quantize_row_q_t quantize_row_q_reference; - quantize_row_q_t quantize_row_q_dot; - vec_dot_q_t vec_dot_q; -} quantize_fns_t; - -quantize_fns_t 
ggml_internal_get_quantize_fn(size_t i); - -#ifdef __cplusplus -} -#endif diff --git a/ggml-sys/src/lib.rs b/ggml-sys/src/lib.rs deleted file mode 100644 index 76aa0673..00000000 --- a/ggml-sys/src/lib.rs +++ /dev/null @@ -1,1564 +0,0 @@ -/* automatically generated by rust-bindgen 0.64.0 */ - -#![allow(non_upper_case_globals)] -#![allow(non_camel_case_types)] -#![allow(non_snake_case)] -#![allow(unused)] - -pub const GGML_MAX_DIMS: u32 = 4; -pub const GGML_MAX_NODES: u32 = 4096; -pub const GGML_MAX_PARAMS: u32 = 16; -pub const GGML_MAX_CONTEXTS: u32 = 64; -pub const GGML_MAX_OPT: u32 = 4; -pub const GGML_DEFAULT_N_THREADS: u32 = 4; -pub type ggml_fp16_t = u16; -extern "C" { - pub fn ggml_fp16_to_fp32(x: ggml_fp16_t) -> f32; -} -extern "C" { - pub fn ggml_fp32_to_fp16(x: f32) -> ggml_fp16_t; -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct ggml_context { - _unused: [u8; 0], -} -pub const ggml_type_GGML_TYPE_F32: ggml_type = 0; -pub const ggml_type_GGML_TYPE_F16: ggml_type = 1; -pub const ggml_type_GGML_TYPE_Q4_0: ggml_type = 2; -pub const ggml_type_GGML_TYPE_Q4_1: ggml_type = 3; -pub const ggml_type_GGML_TYPE_Q4_2: ggml_type = 4; -pub const ggml_type_GGML_TYPE_Q4_3: ggml_type = 5; -pub const ggml_type_GGML_TYPE_Q8_0: ggml_type = 6; -pub const ggml_type_GGML_TYPE_I8: ggml_type = 7; -pub const ggml_type_GGML_TYPE_I16: ggml_type = 8; -pub const ggml_type_GGML_TYPE_I32: ggml_type = 9; -pub const ggml_type_GGML_TYPE_COUNT: ggml_type = 10; -pub type ggml_type = ::std::os::raw::c_uint; -pub const ggml_op_GGML_OP_NONE: ggml_op = 0; -pub const ggml_op_GGML_OP_DUP: ggml_op = 1; -pub const ggml_op_GGML_OP_ADD: ggml_op = 2; -pub const ggml_op_GGML_OP_SUB: ggml_op = 3; -pub const ggml_op_GGML_OP_MUL: ggml_op = 4; -pub const ggml_op_GGML_OP_DIV: ggml_op = 5; -pub const ggml_op_GGML_OP_SQR: ggml_op = 6; -pub const ggml_op_GGML_OP_SQRT: ggml_op = 7; -pub const ggml_op_GGML_OP_SUM: ggml_op = 8; -pub const ggml_op_GGML_OP_MEAN: ggml_op = 9; -pub const ggml_op_GGML_OP_REPEAT: ggml_op = 10; -pub const ggml_op_GGML_OP_ABS: ggml_op = 11; -pub const ggml_op_GGML_OP_SGN: ggml_op = 12; -pub const ggml_op_GGML_OP_NEG: ggml_op = 13; -pub const ggml_op_GGML_OP_STEP: ggml_op = 14; -pub const ggml_op_GGML_OP_RELU: ggml_op = 15; -pub const ggml_op_GGML_OP_GELU: ggml_op = 16; -pub const ggml_op_GGML_OP_SILU: ggml_op = 17; -pub const ggml_op_GGML_OP_NORM: ggml_op = 18; -pub const ggml_op_GGML_OP_RMS_NORM: ggml_op = 19; -pub const ggml_op_GGML_OP_MUL_MAT: ggml_op = 20; -pub const ggml_op_GGML_OP_SCALE: ggml_op = 21; -pub const ggml_op_GGML_OP_CPY: ggml_op = 22; -pub const ggml_op_GGML_OP_CONT: ggml_op = 23; -pub const ggml_op_GGML_OP_RESHAPE: ggml_op = 24; -pub const ggml_op_GGML_OP_VIEW: ggml_op = 25; -pub const ggml_op_GGML_OP_PERMUTE: ggml_op = 26; -pub const ggml_op_GGML_OP_TRANSPOSE: ggml_op = 27; -pub const ggml_op_GGML_OP_GET_ROWS: ggml_op = 28; -pub const ggml_op_GGML_OP_DIAG_MASK_INF: ggml_op = 29; -pub const ggml_op_GGML_OP_SOFT_MAX: ggml_op = 30; -pub const ggml_op_GGML_OP_ROPE: ggml_op = 31; -pub const ggml_op_GGML_OP_ALIBI: ggml_op = 32; -pub const ggml_op_GGML_OP_CONV_1D_1S: ggml_op = 33; -pub const ggml_op_GGML_OP_CONV_1D_2S: ggml_op = 34; -pub const ggml_op_GGML_OP_FLASH_ATTN: ggml_op = 35; -pub const ggml_op_GGML_OP_FLASH_FF: ggml_op = 36; -pub const ggml_op_GGML_OP_MAP_UNARY: ggml_op = 37; -pub const ggml_op_GGML_OP_MAP_BINARY: ggml_op = 38; -pub const ggml_op_GGML_OP_COUNT: ggml_op = 39; -pub type ggml_op = ::std::os::raw::c_uint; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct ggml_object 
{ - pub offs: usize, - pub size: usize, - pub next: *mut ggml_object, - pub padding: [::std::os::raw::c_char; 8usize], -} -#[test] -fn bindgen_test_layout_ggml_object() { - const UNINIT: ::std::mem::MaybeUninit = ::std::mem::MaybeUninit::uninit(); - let ptr = UNINIT.as_ptr(); - assert_eq!( - ::std::mem::size_of::(), - 32usize, - concat!("Size of: ", stringify!(ggml_object)) - ); - assert_eq!( - ::std::mem::align_of::(), - 8usize, - concat!("Alignment of ", stringify!(ggml_object)) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).offs) as usize - ptr as usize }, - 0usize, - concat!( - "Offset of field: ", - stringify!(ggml_object), - "::", - stringify!(offs) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).size) as usize - ptr as usize }, - 8usize, - concat!( - "Offset of field: ", - stringify!(ggml_object), - "::", - stringify!(size) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).next) as usize - ptr as usize }, - 16usize, - concat!( - "Offset of field: ", - stringify!(ggml_object), - "::", - stringify!(next) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).padding) as usize - ptr as usize }, - 24usize, - concat!( - "Offset of field: ", - stringify!(ggml_object), - "::", - stringify!(padding) - ) - ); -} -pub const GGML_OBJECT_SIZE: usize = 32; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct ggml_tensor { - pub type_: ggml_type, - pub n_dims: ::std::os::raw::c_int, - pub ne: [i64; 4usize], - pub nb: [usize; 4usize], - pub op: ggml_op, - pub is_param: bool, - pub grad: *mut ggml_tensor, - pub src0: *mut ggml_tensor, - pub src1: *mut ggml_tensor, - pub opt: [*mut ggml_tensor; 4usize], - pub n_tasks: ::std::os::raw::c_int, - pub perf_runs: ::std::os::raw::c_int, - pub perf_cycles: i64, - pub perf_time_us: i64, - pub data: *mut ::std::os::raw::c_void, - pub padding: [::std::os::raw::c_char; 8usize], -} -#[test] -fn bindgen_test_layout_ggml_tensor() { - const UNINIT: ::std::mem::MaybeUninit = ::std::mem::MaybeUninit::uninit(); - let ptr = UNINIT.as_ptr(); - assert_eq!( - ::std::mem::size_of::(), - 176usize, - concat!("Size of: ", stringify!(ggml_tensor)) - ); - assert_eq!( - ::std::mem::align_of::(), - 8usize, - concat!("Alignment of ", stringify!(ggml_tensor)) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).type_) as usize - ptr as usize }, - 0usize, - concat!( - "Offset of field: ", - stringify!(ggml_tensor), - "::", - stringify!(type_) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).n_dims) as usize - ptr as usize }, - 4usize, - concat!( - "Offset of field: ", - stringify!(ggml_tensor), - "::", - stringify!(n_dims) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).ne) as usize - ptr as usize }, - 8usize, - concat!( - "Offset of field: ", - stringify!(ggml_tensor), - "::", - stringify!(ne) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).nb) as usize - ptr as usize }, - 40usize, - concat!( - "Offset of field: ", - stringify!(ggml_tensor), - "::", - stringify!(nb) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).op) as usize - ptr as usize }, - 72usize, - concat!( - "Offset of field: ", - stringify!(ggml_tensor), - "::", - stringify!(op) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).is_param) as usize - ptr as usize }, - 76usize, - concat!( - "Offset of field: ", - stringify!(ggml_tensor), - "::", - stringify!(is_param) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).grad) as usize - ptr as usize }, - 80usize, - concat!( - "Offset of field: ", - 
stringify!(ggml_tensor), - "::", - stringify!(grad) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).src0) as usize - ptr as usize }, - 88usize, - concat!( - "Offset of field: ", - stringify!(ggml_tensor), - "::", - stringify!(src0) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).src1) as usize - ptr as usize }, - 96usize, - concat!( - "Offset of field: ", - stringify!(ggml_tensor), - "::", - stringify!(src1) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).opt) as usize - ptr as usize }, - 104usize, - concat!( - "Offset of field: ", - stringify!(ggml_tensor), - "::", - stringify!(opt) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).n_tasks) as usize - ptr as usize }, - 136usize, - concat!( - "Offset of field: ", - stringify!(ggml_tensor), - "::", - stringify!(n_tasks) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).perf_runs) as usize - ptr as usize }, - 140usize, - concat!( - "Offset of field: ", - stringify!(ggml_tensor), - "::", - stringify!(perf_runs) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).perf_cycles) as usize - ptr as usize }, - 144usize, - concat!( - "Offset of field: ", - stringify!(ggml_tensor), - "::", - stringify!(perf_cycles) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).perf_time_us) as usize - ptr as usize }, - 152usize, - concat!( - "Offset of field: ", - stringify!(ggml_tensor), - "::", - stringify!(perf_time_us) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).data) as usize - ptr as usize }, - 160usize, - concat!( - "Offset of field: ", - stringify!(ggml_tensor), - "::", - stringify!(data) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).padding) as usize - ptr as usize }, - 168usize, - concat!( - "Offset of field: ", - stringify!(ggml_tensor), - "::", - stringify!(padding) - ) - ); -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct ggml_cgraph { - pub n_nodes: ::std::os::raw::c_int, - pub n_leafs: ::std::os::raw::c_int, - pub n_threads: ::std::os::raw::c_int, - pub work_size: usize, - pub work: *mut ggml_tensor, - pub nodes: [*mut ggml_tensor; 4096usize], - pub grads: [*mut ggml_tensor; 4096usize], - pub leafs: [*mut ggml_tensor; 4096usize], - pub perf_runs: ::std::os::raw::c_int, - pub perf_cycles: i64, - pub perf_time_us: i64, -} -#[test] -fn bindgen_test_layout_ggml_cgraph() { - const UNINIT: ::std::mem::MaybeUninit = ::std::mem::MaybeUninit::uninit(); - let ptr = UNINIT.as_ptr(); - assert_eq!( - ::std::mem::size_of::(), - 98360usize, - concat!("Size of: ", stringify!(ggml_cgraph)) - ); - assert_eq!( - ::std::mem::align_of::(), - 8usize, - concat!("Alignment of ", stringify!(ggml_cgraph)) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).n_nodes) as usize - ptr as usize }, - 0usize, - concat!( - "Offset of field: ", - stringify!(ggml_cgraph), - "::", - stringify!(n_nodes) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).n_leafs) as usize - ptr as usize }, - 4usize, - concat!( - "Offset of field: ", - stringify!(ggml_cgraph), - "::", - stringify!(n_leafs) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).n_threads) as usize - ptr as usize }, - 8usize, - concat!( - "Offset of field: ", - stringify!(ggml_cgraph), - "::", - stringify!(n_threads) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).work_size) as usize - ptr as usize }, - 16usize, - concat!( - "Offset of field: ", - stringify!(ggml_cgraph), - "::", - stringify!(work_size) - ) - ); - assert_eq!( - unsafe { 
::std::ptr::addr_of!((*ptr).work) as usize - ptr as usize }, - 24usize, - concat!( - "Offset of field: ", - stringify!(ggml_cgraph), - "::", - stringify!(work) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).nodes) as usize - ptr as usize }, - 32usize, - concat!( - "Offset of field: ", - stringify!(ggml_cgraph), - "::", - stringify!(nodes) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).grads) as usize - ptr as usize }, - 32800usize, - concat!( - "Offset of field: ", - stringify!(ggml_cgraph), - "::", - stringify!(grads) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).leafs) as usize - ptr as usize }, - 65568usize, - concat!( - "Offset of field: ", - stringify!(ggml_cgraph), - "::", - stringify!(leafs) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).perf_runs) as usize - ptr as usize }, - 98336usize, - concat!( - "Offset of field: ", - stringify!(ggml_cgraph), - "::", - stringify!(perf_runs) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).perf_cycles) as usize - ptr as usize }, - 98344usize, - concat!( - "Offset of field: ", - stringify!(ggml_cgraph), - "::", - stringify!(perf_cycles) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).perf_time_us) as usize - ptr as usize }, - 98352usize, - concat!( - "Offset of field: ", - stringify!(ggml_cgraph), - "::", - stringify!(perf_time_us) - ) - ); -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct ggml_scratch { - pub offs: usize, - pub size: usize, - pub data: *mut ::std::os::raw::c_void, -} -#[test] -fn bindgen_test_layout_ggml_scratch() { - const UNINIT: ::std::mem::MaybeUninit = ::std::mem::MaybeUninit::uninit(); - let ptr = UNINIT.as_ptr(); - assert_eq!( - ::std::mem::size_of::(), - 24usize, - concat!("Size of: ", stringify!(ggml_scratch)) - ); - assert_eq!( - ::std::mem::align_of::(), - 8usize, - concat!("Alignment of ", stringify!(ggml_scratch)) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).offs) as usize - ptr as usize }, - 0usize, - concat!( - "Offset of field: ", - stringify!(ggml_scratch), - "::", - stringify!(offs) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).size) as usize - ptr as usize }, - 8usize, - concat!( - "Offset of field: ", - stringify!(ggml_scratch), - "::", - stringify!(size) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).data) as usize - ptr as usize }, - 16usize, - concat!( - "Offset of field: ", - stringify!(ggml_scratch), - "::", - stringify!(data) - ) - ); -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct ggml_init_params { - pub mem_size: usize, - pub mem_buffer: *mut ::std::os::raw::c_void, - pub no_alloc: bool, -} -#[test] -fn bindgen_test_layout_ggml_init_params() { - const UNINIT: ::std::mem::MaybeUninit = ::std::mem::MaybeUninit::uninit(); - let ptr = UNINIT.as_ptr(); - assert_eq!( - ::std::mem::size_of::(), - 24usize, - concat!("Size of: ", stringify!(ggml_init_params)) - ); - assert_eq!( - ::std::mem::align_of::(), - 8usize, - concat!("Alignment of ", stringify!(ggml_init_params)) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).mem_size) as usize - ptr as usize }, - 0usize, - concat!( - "Offset of field: ", - stringify!(ggml_init_params), - "::", - stringify!(mem_size) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).mem_buffer) as usize - ptr as usize }, - 8usize, - concat!( - "Offset of field: ", - stringify!(ggml_init_params), - "::", - stringify!(mem_buffer) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).no_alloc) as usize - ptr 
as usize }, - 16usize, - concat!( - "Offset of field: ", - stringify!(ggml_init_params), - "::", - stringify!(no_alloc) - ) - ); -} -extern "C" { - pub fn ggml_time_init(); -} -extern "C" { - pub fn ggml_time_ms() -> i64; -} -extern "C" { - pub fn ggml_time_us() -> i64; -} -extern "C" { - pub fn ggml_cycles() -> i64; -} -extern "C" { - pub fn ggml_cycles_per_ms() -> i64; -} -extern "C" { - pub fn ggml_print_object(obj: *const ggml_object); -} -extern "C" { - pub fn ggml_print_objects(ctx: *const ggml_context); -} -extern "C" { - pub fn ggml_nelements(tensor: *const ggml_tensor) -> i64; -} -extern "C" { - pub fn ggml_nbytes(tensor: *const ggml_tensor) -> usize; -} -extern "C" { - pub fn ggml_blck_size(type_: ggml_type) -> ::std::os::raw::c_int; -} -extern "C" { - pub fn ggml_type_size(type_: ggml_type) -> usize; -} -extern "C" { - pub fn ggml_type_sizef(type_: ggml_type) -> f32; -} -extern "C" { - pub fn ggml_type_name(type_: ggml_type) -> *const ::std::os::raw::c_char; -} -extern "C" { - pub fn ggml_element_size(tensor: *const ggml_tensor) -> usize; -} -extern "C" { - pub fn ggml_is_quantized(type_: ggml_type) -> bool; -} -extern "C" { - pub fn ggml_init(params: ggml_init_params) -> *mut ggml_context; -} -extern "C" { - pub fn ggml_free(ctx: *mut ggml_context); -} -extern "C" { - pub fn ggml_used_mem(ctx: *const ggml_context) -> usize; -} -extern "C" { - pub fn ggml_set_scratch(ctx: *mut ggml_context, scratch: ggml_scratch) -> usize; -} -extern "C" { - pub fn ggml_new_tensor( - ctx: *mut ggml_context, - type_: ggml_type, - n_dims: ::std::os::raw::c_int, - ne: *const i64, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_new_tensor_1d( - ctx: *mut ggml_context, - type_: ggml_type, - ne0: i64, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_new_tensor_2d( - ctx: *mut ggml_context, - type_: ggml_type, - ne0: i64, - ne1: i64, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_new_tensor_3d( - ctx: *mut ggml_context, - type_: ggml_type, - ne0: i64, - ne1: i64, - ne2: i64, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_new_tensor_4d( - ctx: *mut ggml_context, - type_: ggml_type, - ne0: i64, - ne1: i64, - ne2: i64, - ne3: i64, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_new_i32(ctx: *mut ggml_context, value: i32) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_new_f32(ctx: *mut ggml_context, value: f32) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_dup_tensor(ctx: *mut ggml_context, src: *const ggml_tensor) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_view_tensor(ctx: *mut ggml_context, src: *const ggml_tensor) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_set_zero(tensor: *mut ggml_tensor) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_set_i32(tensor: *mut ggml_tensor, value: i32) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_set_f32(tensor: *mut ggml_tensor, value: f32) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_get_i32_1d(tensor: *const ggml_tensor, i: ::std::os::raw::c_int) -> i32; -} -extern "C" { - pub fn ggml_set_i32_1d(tensor: *const ggml_tensor, i: ::std::os::raw::c_int, value: i32); -} -extern "C" { - pub fn ggml_get_f32_1d(tensor: *const ggml_tensor, i: ::std::os::raw::c_int) -> f32; -} -extern "C" { - pub fn ggml_set_f32_1d(tensor: *const ggml_tensor, i: ::std::os::raw::c_int, value: f32); -} -extern "C" { - pub fn ggml_get_data(tensor: *const ggml_tensor) -> *mut ::std::os::raw::c_void; -} -extern "C" { - pub fn ggml_get_data_f32(tensor: *const ggml_tensor) -> *mut f32; -} -extern "C" { - pub fn 
ggml_dup(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_add( - ctx: *mut ggml_context, - a: *mut ggml_tensor, - b: *mut ggml_tensor, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_add_inplace( - ctx: *mut ggml_context, - a: *mut ggml_tensor, - b: *mut ggml_tensor, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_sub( - ctx: *mut ggml_context, - a: *mut ggml_tensor, - b: *mut ggml_tensor, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_mul( - ctx: *mut ggml_context, - a: *mut ggml_tensor, - b: *mut ggml_tensor, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_div( - ctx: *mut ggml_context, - a: *mut ggml_tensor, - b: *mut ggml_tensor, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_sqr(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_sqrt(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_sum(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_mean(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_repeat( - ctx: *mut ggml_context, - a: *mut ggml_tensor, - b: *mut ggml_tensor, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_abs(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_sgn(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_neg(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_step(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_relu(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_gelu(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_silu(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_norm(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_rms_norm(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_mul_mat( - ctx: *mut ggml_context, - a: *mut ggml_tensor, - b: *mut ggml_tensor, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_scale( - ctx: *mut ggml_context, - a: *mut ggml_tensor, - b: *mut ggml_tensor, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_cpy( - ctx: *mut ggml_context, - a: *mut ggml_tensor, - b: *mut ggml_tensor, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_cont(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_reshape( - ctx: *mut ggml_context, - a: *mut ggml_tensor, - b: *mut ggml_tensor, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_reshape_2d( - ctx: *mut ggml_context, - a: *mut ggml_tensor, - ne0: i64, - ne1: i64, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_reshape_3d( - ctx: *mut ggml_context, - a: *mut ggml_tensor, - ne0: i64, - ne1: i64, - ne2: i64, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_view_1d( - ctx: *mut ggml_context, - a: *mut ggml_tensor, - ne0: i64, - offset: usize, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_view_2d( - ctx: *mut ggml_context, - a: *mut ggml_tensor, - ne0: i64, - ne1: i64, - nb1: usize, - offset: usize, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_view_3d( - ctx: *mut ggml_context, - a: *mut ggml_tensor, - ne0: 
i64, - ne1: i64, - ne2: i64, - nb1: usize, - nb2: usize, - offset: usize, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_permute( - ctx: *mut ggml_context, - a: *mut ggml_tensor, - axis0: ::std::os::raw::c_int, - axis1: ::std::os::raw::c_int, - axis2: ::std::os::raw::c_int, - axis3: ::std::os::raw::c_int, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_transpose(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_get_rows( - ctx: *mut ggml_context, - a: *mut ggml_tensor, - b: *mut ggml_tensor, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_diag_mask_inf( - ctx: *mut ggml_context, - a: *mut ggml_tensor, - n_past: ::std::os::raw::c_int, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_soft_max(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_rope( - ctx: *mut ggml_context, - a: *mut ggml_tensor, - n_past: ::std::os::raw::c_int, - n_dims: ::std::os::raw::c_int, - mode: ::std::os::raw::c_int, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_alibi( - ctx: *mut ggml_context, - a: *mut ggml_tensor, - n_past: ::std::os::raw::c_int, - n_head: ::std::os::raw::c_int, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_conv_1d_1s( - ctx: *mut ggml_context, - a: *mut ggml_tensor, - b: *mut ggml_tensor, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_conv_1d_2s( - ctx: *mut ggml_context, - a: *mut ggml_tensor, - b: *mut ggml_tensor, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_flash_attn( - ctx: *mut ggml_context, - q: *mut ggml_tensor, - k: *mut ggml_tensor, - v: *mut ggml_tensor, - masked: bool, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_flash_ff( - ctx: *mut ggml_context, - a: *mut ggml_tensor, - b0: *mut ggml_tensor, - b1: *mut ggml_tensor, - c0: *mut ggml_tensor, - c1: *mut ggml_tensor, - ) -> *mut ggml_tensor; -} -pub type ggml_unary_op_f32_t = ::std::option::Option< - unsafe extern "C" fn(arg1: ::std::os::raw::c_int, arg2: *mut f32, arg3: *const f32), ->; -pub type ggml_binary_op_f32_t = ::std::option::Option< - unsafe extern "C" fn( - arg1: ::std::os::raw::c_int, - arg2: *mut f32, - arg3: *const f32, - arg4: *const f32, - ), ->; -extern "C" { - pub fn ggml_map_unary_f32( - ctx: *mut ggml_context, - a: *mut ggml_tensor, - fun: ggml_unary_op_f32_t, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_map_binary_f32( - ctx: *mut ggml_context, - a: *mut ggml_tensor, - b: *mut ggml_tensor, - fun: ggml_binary_op_f32_t, - ) -> *mut ggml_tensor; -} -extern "C" { - pub fn ggml_set_param(ctx: *mut ggml_context, tensor: *mut ggml_tensor); -} -extern "C" { - pub fn ggml_build_forward_expand(cgraph: *mut ggml_cgraph, tensor: *mut ggml_tensor); -} -extern "C" { - pub fn ggml_build_forward(tensor: *mut ggml_tensor) -> ggml_cgraph; -} -extern "C" { - pub fn ggml_build_backward( - ctx: *mut ggml_context, - gf: *mut ggml_cgraph, - keep: bool, - ) -> ggml_cgraph; -} -extern "C" { - pub fn ggml_graph_compute(ctx: *mut ggml_context, cgraph: *mut ggml_cgraph); -} -extern "C" { - pub fn ggml_graph_reset(cgraph: *mut ggml_cgraph); -} -extern "C" { - pub fn ggml_graph_print(cgraph: *const ggml_cgraph); -} -extern "C" { - pub fn ggml_graph_dump_dot( - gb: *const ggml_cgraph, - gf: *const ggml_cgraph, - filename: *const ::std::os::raw::c_char, - ); -} -pub const ggml_opt_type_GGML_OPT_ADAM: ggml_opt_type = 0; -pub const ggml_opt_type_GGML_OPT_LBFGS: ggml_opt_type = 1; -pub type ggml_opt_type = ::std::os::raw::c_uint; -pub const 
ggml_linesearch_GGML_LINESEARCH_DEFAULT: ggml_linesearch = 1; -pub const ggml_linesearch_GGML_LINESEARCH_BACKTRACKING_ARMIJO: ggml_linesearch = 0; -pub const ggml_linesearch_GGML_LINESEARCH_BACKTRACKING_WOLFE: ggml_linesearch = 1; -pub const ggml_linesearch_GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE: ggml_linesearch = 2; -pub type ggml_linesearch = ::std::os::raw::c_uint; -pub const ggml_opt_result_GGML_OPT_OK: ggml_opt_result = 0; -pub const ggml_opt_result_GGML_OPT_DID_NOT_CONVERGE: ggml_opt_result = 1; -pub const ggml_opt_result_GGML_OPT_NO_CONTEXT: ggml_opt_result = 2; -pub const ggml_opt_result_GGML_OPT_INVALID_WOLFE: ggml_opt_result = 3; -pub const ggml_opt_result_GGML_OPT_FAIL: ggml_opt_result = 4; -pub const ggml_opt_result_GGML_LINESEARCH_FAIL: ggml_opt_result = -128; -pub const ggml_opt_result_GGML_LINESEARCH_MINIMUM_STEP: ggml_opt_result = -127; -pub const ggml_opt_result_GGML_LINESEARCH_MAXIMUM_STEP: ggml_opt_result = -126; -pub const ggml_opt_result_GGML_LINESEARCH_MAXIMUM_ITERATIONS: ggml_opt_result = -125; -pub const ggml_opt_result_GGML_LINESEARCH_INVALID_PARAMETERS: ggml_opt_result = -124; -pub type ggml_opt_result = ::std::os::raw::c_int; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct ggml_opt_params { - pub type_: ggml_opt_type, - pub n_threads: ::std::os::raw::c_int, - pub past: ::std::os::raw::c_int, - pub delta: f32, - pub max_no_improvement: ::std::os::raw::c_int, - pub print_forward_graph: bool, - pub print_backward_graph: bool, - pub adam: ggml_opt_params__bindgen_ty_1, - pub lbfgs: ggml_opt_params__bindgen_ty_2, -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct ggml_opt_params__bindgen_ty_1 { - pub n_iter: ::std::os::raw::c_int, - pub alpha: f32, - pub beta1: f32, - pub beta2: f32, - pub eps: f32, - pub eps_f: f32, - pub eps_g: f32, -} -#[test] -fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() { - const UNINIT: ::std::mem::MaybeUninit = - ::std::mem::MaybeUninit::uninit(); - let ptr = UNINIT.as_ptr(); - assert_eq!( - ::std::mem::size_of::(), - 28usize, - concat!("Size of: ", stringify!(ggml_opt_params__bindgen_ty_1)) - ); - assert_eq!( - ::std::mem::align_of::(), - 4usize, - concat!("Alignment of ", stringify!(ggml_opt_params__bindgen_ty_1)) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).n_iter) as usize - ptr as usize }, - 0usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_params__bindgen_ty_1), - "::", - stringify!(n_iter) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).alpha) as usize - ptr as usize }, - 4usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_params__bindgen_ty_1), - "::", - stringify!(alpha) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).beta1) as usize - ptr as usize }, - 8usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_params__bindgen_ty_1), - "::", - stringify!(beta1) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).beta2) as usize - ptr as usize }, - 12usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_params__bindgen_ty_1), - "::", - stringify!(beta2) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).eps) as usize - ptr as usize }, - 16usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_params__bindgen_ty_1), - "::", - stringify!(eps) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).eps_f) as usize - ptr as usize }, - 20usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_params__bindgen_ty_1), - "::", - stringify!(eps_f) - ) - ); - assert_eq!( - unsafe { 
::std::ptr::addr_of!((*ptr).eps_g) as usize - ptr as usize }, - 24usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_params__bindgen_ty_1), - "::", - stringify!(eps_g) - ) - ); -} -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct ggml_opt_params__bindgen_ty_2 { - pub m: ::std::os::raw::c_int, - pub n_iter: ::std::os::raw::c_int, - pub max_linesearch: ::std::os::raw::c_int, - pub eps: f32, - pub ftol: f32, - pub wolfe: f32, - pub min_step: f32, - pub max_step: f32, - pub linesearch: ggml_linesearch, -} -#[test] -fn bindgen_test_layout_ggml_opt_params__bindgen_ty_2() { - const UNINIT: ::std::mem::MaybeUninit = - ::std::mem::MaybeUninit::uninit(); - let ptr = UNINIT.as_ptr(); - assert_eq!( - ::std::mem::size_of::(), - 36usize, - concat!("Size of: ", stringify!(ggml_opt_params__bindgen_ty_2)) - ); - assert_eq!( - ::std::mem::align_of::(), - 4usize, - concat!("Alignment of ", stringify!(ggml_opt_params__bindgen_ty_2)) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).m) as usize - ptr as usize }, - 0usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_params__bindgen_ty_2), - "::", - stringify!(m) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).n_iter) as usize - ptr as usize }, - 4usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_params__bindgen_ty_2), - "::", - stringify!(n_iter) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).max_linesearch) as usize - ptr as usize }, - 8usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_params__bindgen_ty_2), - "::", - stringify!(max_linesearch) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).eps) as usize - ptr as usize }, - 12usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_params__bindgen_ty_2), - "::", - stringify!(eps) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).ftol) as usize - ptr as usize }, - 16usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_params__bindgen_ty_2), - "::", - stringify!(ftol) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).wolfe) as usize - ptr as usize }, - 20usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_params__bindgen_ty_2), - "::", - stringify!(wolfe) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).min_step) as usize - ptr as usize }, - 24usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_params__bindgen_ty_2), - "::", - stringify!(min_step) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).max_step) as usize - ptr as usize }, - 28usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_params__bindgen_ty_2), - "::", - stringify!(max_step) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).linesearch) as usize - ptr as usize }, - 32usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_params__bindgen_ty_2), - "::", - stringify!(linesearch) - ) - ); -} -#[test] -fn bindgen_test_layout_ggml_opt_params() { - const UNINIT: ::std::mem::MaybeUninit = ::std::mem::MaybeUninit::uninit(); - let ptr = UNINIT.as_ptr(); - assert_eq!( - ::std::mem::size_of::(), - 88usize, - concat!("Size of: ", stringify!(ggml_opt_params)) - ); - assert_eq!( - ::std::mem::align_of::(), - 4usize, - concat!("Alignment of ", stringify!(ggml_opt_params)) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).type_) as usize - ptr as usize }, - 0usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_params), - "::", - stringify!(type_) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).n_threads) as 
usize - ptr as usize }, - 4usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_params), - "::", - stringify!(n_threads) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).past) as usize - ptr as usize }, - 8usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_params), - "::", - stringify!(past) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).delta) as usize - ptr as usize }, - 12usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_params), - "::", - stringify!(delta) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).max_no_improvement) as usize - ptr as usize }, - 16usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_params), - "::", - stringify!(max_no_improvement) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).print_forward_graph) as usize - ptr as usize }, - 20usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_params), - "::", - stringify!(print_forward_graph) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).print_backward_graph) as usize - ptr as usize }, - 21usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_params), - "::", - stringify!(print_backward_graph) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).adam) as usize - ptr as usize }, - 24usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_params), - "::", - stringify!(adam) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).lbfgs) as usize - ptr as usize }, - 52usize, - concat!( - "Offset of field: ", - stringify!(ggml_opt_params), - "::", - stringify!(lbfgs) - ) - ); -} -extern "C" { - pub fn ggml_opt_default_params(type_: ggml_opt_type) -> ggml_opt_params; -} -extern "C" { - pub fn ggml_opt( - ctx: *mut ggml_context, - params: ggml_opt_params, - f: *mut ggml_tensor, - ) -> ggml_opt_result; -} -extern "C" { - pub fn ggml_quantize_q4_0( - src: *const f32, - dst: *mut ::std::os::raw::c_void, - n: ::std::os::raw::c_int, - k: ::std::os::raw::c_int, - hist: *mut i64, - ) -> usize; -} -extern "C" { - pub fn ggml_quantize_q4_1( - src: *const f32, - dst: *mut ::std::os::raw::c_void, - n: ::std::os::raw::c_int, - k: ::std::os::raw::c_int, - hist: *mut i64, - ) -> usize; -} -extern "C" { - pub fn ggml_quantize_q4_2( - src: *const f32, - dst: *mut ::std::os::raw::c_void, - n: ::std::os::raw::c_int, - k: ::std::os::raw::c_int, - hist: *mut i64, - ) -> usize; -} -extern "C" { - pub fn ggml_quantize_q4_3( - src: *const f32, - dst: *mut ::std::os::raw::c_void, - n: ::std::os::raw::c_int, - k: ::std::os::raw::c_int, - hist: *mut i64, - ) -> usize; -} -extern "C" { - pub fn ggml_quantize_chunk( - type_: ggml_type, - src: *const f32, - dst: *mut ::std::os::raw::c_void, - start: ::std::os::raw::c_int, - n: ::std::os::raw::c_int, - hist: *mut i64, - ) -> usize; -} -extern "C" { - pub fn ggml_cpu_has_avx() -> ::std::os::raw::c_int; -} -extern "C" { - pub fn ggml_cpu_has_avx2() -> ::std::os::raw::c_int; -} -extern "C" { - pub fn ggml_cpu_has_avx512() -> ::std::os::raw::c_int; -} -extern "C" { - pub fn ggml_cpu_has_avx512_vbmi() -> ::std::os::raw::c_int; -} -extern "C" { - pub fn ggml_cpu_has_avx512_vnni() -> ::std::os::raw::c_int; -} -extern "C" { - pub fn ggml_cpu_has_fma() -> ::std::os::raw::c_int; -} -extern "C" { - pub fn ggml_cpu_has_neon() -> ::std::os::raw::c_int; -} -extern "C" { - pub fn ggml_cpu_has_arm_fma() -> ::std::os::raw::c_int; -} -extern "C" { - pub fn ggml_cpu_has_f16c() -> ::std::os::raw::c_int; -} -extern "C" { - pub fn ggml_cpu_has_fp16_va() -> 
::std::os::raw::c_int; -} -extern "C" { - pub fn ggml_cpu_has_wasm_simd() -> ::std::os::raw::c_int; -} -extern "C" { - pub fn ggml_cpu_has_blas() -> ::std::os::raw::c_int; -} -extern "C" { - pub fn ggml_cpu_has_cublas() -> ::std::os::raw::c_int; -} -extern "C" { - pub fn ggml_cpu_has_sse3() -> ::std::os::raw::c_int; -} -extern "C" { - pub fn ggml_cpu_has_vsx() -> ::std::os::raw::c_int; -} -pub type dequantize_row_q_t = ::std::option::Option< - unsafe extern "C" fn(x: *const ::std::os::raw::c_void, y: *mut f32, k: ::std::os::raw::c_int), ->; -pub type quantize_row_q_t = ::std::option::Option< - unsafe extern "C" fn(x: *const f32, y: *mut ::std::os::raw::c_void, k: ::std::os::raw::c_int), ->; -pub type vec_dot_q_t = ::std::option::Option< - unsafe extern "C" fn( - n: ::std::os::raw::c_int, - s: *mut f32, - x: *const ::std::os::raw::c_void, - y: *const ::std::os::raw::c_void, - ), ->; -#[repr(C)] -#[derive(Debug, Copy, Clone)] -pub struct quantize_fns_t { - pub dequantize_row_q: dequantize_row_q_t, - pub quantize_row_q: quantize_row_q_t, - pub quantize_row_q_reference: quantize_row_q_t, - pub quantize_row_q_dot: quantize_row_q_t, - pub vec_dot_q: vec_dot_q_t, -} -#[test] -fn bindgen_test_layout_quantize_fns_t() { - const UNINIT: ::std::mem::MaybeUninit = ::std::mem::MaybeUninit::uninit(); - let ptr = UNINIT.as_ptr(); - assert_eq!( - ::std::mem::size_of::(), - 40usize, - concat!("Size of: ", stringify!(quantize_fns_t)) - ); - assert_eq!( - ::std::mem::align_of::(), - 8usize, - concat!("Alignment of ", stringify!(quantize_fns_t)) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).dequantize_row_q) as usize - ptr as usize }, - 0usize, - concat!( - "Offset of field: ", - stringify!(quantize_fns_t), - "::", - stringify!(dequantize_row_q) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).quantize_row_q) as usize - ptr as usize }, - 8usize, - concat!( - "Offset of field: ", - stringify!(quantize_fns_t), - "::", - stringify!(quantize_row_q) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).quantize_row_q_reference) as usize - ptr as usize }, - 16usize, - concat!( - "Offset of field: ", - stringify!(quantize_fns_t), - "::", - stringify!(quantize_row_q_reference) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).quantize_row_q_dot) as usize - ptr as usize }, - 24usize, - concat!( - "Offset of field: ", - stringify!(quantize_fns_t), - "::", - stringify!(quantize_row_q_dot) - ) - ); - assert_eq!( - unsafe { ::std::ptr::addr_of!((*ptr).vec_dot_q) as usize - ptr as usize }, - 32usize, - concat!( - "Offset of field: ", - stringify!(quantize_fns_t), - "::", - stringify!(vec_dot_q) - ) - ); -} -extern "C" { - pub fn ggml_internal_get_quantize_fn(i: usize) -> quantize_fns_t; -} diff --git a/ggml/Cargo.toml b/ggml/Cargo.toml deleted file mode 100644 index 93c3c1c4..00000000 --- a/ggml/Cargo.toml +++ /dev/null @@ -1,14 +0,0 @@ -[package] -name = "ggml" -version = { workspace = true } -edition = "2021" - -[dependencies] -ggml-sys = { path = "../ggml-sys" } - -log = { workspace = true } - -thiserror = "1.0" - -[dev-dependencies] -llm-base = { path = "../llm-base" } diff --git a/ggml/src/lib.rs b/ggml/src/lib.rs deleted file mode 100644 index d84abc86..00000000 --- a/ggml/src/lib.rs +++ /dev/null @@ -1,758 +0,0 @@ -#![deny(missing_docs)] - -//! `ggml` is a semi-idiomatic wrapper for the `ggml` C library. -//! -//! It exposes a subset of operations (currently used to implement the [llama-rs](https://crates.io/crates/llama-rs) library). -//! 
Note that it does not expose a fully-idiomatic safe Rust interface; operations that could be potentially unsafe are marked as such. -//! -//! `ggml` operates on a computational graph; no values will be computed until [Context::graph_compute] is executed. -//! All [Tensor]s are nodes in this computational graph, and values cannot be retrieved until computation is completed. - -use std::{ - os::raw::{c_int, c_void}, - ptr::NonNull, - sync::{Arc, Weak}, -}; - -pub use ggml_sys; - -/// Magic constant for `ggml` files (versioned, ggmf). -pub const FILE_MAGIC_GGMF: u32 = 0x67676d66; -/// Magic constant for `ggml` files (versioned, ggjt). -pub const FILE_MAGIC_GGJT: u32 = 0x67676a74; -/// Magic constant for `ggml` files (unversioned). -pub const FILE_MAGIC_UNVERSIONED: u32 = 0x67676d6c; - -/// The currently-supported format version for `ggml` files. -pub const FORMAT_VERSION: u32 = 1; - -/// The size of a `ggml` object. -pub const OBJECT_SIZE: usize = ggml_sys::GGML_OBJECT_SIZE; - -#[derive(Debug, Copy, Clone, PartialEq, Eq, Default)] -/// The type of a value in `ggml`. -pub enum Type { - /// Quantized 4-bit (type 0). - #[default] - Q4_0, - /// Quantized 4-bit (type 1); used by GPTQ. - Q4_1, - /// Quantized 4-bit (type 2). - Q4_2, - /// Quantized 4-bit (type 3). - Q4_3, - /// Quantized 8-bit (type 0). - Q8_0, - /// Integer 32-bit. - I32, - /// Float 16-bit. - F16, - /// Float 32-bit. - F32, -} -impl From for ggml_sys::ggml_type { - fn from(t: Type) -> Self { - match t { - Type::Q4_0 => ggml_sys::ggml_type_GGML_TYPE_Q4_0, - Type::Q4_1 => ggml_sys::ggml_type_GGML_TYPE_Q4_1, - Type::Q4_2 => ggml_sys::ggml_type_GGML_TYPE_Q4_2, - Type::Q4_3 => ggml_sys::ggml_type_GGML_TYPE_Q4_3, - Type::Q8_0 => ggml_sys::ggml_type_GGML_TYPE_Q8_0, - Type::I32 => ggml_sys::ggml_type_GGML_TYPE_I32, - Type::F16 => ggml_sys::ggml_type_GGML_TYPE_F16, - Type::F32 => ggml_sys::ggml_type_GGML_TYPE_F32, - } - } -} -impl TryFrom for Type { - type Error = (); - fn try_from(t: ggml_sys::ggml_type) -> Result { - match t { - ggml_sys::ggml_type_GGML_TYPE_Q4_0 => Ok(Type::Q4_0), - ggml_sys::ggml_type_GGML_TYPE_Q4_1 => Ok(Type::Q4_1), - ggml_sys::ggml_type_GGML_TYPE_Q4_2 => Ok(Type::Q4_2), - ggml_sys::ggml_type_GGML_TYPE_Q4_3 => Ok(Type::Q4_3), - ggml_sys::ggml_type_GGML_TYPE_Q8_0 => Ok(Type::Q8_0), - ggml_sys::ggml_type_GGML_TYPE_I32 => Ok(Type::I32), - ggml_sys::ggml_type_GGML_TYPE_F16 => Ok(Type::F16), - ggml_sys::ggml_type_GGML_TYPE_F32 => Ok(Type::F32), - _ => Err(()), - } - } -} -impl std::fmt::Display for Type { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Type::Q4_0 => write!(f, "q4_0"), - Type::Q4_1 => write!(f, "q4_1"), - Type::Q4_2 => write!(f, "q4_2"), - Type::Q4_3 => write!(f, "q4_3"), - Type::Q8_0 => write!(f, "q8_0"), - Type::I32 => write!(f, "i32"), - Type::F16 => write!(f, "f16"), - Type::F32 => write!(f, "f32"), - } - } -} - -/// Acts as a RAII-guard over a `ggml_sys::ggml_context`, allocating via -/// `ggml_init` and dropping via `ggml_free`. -pub struct Context { - /// An `Arc` is used to model the relation between the context and the - /// allocated tensors. Tensors are owned by the object, so a [`Tensor`] - /// contains a `Weak` reference underneath and doesn't let you do anything - /// with it if the underlying context has been deallocated. - ptr: Arc>, -} -impl Context { - /// Creates a new [Context] with the specified `mem_size` as a working area. 
- pub fn init(mem_size: usize, alloc: bool) -> Self { - let raw = unsafe { - ggml_sys::ggml_init(ggml_sys::ggml_init_params { - mem_size, - // Null here means we want ggml to own this memory. We don't - // support passing an owned buffer from the Rust side. - mem_buffer: std::ptr::null_mut(), - no_alloc: !alloc, - }) - }; - Self { - ptr: Arc::new(NonNull::new(raw).expect("Should not be null")), - } - } - - /// Wraps a raw tensor with a weak pointer to the context. - fn new_tensor_raw(&self, raw: *mut ggml_sys::ggml_tensor) -> Tensor { - Tensor { - ptr: NonNull::new(raw).expect("Should not be null"), - ctx: Arc::downgrade(&self.ptr), - } - } - - /// Creates a new 1D tensor. - pub fn new_tensor_1d(&self, typ: Type, ne0: usize) -> Tensor { - let raw = unsafe { - ggml_sys::ggml_new_tensor_1d(self.ptr.as_ptr(), typ.into(), usize_to_i64(ne0)) - }; - self.new_tensor_raw(raw) - } - - /// Creates a new 2D tensor. - pub fn new_tensor_2d(&self, typ: Type, ne0: usize, ne1: usize) -> Tensor { - let raw = unsafe { - ggml_sys::ggml_new_tensor_2d( - self.ptr.as_ptr(), - typ.into(), - usize_to_i64(ne0), - usize_to_i64(ne1), - ) - }; - self.new_tensor_raw(raw) - } - - /// Creates a new 3D tensor. - pub fn new_tensor_3d(&self, typ: Type, ne0: usize, ne1: usize, ne2: usize) -> Tensor { - let raw = unsafe { - ggml_sys::ggml_new_tensor_3d( - self.ptr.as_ptr(), - typ.into(), - usize_to_i64(ne0), - usize_to_i64(ne1), - usize_to_i64(ne2), - ) - }; - self.new_tensor_raw(raw) - } - - /// Creates a new 1D tensor with the specified value. - pub fn new_f32(&self, x: f32) -> Tensor { - let raw = unsafe { ggml_sys::ggml_new_f32(self.ptr.as_ptr(), x) }; - self.new_tensor_raw(raw) - } - - /// Unknown, aside from the obvious. It's transposing something! - pub fn op_transpose(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { ggml_sys::ggml_transpose(self.ptr.as_ptr(), a.ptr.as_ptr()) }; - self.new_tensor_raw(tensor) - } - - /// Unknown. - pub fn op_get_rows(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = - unsafe { ggml_sys::ggml_get_rows(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; - self.new_tensor_raw(tensor) - } - - /// Creates a new tensor with the values of `a`, but normalized. - pub fn op_norm(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { ggml_sys::ggml_norm(self.ptr.as_ptr(), a.ptr.as_ptr()) }; - self.new_tensor_raw(tensor) - } - - /// Creates a new tensor with the values of `a`, but normalized using RMSNorm. - pub fn op_rms_norm(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { ggml_sys::ggml_rms_norm(self.ptr.as_ptr(), a.ptr.as_ptr()) }; - self.new_tensor_raw(tensor) - } - - /// Creates a new tensor with the multiplication of `a` and `b`. - pub fn op_mul(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = - unsafe { ggml_sys::ggml_mul(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; - self.new_tensor_raw(tensor) - } - - /// Unknown. - pub fn op_repeat(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = - unsafe { ggml_sys::ggml_repeat(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; - self.new_tensor_raw(tensor) - } - - /// Creates a new tensor with the multiplication of `a` and `b` as if they were matrices. - /// - /// `a`: m rows, n columns - /// - /// `b`: p rows, n columns (i.e. 
we transpose it internally) - /// - /// Result is m columns, p rows - pub fn op_mul_mat(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = - unsafe { ggml_sys::ggml_mul_mat(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; - self.new_tensor_raw(tensor) - } - - /// Creates a new tensor with the addition of `a` and `b`. - pub fn op_add(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = - unsafe { ggml_sys::ggml_add(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; - self.new_tensor_raw(tensor) - } - - /// Creates a new tensor with the [SiLU](https://pytorch.org/docs/stable/generated/torch.nn.SiLU.html) activation function applied to `a`. - pub fn op_silu(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { ggml_sys::ggml_silu(self.ptr.as_ptr(), a.ptr.as_ptr()) }; - self.new_tensor_raw(tensor) - } - - /// In-place, scales `a` by the 1D tensor `b`. - pub fn op_scale(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = - unsafe { ggml_sys::ggml_scale(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; - self.new_tensor_raw(tensor) - } - - /// In-place, sets the elements above the diagonal to -INF. - pub fn op_diag_mask_inf(&self, a: &Tensor, n_past: usize) -> Tensor { - let tensor = unsafe { - ggml_sys::ggml_diag_mask_inf(self.ptr.as_ptr(), a.ptr.as_ptr(), usize_to_i32(n_past)) - }; - self.new_tensor_raw(tensor) - } - - /// In-place, applies the [Softmax function](https://en.wikipedia.org/wiki/Softmax_function) to `a`. - pub fn op_soft_max(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { ggml_sys::ggml_soft_max(self.ptr.as_ptr(), a.ptr.as_ptr()) }; - self.new_tensor_raw(tensor) - } - - /// Creates a new tensor with result of mapping `fun` with `a`. - /// - /// `cnt` is the number of `f32` elements to be mapped. - /// `src` is source for elements to be mapped. - /// `dst` is the destination for mapped elements. - /// - /// # Safety - /// - /// This is marked unsafe since we're passing pointers into C code, and not - /// only vanilla pointers but a pointer to a function. For obvious reasons, it's - /// important not to do anything crazy like mutate any of these values concurrently. - /// - /// Don't make assumptions about how/when the function will be called. It may be called - /// on a row, it may be called on a whole tensor. It may be called concurrently or not. - /// Once you give that function pointer to C land, all bets are off. - pub unsafe fn op_map_unary( - &self, - a: &Tensor, - fun: unsafe extern "C" fn(cnt: c_int, dst: *mut f32, src: *const f32), - ) -> Tensor { - let tensor = - unsafe { ggml_sys::ggml_map_unary_f32(self.ptr.as_ptr(), a.ptr.as_ptr(), Some(fun)) }; - self.new_tensor_raw(tensor) - } - - /// Creates a new tensor with result of mapping `fun` with `a` and `b`. - /// - /// `cnt` is the number of `f32` elements to be mapped. - /// `src0`, `src1` are the sources of elements to be mapped. - /// `dst` is the destination for mapped elements. - /// - /// # Safety - /// - /// This is marked unsafe since we're passing pointers into C code, and not - /// only vanilla pointers but a pointer to a function. For obvious reasons, it's - /// important not to do anything crazy like mutate any of these values concurrently. - /// - /// Don't make assumptions about how/when the function will be called. It may be called - /// on a row, it may be called on a whole tensor. It may be called concurrently or not. - /// Once you give that function pointer to C land, all bets are off. 
- pub unsafe fn op_map_binary( - &self, - a: &Tensor, - b: &Tensor, - fun: unsafe extern "C" fn(cnt: c_int, dst: *mut f32, src0: *const f32, src1: *const f32), - ) -> Tensor { - let tensor = unsafe { - ggml_sys::ggml_map_binary_f32( - self.ptr.as_ptr(), - a.ptr.as_ptr(), - b.ptr.as_ptr(), - Some(fun), - ) - }; - self.new_tensor_raw(tensor) - } - - /// Creates a 1D view over `a`. - pub fn op_view_1d(&self, a: &Tensor, ne0: usize, offset: usize) -> Tensor { - let tensor = unsafe { - ggml_sys::ggml_view_1d(self.ptr.as_ptr(), a.ptr.as_ptr(), usize_to_i64(ne0), offset) - }; - self.new_tensor_raw(tensor) - } - - /// Creates a 2D view over `a`. - pub fn op_view_2d(&self, a: &Tensor, ne: (usize, usize), nb1: usize, offset: usize) -> Tensor { - let (ne0, ne1) = ne; - let tensor = unsafe { - ggml_sys::ggml_view_2d( - self.ptr.as_ptr(), - a.ptr.as_ptr(), - usize_to_i64(ne0), - usize_to_i64(ne1), - nb1, - offset, - ) - }; - self.new_tensor_raw(tensor) - } - - /// Creates a 3d view over `a`. - pub fn op_view_3d( - &self, - a: &Tensor, - ne: (usize, usize, usize), - nb: (usize, usize), - offset: usize, - ) -> Tensor { - let (ne0, ne1, ne2) = ne; - let (nb1, nb2) = nb; - let tensor = unsafe { - ggml_sys::ggml_view_3d( - self.ptr.as_ptr(), - a.ptr.as_ptr(), - usize_to_i64(ne0), - usize_to_i64(ne1), - usize_to_i64(ne2), - nb1, - nb2, - offset, - ) - }; - self.new_tensor_raw(tensor) - } - - /// Copies `a` to `b` and returns `b`. - pub fn op_cpy(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = - unsafe { ggml_sys::ggml_cpy(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; - self.new_tensor_raw(tensor) - } - - /// Creates a new tensor with the axes of `a` permuted as described by the parameters. - pub fn op_permute( - &self, - a: &Tensor, - axis0: usize, - axis1: usize, - axis2: usize, - axis3: usize, - ) -> Tensor { - let tensor = unsafe { - ggml_sys::ggml_permute( - self.ptr.as_ptr(), - a.ptr.as_ptr(), - usize_to_i32(axis0), - usize_to_i32(axis1), - usize_to_i32(axis2), - usize_to_i32(axis3), - ) - }; - self.new_tensor_raw(tensor) - } - - /// In-place; reshapes `a` in accordance with the dimensions of `b` - pub fn op_reshape(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = - unsafe { ggml_sys::ggml_reshape(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; - self.new_tensor_raw(tensor) - } - - /// In-place; reshapes `a` in accordance with the specified dimensions. - pub fn op_reshape_2d(&self, a: &Tensor, ne0: usize, ne1: usize) -> Tensor { - let tensor = unsafe { - ggml_sys::ggml_reshape_2d( - self.ptr.as_ptr(), - a.ptr.as_ptr(), - usize_to_i64(ne0), - usize_to_i64(ne1), - ) - }; - self.new_tensor_raw(tensor) - } - - /// In-place; reshapes `a` in accordance with the specified dimensions. - pub fn op_reshape_3d(&self, a: &Tensor, ne0: usize, ne1: usize, ne2: usize) -> Tensor { - let tensor = unsafe { - ggml_sys::ggml_reshape_3d( - self.ptr.as_ptr(), - a.ptr.as_ptr(), - usize_to_i64(ne0), - usize_to_i64(ne1), - usize_to_i64(ne2), - ) - }; - self.new_tensor_raw(tensor) - } - - /// In-place; applies ROtary Positional Encoding. - pub fn op_rope(&self, a: &Tensor, npast: usize, ndims: usize, mode: i32) -> Tensor { - let tensor = unsafe { - ggml_sys::ggml_rope( - self.ptr.as_ptr(), - a.ptr.as_ptr(), - usize_to_i32(npast), - usize_to_i32(ndims), - mode, - ) - }; - self.new_tensor_raw(tensor) - } - - /// Computes the specified graph. Must be run in order to evaluate the graph. 
- pub fn graph_compute(&self, graph: &mut ComputationGraph) { - unsafe { - ggml_sys::ggml_graph_compute(self.ptr.as_ptr(), &mut graph.inner); - } - } - - /// Retrieves the memory used by this [Context]. - pub fn used_mem(&self) -> usize { - unsafe { ggml_sys::ggml_used_mem(self.ptr.as_ptr()) } - } - - /// Sets the scratch buffer to be used by this [Context]. - /// - /// If `scratch_buffer` is `None`, the scratch buffer will be disabled. - pub fn use_scratch<'a>(&'a self, scratch_buffer: Option<&'a mut Buffer>) { - let (size, data) = if let Some(buffer) = scratch_buffer { - (buffer.data.len(), buffer.data.as_ptr() as *mut c_void) - } else { - (0, std::ptr::null_mut()) - }; - // SAFETY: this just passes (most likely uninitialized) memory buffer to the ggml C API - unsafe { - ggml_sys::ggml_set_scratch( - self.ptr.as_ptr(), - ggml_sys::ggml_scratch { - offs: 0, - size, - data, - }, - ); - } - } - - /// TODO: something something - pub fn op_alibi(&self, a: &Tensor, n_past: usize, n_head: usize) -> Tensor { - let tensor = unsafe { - ggml_sys::ggml_alibi( - self.ptr.as_ptr(), - a.ptr.as_ptr(), - usize_to_i32(n_past), - usize_to_i32(n_head), - ) - }; - - self.new_tensor_raw(tensor) - } - - /// Gaussian Error Linear Units - pub fn op_gelu(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { ggml_sys::ggml_gelu(self.ptr.as_ptr(), a.ptr.as_ptr()) }; - self.new_tensor_raw(tensor) - } -} - -impl Drop for Context { - fn drop(&mut self) { - // SAFETY: The only non-weak copy of ptr is no longer accessible after - // this drop call. - unsafe { - ggml_sys::ggml_free(self.ptr.as_ptr()); - } - } -} - -/// A buffer of memory that can be used as a scratch buffer for a [Context]. -/// -/// See [Context::use_scratch]. -pub struct Buffer { - data: Box<[u8]>, -} - -impl Buffer { - /// Creates a new buffer of the specified size. - pub fn new(size: usize) -> Self { - let mut data: Vec = Vec::with_capacity(size); - - // SAFETY: The contents are intentionally uninitialized, as they will be passed to - // the ggml C API which will fill them with data. - #[allow(clippy::uninit_vec)] - unsafe { - data.set_len(size); - } - - Buffer { - data: data.into_boxed_slice(), - } - } -} - -/// Tensors are owned by the context. A tensor is alive as long as the -/// underlying context it was created with is alive. -pub struct Tensor { - ptr: NonNull, - ctx: Weak>, -} - -impl Tensor { - /// Size of the `ggml_tensor` struct in bytes. - /// - /// Exposed for purposes of determining context size. - pub const C_TYPE_SIZE: usize = std::mem::size_of::(); - - /// Creates a shared copy of this tensor pointer. - pub fn share(&self) -> Self { - Tensor { - ptr: self.ptr, - ctx: Weak::clone(&self.ctx), - } - } - - fn with_alive_ctx(&self, mut f: impl FnMut() -> U) -> U { - if let Some(_ctx) = self.ctx.upgrade() { - f() - } else { - panic!("Using a tensor after the context was dropped") - } - } - - fn with_alive_ctx_mut(&self, mut f: impl FnMut() -> U) -> U { - if let Some(_ctx) = self.ctx.upgrade() { - f() - } else { - panic!("Using a tensor after the context was dropped") - } - } - - /// Number of bytes used by this tensor. - pub fn nbytes(&self) -> usize { - self.with_alive_ctx(|| { - // SAFETY: The with_alive_call guarantees the context is alive - unsafe { ggml_sys::ggml_nbytes(self.ptr.as_ptr()) } - }) - } - - /// Provides raw mutable access to the data contained within the tensor. - /// - /// # Safety - /// - /// Only `std::slice::from_raw_parts_mut(tensor.data(), tensor.nbytes())` is safe to mutate. 
- pub unsafe fn data(&mut self) -> *mut c_void { - self.with_alive_ctx(|| { - // SAFETY: The with_alive_call guarantees the context is alive - unsafe { *self.ptr.as_ptr() }.data - }) - } - - /// Set the tensor's data pointer (useful for mmap-ed data) - /// - /// # Safety - /// - /// The memory region from `data_ptr` to `data_ptr.offset(tensor.nbytes())` will be read from. - pub unsafe fn set_data(&mut self, data_ptr: *mut c_void) { - let tensor = self.ptr.as_mut(); - self.with_alive_ctx_mut(|| { - // SAFETY: The with_alive_call guarantees the context is alive - tensor.data = data_ptr; - }) - } - - /// Number of elements in this tensor. - pub fn nelements(&self) -> usize { - self.with_alive_ctx(|| { - // SAFETY: The with_alive_call guarantees the context is alive - i64_to_usize(unsafe { ggml_sys::ggml_nelements(self.ptr.as_ptr()) }) - }) - } - - /// Number of elements in each dimension. - pub fn get_ne(&self) -> [i64; 4] { - self.with_alive_ctx(|| unsafe { *self.ptr.as_ptr() }.ne) - } - - /// Stride of each dimension. - pub fn get_nb(&self) -> [usize; 4] { - self.with_alive_ctx(|| unsafe { *self.ptr.as_ptr() }.nb) - } - - /// The data type. - pub fn get_type(&self) -> Type { - self.with_alive_ctx(|| unsafe { *self.ptr.as_ptr() }.type_.try_into().unwrap()) - } - - /// The size of the element type in bytes. - pub fn element_size(&self) -> usize { - self.with_alive_ctx(|| unsafe { ggml_sys::ggml_element_size(self.ptr.as_ptr()) }) - } - - /// Writes `src` to this tensor. - /// - /// # Safety - /// - /// This tensor must not be written to or read by from any other code. - pub unsafe fn write_data(&mut self, src: &[u8]) { - std::ptr::copy_nonoverlapping(src.as_ptr(), self.data() as *mut u8, src.len()) - } - - /// Zeroes out this tensor. - pub fn zero_data(&mut self) { - unsafe { std::ptr::write_bytes(self.data() as *mut u8, 0, self.nbytes()) } - } - - /// Reads this tensor into `dst`, starting from `offset`. - /// - /// # Safety - /// - /// This tensor must not be written to or read by from any other code. - pub unsafe fn read_data(&self, offset: usize, dst: &mut [u8]) { - let data = unsafe { ggml_sys::ggml_get_data(self.ptr.as_ptr()).add(offset) }; - std::ptr::copy_nonoverlapping(data, dst as *mut _ as _, dst.len()) - } -} - -/// A `ggml` computation graph. Keeps track of all state during computation. -pub struct ComputationGraph { - inner: ggml_sys::ggml_cgraph, -} - -impl ComputationGraph { - /// Create a new [ComputationGraph] with the specified `n_threads`. - pub fn new(n_threads: usize) -> Self { - Self { - inner: ggml_sys::ggml_cgraph { - n_threads: usize_to_i32(n_threads), - // SAFETY: This should be safe to zero. The original C++ impl - // just leaves it uninitialized - ..unsafe { std::mem::zeroed::() } - }, - } - } - - /// Build this computational graph in the forward direction in preparation for computation. - pub fn build_forward_expand(&mut self, tensor: &Tensor) { - unsafe { ggml_sys::ggml_build_forward_expand(&mut self.inner, tensor.ptr.as_ptr()) } - } -} - -/// The size of `t` as bytes. -pub fn type_size(t: Type) -> usize { - unsafe { ggml_sys::ggml_type_size(t.into()) } -} - -/// [type_size]/[blck_size] as float. -pub fn type_sizef(x: Type) -> f64 { - (unsafe { ggml_sys::ggml_type_sizef(x.into()) }) as f64 -} - -/// The size of a block for `t`. Only relevant for quantized types. 
-pub fn blck_size(t: Type) -> usize { - i32_to_usize(unsafe { ggml_sys::ggml_blck_size(t.into()) }) -} - -fn usize_to_i32(val: usize) -> i32 { - i32::try_from(val).unwrap() -} - -fn usize_to_i64(val: usize) -> i64 { - i64::try_from(val).unwrap() -} - -fn i32_to_usize(val: i32) -> usize { - usize::try_from(val).unwrap() -} - -fn i64_to_usize(val: i64) -> usize { - usize::try_from(val).unwrap() -} - -/// Contains the result of a quantization operation. -pub struct QuantizationResult { - /// The quantized output. - pub output: Vec, - /// The quantization history. - pub history: Vec, -} - -/// Quantizes `src` into `dst` using `q4_0` quantization. -/// -/// You must ensure that `src.len() == n_elements`, and `n_elements_0` -/// is the first dimension of `src`. -pub fn quantize_q4_0(src: &[f32], n_elements: usize, n_elements_0: usize) -> QuantizationResult { - quantize_impl(src, n_elements, n_elements_0, ggml_sys::ggml_quantize_q4_0) -} - -/// Quantizes `src` into `dst` using `q4_1` quantization. -/// -/// You must ensure that `src.len() == n_elements`, and `n_elements_0` -/// is the first dimension of `src`. -pub fn quantize_q4_1(src: &[f32], n_elements: usize, n_elements_0: usize) -> QuantizationResult { - quantize_impl(src, n_elements, n_elements_0, ggml_sys::ggml_quantize_q4_1) -} - -fn quantize_impl( - src: &[f32], - n_elements: usize, - n_elements_0: usize, - quantizer: unsafe extern "C" fn(*const f32, *mut c_void, c_int, c_int, *mut i64) -> usize, -) -> QuantizationResult { - assert_eq!(src.len(), n_elements); - assert_eq!(n_elements % n_elements_0, 0); - - // A conservative multiplier of 4 is used here. - let mut output = vec![0u8; n_elements * 4]; - let mut history = vec![0i64; 16]; - let output_size = unsafe { - quantizer( - src.as_ptr(), - output.as_mut_ptr() as *mut c_void, - n_elements.try_into().unwrap(), - n_elements_0.try_into().unwrap(), - history.as_mut_ptr(), - ) - }; - - output.resize(output_size, 0u8); - QuantizationResult { output, history } -} diff --git a/llama/Cargo.toml b/llama/Cargo.toml index 13632965..03b717c5 100644 --- a/llama/Cargo.toml +++ b/llama/Cargo.toml @@ -23,11 +23,11 @@ rust_tokenizers = { version = "3.1.2", optional = true } # Used for the `quantize` feature half = { version = "2.2.1", optional = true } -ggml-format = { path = "../ggml-format", optional = true } +ggml-rs = { path = "../ggml-rs", optional = true } [features] convert = ["dep:serde_json", "dep:protobuf", "dep:rust_tokenizers"] -quantize = ["dep:half", "dep:ggml-format"] +quantize = ["dep:half", "dep:ggml-rs"] [dev-dependencies] rand = { workspace = true } diff --git a/llama/src/lib.rs b/llama/src/lib.rs index 91c69f89..50cc93e3 100644 --- a/llama/src/lib.rs +++ b/llama/src/lib.rs @@ -2,7 +2,7 @@ use std::{error::Error, path::Path}; use llm_base::{ util, EvaluateOutputRequest, FileType, InferenceParameters, InferenceSession, - InferenceSessionParameters, LoadError, LoadProgress, Mmap, KnownModel, TensorLoader, + InferenceSessionParameters, KnownModel, LoadError, LoadProgress, Mmap, TensorLoader, }; #[cfg(feature = "convert")] pub mod convert; @@ -12,7 +12,7 @@ pub mod quantize; mod old_loader; -pub use llm_base::{ggml, util::TokenUtf8Buffer, TokenBias, TokenId, Vocabulary}; +pub use llm_base::{ggml_rs, util::TokenUtf8Buffer, TokenBias, TokenId, Vocabulary}; /// The weights for the LLaMA model. All the mutable state is split into a /// separate struct `InferenceSession`. 
@@ -25,10 +25,10 @@ pub struct Llama { vocabulary: Vocabulary, - tok_embeddings: ggml::Tensor, + tok_embeddings: ggml_rs::Tensor, - norm: ggml::Tensor, - output: ggml::Tensor, + norm: ggml_rs::Tensor, + output: ggml_rs::Tensor, layers: Vec, @@ -36,7 +36,7 @@ pub struct Llama { _mmap: Option, // Must be kept alive for the model - _context: ggml::Context, + _context: ggml_rs::context::Context, } unsafe impl Send for Llama {} unsafe impl Sync for Llama {} @@ -183,18 +183,18 @@ impl KnownModel for Llama { // add 10% to account for ggml object overhead buf_size = (1.1f64 * session.mem_per_token as f64 * n as f64) as usize; }; - let ctx0 = ggml::Context::init(buf_size, true); + let ctx0 = ggml_rs::context::Context::init(buf_size, true); - let mut gf = ggml::ComputationGraph::new(n_threads); + let mut gf = ggml_rs::ComputationGraph::new(n_threads); - let mut embd = ctx0.new_tensor_1d(ggml::Type::I32, n); + let mut embd = ctx0.new_tensor_1d(ggml_rs::Type::I32, n); unsafe { embd.write_data(bytemuck::cast_slice(input_tokens)) }; let mut input_layer = ctx0.op_get_rows(&self.tok_embeddings, &embd); for il in 0..n_layer { let input_self_attention = input_layer.share(); - let mut current: ggml::Tensor; + let mut current: ggml_rs::Tensor; ctx0.use_scratch(Some(&mut session.scratch[0])); @@ -312,7 +312,7 @@ impl KnownModel for Llama { // cur = KQV_merged.contiguous().view(n_embd, N) current = ctx0.op_cpy( &k_q_v_merged, - &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, n), + &ctx0.new_tensor_2d(ggml_rs::Type::F32, n_embd, n), ); // projection (no bias) @@ -437,7 +437,7 @@ impl Llama { /// This does *not* construct a valid model. All of the tensors are entirely /// empty. However, it can be used to determine if some code will compile. fn new_empty() -> Self { - let context = ggml::Context::init(1 * 1024 * 1024, true); + let context = ggml_rs::context::Context::init(1024 * 1024, true); let tok_embeddings = context.new_f32(0.0); let norm = context.new_f32(0.0); let output = context.new_f32(0.0); @@ -496,20 +496,20 @@ impl llm_base::Hyperparameters for Hyperparameters { } struct Layer { - attention_norm: ggml::Tensor, + attention_norm: ggml_rs::Tensor, - wq: ggml::Tensor, - wk: ggml::Tensor, - wv: ggml::Tensor, - wo: ggml::Tensor, + wq: ggml_rs::Tensor, + wk: ggml_rs::Tensor, + wv: ggml_rs::Tensor, + wo: ggml_rs::Tensor, // normalization - ffn_norm: ggml::Tensor, + ffn_norm: ggml_rs::Tensor, // ff - w1: ggml::Tensor, - w2: ggml::Tensor, - w3: ggml::Tensor, + w1: ggml_rs::Tensor, + w2: ggml_rs::Tensor, + w3: ggml_rs::Tensor, } #[cfg(test)] diff --git a/llama/src/old_loader.rs b/llama/src/old_loader.rs index 56f3fcba..1e1bc3e2 100644 --- a/llama/src/old_loader.rs +++ b/llama/src/old_loader.rs @@ -35,9 +35,9 @@ pub(crate) fn load( // Verify magic let magic = util::read_u32(&mut reader)?; let model_type: ContainerType = match magic { - ggml::FILE_MAGIC_GGMF => ContainerType::Ggmf, - ggml::FILE_MAGIC_GGJT => ContainerType::Ggjt, - ggml::FILE_MAGIC_UNVERSIONED => ContainerType::Ggml, + ggml_rs::FILE_MAGIC_GGMF => ContainerType::Ggmf, + ggml_rs::FILE_MAGIC_GGJT => ContainerType::Ggjt, + ggml_rs::FILE_MAGIC_UNVERSIONED => ContainerType::Ggml, _ => { return Err(LoadError::InvalidMagic { path: main_path.to_owned(), @@ -50,7 +50,7 @@ pub(crate) fn load( match model_type { ContainerType::Ggmf | ContainerType::Ggjt => { let _version: u32 = match util::read_u32(&mut reader)? 
{ - ggml::FORMAT_VERSION => ggml::FORMAT_VERSION, + ggml_rs::FORMAT_VERSION => ggml_rs::FORMAT_VERSION, version => { return Err(LoadError::InvalidFormatVersion { container_type: model_type, @@ -115,10 +115,10 @@ pub(crate) fn load( // floats or quantized in order to save memory and also to speed up the // computation let wtype = match hparams.file_type { - FileType::F32 => ggml::Type::F32, - FileType::MostlyF16 => ggml::Type::F16, - FileType::MostlyQ4_0 => ggml::Type::Q4_0, - FileType::MostlyQ4_1 => ggml::Type::Q4_1, + FileType::F32 => ggml_rs::Type::F32, + FileType::MostlyF16 => ggml_rs::Type::F16, + FileType::MostlyQ4_0 => ggml_rs::Type::Q4_0, + FileType::MostlyQ4_1 => ggml_rs::Type::Q4_1, _ => unimplemented!(), }; @@ -135,22 +135,22 @@ pub(crate) fn load( if alloc { let mut model_size: usize = 0; - ctx_size += mulf!(n_embd, n_vocab, ggml::type_sizef(wtype)); // tok_embeddings - ctx_size += mulf!(n_embd, ggml::type_sizef(ggml::Type::F32)); // norm - ctx_size += mulf!(n_embd, n_vocab, ggml::type_sizef(wtype)); // output + ctx_size += mulf!(n_embd, n_vocab, ggml_rs::type_sizef(wtype)); // tok_embeddings + ctx_size += mulf!(n_embd, ggml_rs::type_sizef(ggml_rs::Type::F32)); // norm + ctx_size += mulf!(n_embd, n_vocab, ggml_rs::type_sizef(wtype)); // output - model_size += mulf!(n_layer, n_embd, ggml::type_sizef(ggml::Type::F32)); // attention_norm + model_size += mulf!(n_layer, n_embd, ggml_rs::type_sizef(ggml_rs::Type::F32)); // attention_norm - model_size += mulf!(n_layer, n_embd, n_embd, ggml::type_sizef(wtype)); // wq - model_size += mulf!(n_layer, n_embd, n_embd, ggml::type_sizef(wtype)); // wk - model_size += mulf!(n_layer, n_embd, n_embd, ggml::type_sizef(wtype)); // wv - model_size += mulf!(n_layer, n_embd, n_embd, ggml::type_sizef(wtype)); // wo + model_size += mulf!(n_layer, n_embd, n_embd, ggml_rs::type_sizef(wtype)); // wq + model_size += mulf!(n_layer, n_embd, n_embd, ggml_rs::type_sizef(wtype)); // wk + model_size += mulf!(n_layer, n_embd, n_embd, ggml_rs::type_sizef(wtype)); // wv + model_size += mulf!(n_layer, n_embd, n_embd, ggml_rs::type_sizef(wtype)); // wo - model_size += mulf!(n_layer, n_embd, ggml::type_sizef(ggml::Type::F32)); // ffn_norm + model_size += mulf!(n_layer, n_embd, ggml_rs::type_sizef(ggml_rs::Type::F32)); // ffn_norm - model_size += mulf!(n_layer, n_ff, n_embd, ggml::type_sizef(wtype)); // w1 - model_size += mulf!(n_layer, n_ff, n_embd, ggml::type_sizef(wtype)); // w2 - model_size += mulf!(n_layer, n_ff, n_embd, ggml::type_sizef(wtype)); // w3 + model_size += mulf!(n_layer, n_ff, n_embd, ggml_rs::type_sizef(wtype)); // w1 + model_size += mulf!(n_layer, n_ff, n_embd, ggml_rs::type_sizef(wtype)); // w2 + model_size += mulf!(n_layer, n_ff, n_embd, ggml_rs::type_sizef(wtype)); // w3 ctx_size += model_size; } @@ -161,7 +161,7 @@ pub(crate) fn load( }; // Initialize the context - let context = ggml::Context::init(ctx_size, alloc); + let context = ggml_rs::context::Context::init(ctx_size, alloc); let (mmap, mmap_ptr) = if prefer_mmap && model_type.support_mmap() { let mmap = util::mmap_populate(&file)?; @@ -217,7 +217,7 @@ fn load_weights_ggmf_or_unversioned( file_offset: u64, main_path: &Path, mut load_progress_callback: impl FnMut(LoadProgress), - tensors: &mut HashMap, + tensors: &mut HashMap, ) -> Result<(), LoadError> { use std::{fs::File, io::BufReader}; @@ -269,7 +269,7 @@ fn load_weights_ggmf_or_unversioned( )?; if n_dims == 1 || n_parts == 1 { - if (nelements * bpe) / ggml::blck_size(tensor.get_type()) != tensor.nbytes() { + if (nelements * bpe) / 
ggml_rs::blck_size(tensor.get_type()) != tensor.nbytes() { return Err(LoadError::TensorWrongSize { tensor_name, path: part_path, @@ -289,7 +289,7 @@ fn load_weights_ggmf_or_unversioned( total_size += tensor.nbytes(); } else { - if (nelements * bpe) / ggml::blck_size(tensor.get_type()) + if (nelements * bpe) / ggml_rs::blck_size(tensor.get_type()) != tensor.nbytes() / n_parts { return Err(LoadError::TensorWrongSize { @@ -301,16 +301,16 @@ fn load_weights_ggmf_or_unversioned( if split_type == 0 { let np0 = ne[0]; let row_size = (usize::try_from(tensor.get_ne()[0])? - / ggml::blck_size(tensor.get_type())) - * ggml::type_size(tensor.get_type()); + / ggml_rs::blck_size(tensor.get_type())) + * ggml_rs::type_size(tensor.get_type()); assert_eq!(row_size, tensor.get_nb()[1]); for i1 in 0..ne[1] { let offset_row = i1 as usize * row_size; let offset = offset_row - + ((part_id * np0 as usize) / ggml::blck_size(tensor.get_type())) - * ggml::type_size(tensor.get_type()); + + ((part_id * np0 as usize) / ggml_rs::blck_size(tensor.get_type())) + * ggml_rs::type_size(tensor.get_type()); // SAFETY: yolo, same as original code unsafe { let ptr = tensor.data().add(offset); @@ -322,8 +322,8 @@ fn load_weights_ggmf_or_unversioned( } else { let np1 = ne[1]; let row_size = (usize::try_from(tensor.get_ne()[0])? - / ggml::blck_size(tensor.get_type())) - * ggml::type_size(tensor.get_type()); + / ggml_rs::blck_size(tensor.get_type())) + * ggml_rs::type_size(tensor.get_type()); for i1 in 0..ne[1] { let offset_row = (i1 as usize + part_id * np1 as usize) * row_size; @@ -360,7 +360,7 @@ struct TensorHeaderGgmf<'a> { nelements: usize, ne: [i64; 2], tensor_name: String, - tensor: &'a mut ggml::Tensor, + tensor: &'a mut ggml_rs::Tensor, split_type: i32, bpe: usize, } @@ -368,7 +368,7 @@ fn load_tensor_header_ggmf<'a>( n_dims: usize, reader: &mut impl BufRead, length: i32, - tensors: &'a mut HashMap, + tensors: &'a mut HashMap, path: &Path, n_parts: usize, ftype: u32, @@ -456,14 +456,14 @@ fn load_tensor_header_ggmf<'a>( } fn tensor_type_size(ftype: u32, ne: [i64; 2]) -> Option { - let ftype = ggml::Type::try_from(ftype).ok()?; + let ftype = ggml_rs::Type::try_from(ftype).ok()?; match ftype { - ggml::Type::Q4_0 | ggml::Type::Q4_1 => { + ggml_rs::Type::Q4_0 | ggml_rs::Type::Q4_1 => { assert_eq!(ne[0] % 64, 0); } _ => {} } - Some(ggml::type_size(ftype)) + Some(ggml_rs::type_size(ftype)) } fn load_weights_ggjt( @@ -471,7 +471,7 @@ fn load_weights_ggjt( mmap_base: Option<*const u8>, path: &Path, mut load_progress_callback: impl FnMut(LoadProgress), - tensors: &mut HashMap, + tensors: &mut HashMap, ) -> Result<(), LoadError> // where R: std::io::Read { @@ -561,7 +561,7 @@ fn load_weights_ggjt( fn load_tensor_ggjt_mmap( reader: &mut (impl BufRead + Seek), mmap_base: *const u8, - tensor: &mut ggml::Tensor, + tensor: &mut ggml_rs::Tensor, ) -> Result<(), LoadError> { let offset_curr = reader.stream_position()?; let offset_aligned: u64 = (offset_curr + 31) & !31; @@ -575,7 +575,7 @@ fn load_tensor_ggjt_mmap( fn load_tensor_ggjt_copy<'a>( reader: &mut (impl BufRead + Seek), - tensor: &'a mut ggml::Tensor, + tensor: &'a mut ggml_rs::Tensor, ) -> Result<(), LoadError> { let offset_curr = reader.stream_position()?; let offset_aligned: u64 = (offset_curr + 31) & !31; diff --git a/llama/src/quantize.rs b/llama/src/quantize.rs index 54643a1c..8a50bdb6 100644 --- a/llama/src/quantize.rs +++ b/llama/src/quantize.rs @@ -1,9 +1,12 @@ //! Implements quantization of weights. 
use crate::{Hyperparameters, LoadError, LoadProgress}; -use ggml_format::{SaveError, SaveHandler, TensorData, TensorInfo}; +use ggml_rs::{ + loader::TensorInfo, + saver::{SaveError, SaveHandler, TensorData}, +}; use half::f16; -use llm_base::{ggml, util, Loader}; +use llm_base::{ggml_rs, util, Loader}; use std::{ collections::HashMap, fs::File, @@ -26,7 +29,7 @@ pub enum QuantizeProgress<'a> { /// Size of the tensor. dims: [usize; 2], /// Type of the tensor. - element_type: ggml::Type, + element_type: ggml_rs::Type, /// Number of elements in the tensor. n_elements: usize, }, @@ -101,13 +104,13 @@ pub enum QuantizeError { #[error("invalid quantization target {element_type:?}")] InvalidQuantizationTarget { /// The quantization target. - element_type: ggml::Type, + element_type: ggml_rs::Type, }, /// The quantization process encountered an unsupported element type. #[error("unsupported element type {element_type:?}")] UnsupportedElementType { /// The element type. - element_type: ggml::Type, + element_type: ggml_rs::Type, }, } impl QuantizeError { @@ -127,11 +130,11 @@ impl QuantizeError { pub fn quantize( path_in: impl AsRef, path_out: impl AsRef, - desired_type: ggml::Type, + desired_type: ggml_rs::Type, progress_callback: impl Fn(QuantizeProgress), ) -> Result<(), QuantizeError> { // Sanity check - if !matches!(desired_type, ggml::Type::Q4_0 | ggml::Type::Q4_1) { + if !matches!(desired_type, ggml_rs::Type::Q4_0 | ggml_rs::Type::Q4_1) { return Err(QuantizeError::InvalidQuantizationTarget { element_type: desired_type, }); @@ -154,7 +157,7 @@ pub fn quantize( } } }); - ggml_format::load_model(&mut reader, &mut loader) + ggml_rs::loader::load_model(&mut reader, &mut loader) .map_err(|err| LoadError::from_format_error(err, path_in.to_owned()))?; // Save the quantized model, quantizing as we go @@ -181,7 +184,7 @@ pub fn quantize( &mut file_in, |p| progress_callback(p), ); - ggml_format::save_model( + ggml_rs::saver::save_model( &mut writer, &mut saver, &vocabulary, @@ -206,7 +209,7 @@ pub fn quantize( struct QuantizeSaver<'a, F: Fn(QuantizeProgress)> { // Input - quantization_type: ggml::Type, + quantization_type: ggml_rs::Type, hyperparameters: &'a Hyperparameters, tensors: &'a HashMap, source_file: &'a mut File, @@ -219,7 +222,7 @@ struct QuantizeSaver<'a, F: Fn(QuantizeProgress)> { } impl<'a, F: Fn(QuantizeProgress)> QuantizeSaver<'a, F> { fn new( - quantization_type: ggml::Type, + quantization_type: ggml_rs::Type, hyperparameters: &'a Hyperparameters, tensors: &'a HashMap, source_file: &'a mut File, @@ -267,7 +270,7 @@ impl SaveHandler for QuantizeSaver<'_, F let quantize = tensor_name.contains("weight") && tensor.n_dims == 2; let raw_data = tensor.read_data(&mut BufReader::new(&mut self.source_file))?; - if quantize && !matches!(tensor.element_type, ggml::Type::F32 | ggml::Type::F16) { + if quantize && !matches!(tensor.element_type, ggml_rs::Type::F32 | ggml_rs::Type::F16) { return Err(QuantizeError::UnsupportedElementType { element_type: tensor.element_type, }); @@ -279,11 +282,11 @@ impl SaveHandler for QuantizeSaver<'_, F (self.progress_callback)(QuantizeProgress::TensorQuantizing { name: tensor_name }); let data_f32: Vec = match tensor.element_type { - ggml::Type::F32 => raw_data + ggml_rs::Type::F32 => raw_data .chunks_exact(4) .map(|chunk| f32::from_le_bytes(chunk.try_into().unwrap())) .collect(), - ggml::Type::F16 => raw_data + ggml_rs::Type::F16 => raw_data .chunks_exact(2) .map(|chunk| { f16::from_bits(u16::from_le_bytes(chunk.try_into().unwrap())).to_f32() @@ -293,11 +296,11 @@ 
impl SaveHandler for QuantizeSaver<'_, F }; let result = match self.quantization_type { - ggml::Type::Q4_0 => { - ggml::quantize_q4_0(&data_f32, tensor.n_elements, tensor.dims[0]) + ggml_rs::Type::Q4_0 => { + ggml_rs::quantize_q4_0(&data_f32, tensor.n_elements, tensor.dims[0]) } - ggml::Type::Q4_1 => { - ggml::quantize_q4_1(&data_f32, tensor.n_elements, tensor.dims[0]) + ggml_rs::Type::Q4_1 => { + ggml_rs::quantize_q4_1(&data_f32, tensor.n_elements, tensor.dims[0]) } _ => unreachable!(), }; diff --git a/llm-base/Cargo.toml b/llm-base/Cargo.toml index 7968f7dd..55dec81e 100644 --- a/llm-base/Cargo.toml +++ b/llm-base/Cargo.toml @@ -7,8 +7,7 @@ rust-version = "1.65" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -ggml = { path = "../ggml" } -ggml-format = { path = "../ggml-format" } +ggml-rs = { path = "../ggml-rs" } bytemuck = { workspace = true } rand = { workspace = true } diff --git a/llm-base/src/inference_session.rs b/llm-base/src/inference_session.rs index e161c8e8..e3c60d29 100644 --- a/llm-base/src/inference_session.rs +++ b/llm-base/src/inference_session.rs @@ -5,7 +5,7 @@ use rand::{distributions::WeightedIndex, prelude::Distribution}; use thiserror::Error; use crate::{ - mulf, Model, EvaluateOutputRequest, InferenceError, InferenceParameters, TokenId, + mulf, EvaluateOutputRequest, InferenceError, InferenceParameters, Model, TokenId, TokenUtf8Buffer, EOT_TOKEN_ID, }; @@ -27,7 +27,7 @@ const SCRATCH_SIZE: usize = 512 * 1024 * 1024; /// to use it from multiple threads. pub struct InferenceSession { // Must be kept alive for the model - pub(crate) _session_ctx: ggml::Context, + pub(crate) _session_ctx: ggml_rs::context::Context, // Original size of the memory used to create this context. pub(crate) memory_size: usize, @@ -36,10 +36,10 @@ pub struct InferenceSession { pub(crate) params: InferenceSessionParameters, /// Memory K - pub memory_k: ggml::Tensor, + pub memory_k: ggml_rs::Tensor, /// Memory M - pub memory_v: ggml::Tensor, + pub memory_v: ggml_rs::Tensor, /// How many tokens have been fed into the model's working memory so far. pub n_past: usize, @@ -58,7 +58,7 @@ pub struct InferenceSession { /// /// The number of scratch buffers was copied from `llama.cpp`. /// There is no specific reason for this number, but one is insufficient. 
- pub scratch: [ggml::Buffer; 2], + pub scratch: [ggml_rs::Buffer; 2], } unsafe impl Send for InferenceSession {} impl InferenceSession { @@ -365,19 +365,19 @@ impl InferenceSession { n_ctx, n_layer, n_embd, - ggml::type_sizef(params.memory_k_type.into()) + ggml_rs::type_sizef(params.memory_k_type.into()) ); // memory_k ctx_size += mulf!( n_ctx, n_layer, n_embd, - ggml::type_sizef(params.memory_v_type.into()) + ggml_rs::type_sizef(params.memory_v_type.into()) ); // memory_v ctx_size += (5 + 10 * n_layer) * 256; // object overhead ctx_size }; - let session_ctx = ggml::Context::init(ctx_size, true); + let session_ctx = ggml_rs::context::Context::init(ctx_size, true); // Initialize key + value memory tensors let n_mem = n_layer * n_ctx; @@ -409,7 +409,7 @@ impl InferenceSession { } impl Clone for InferenceSession { fn clone(&self) -> Self { - let context = ggml::Context::init(self.memory_size, true); + let context = ggml_rs::context::Context::init(self.memory_size, true); let memory_k = context.new_tensor_1d(self.memory_k.get_type(), self.memory_k.nelements()); let memory_v = context.new_tensor_1d(self.memory_v.get_type(), self.memory_v.nelements()); @@ -569,18 +569,18 @@ pub enum ModelKVMemoryType { /// 32-bit float. Float32, } -impl From for ggml::Type { +impl From for ggml_rs::Type { fn from(value: ModelKVMemoryType) -> Self { match value { - ModelKVMemoryType::Float16 => ggml::Type::F16, - ModelKVMemoryType::Float32 => ggml::Type::F32, + ModelKVMemoryType::Float16 => ggml_rs::Type::F16, + ModelKVMemoryType::Float32 => ggml_rs::Type::F32, } } } -fn scratch_buffers() -> [ggml::Buffer; 2] { +fn scratch_buffers() -> [ggml_rs::Buffer; 2] { [ - ggml::Buffer::new(SCRATCH_SIZE), - ggml::Buffer::new(SCRATCH_SIZE), + ggml_rs::Buffer::new(SCRATCH_SIZE), + ggml_rs::Buffer::new(SCRATCH_SIZE), ] } diff --git a/llm-base/src/lib.rs b/llm-base/src/lib.rs index 846a7ae7..1b1bf6d8 100644 --- a/llm-base/src/lib.rs +++ b/llm-base/src/lib.rs @@ -14,15 +14,15 @@ mod inference_session; mod loader; mod vocabulary; -pub use ggml; -pub use ggml::Type as ElementType; +pub use ggml_rs; +pub use ggml_rs::Type as ElementType; pub use inference_session::{ InferenceSession, InferenceSessionParameters, InferenceSnapshot, ModelKVMemoryType, SnapshotError, }; pub use loader::{load, ContainerType, FileType, LoadError, LoadProgress, Loader, TensorLoader}; pub use memmap2::Mmap; -pub use model::{Model, Hyperparameters, KnownModel}; +pub use model::{Hyperparameters, KnownModel, Model}; pub use util::TokenUtf8Buffer; pub use vocabulary::{TokenBias, TokenId, Vocabulary}; diff --git a/llm-base/src/loader.rs b/llm-base/src/loader.rs index e5412305..402fe9cc 100644 --- a/llm-base/src/loader.rs +++ b/llm-base/src/loader.rs @@ -10,8 +10,11 @@ use crate::{ util::{self, FindAllModelFilesError}, Hyperparameters, KnownModel, TokenId, Vocabulary, }; -pub use ggml_format::ContainerType; -use ggml_format::{LoadError as FormatLoadError, PartialHyperparameters, TensorInfo}; +pub use ggml_rs::ContainerType; +use ggml_rs::{ + context::Context, + loader::{LoadError as FormatLoadError, PartialHyperparameters, TensorInfo}, +}; use memmap2::Mmap; use thiserror::Error; @@ -276,9 +279,9 @@ impl LoadError { /// Used by models to fetch tensors from a loader. pub trait TensorLoader { /// Loads a tensor from the loader. - fn load(&mut self, name: &str, ne: &[usize]) -> Result; + fn load(&mut self, name: &str, ne: &[usize]) -> Result; /// Finish loading the model, and extract all of the state from the loader. 
- fn finish(self) -> (ggml::Context, HashMap, Option); + fn finish(self) -> (Context, HashMap, Option); } /// Load an arbitrary GGML model. @@ -311,7 +314,7 @@ pub fn load( let mut loader = Loader::new(load_progress_callback); - ggml_format::load_model(&mut reader, &mut loader) + ggml_rs::loader::load_model(&mut reader, &mut loader) .map_err(|err| LoadError::from_format_error(err, path.clone()))?; let Loader { @@ -328,13 +331,13 @@ pub fn load( let ctx_size = tensors .values() .map(|ti| { - ggml::Tensor::C_TYPE_SIZE - + ggml::OBJECT_SIZE + ggml_rs::Tensor::C_TYPE_SIZE + + ggml_rs::OBJECT_SIZE + if use_mmap { 0 } else { ti.calc_size() } }) .sum::(); (load_progress_callback)(LoadProgress::ContextSize { bytes: ctx_size }); - let context = ggml::Context::init(ctx_size, !use_mmap); + let context = Context::init(ctx_size, !use_mmap); let mmap = if use_mmap { let file = File::open(&path)?; @@ -347,13 +350,13 @@ pub fn load( path: PathBuf, file: File, tensors: HashMap, - context: ggml::Context, + context: Context, mmap: Option, load_progress_callback: &'a mut dyn FnMut(LoadProgress), - loaded_tensors: HashMap, + loaded_tensors: HashMap, } impl TensorLoader for MmapCompatibleLoader<'_> { - fn load(&mut self, name: &str, ne: &[usize]) -> Result { + fn load(&mut self, name: &str, ne: &[usize]) -> Result { let info = self .tensors .get(name) @@ -412,7 +415,7 @@ pub fn load( Ok(tensor) } - fn finish(self) -> (ggml::Context, HashMap, Option) { + fn finish(self) -> (Context, HashMap, Option) { (self.context, self.loaded_tensors, self.mmap) } } @@ -467,7 +470,7 @@ impl Loader { } } } -impl ggml_format::LoadHandler +impl ggml_rs::loader::LoadHandler for Loader { fn container_type(&mut self, container_type: ContainerType) -> Result<(), LoadError> { diff --git a/llm-base/src/snapshot.rs b/llm-base/src/snapshot.rs index 660e244b..af74ada2 100644 --- a/llm-base/src/snapshot.rs +++ b/llm-base/src/snapshot.rs @@ -5,7 +5,7 @@ use std::{ path::Path, }; -use crate::{Model, InferenceSession, InferenceSessionParameters}; +use crate::{InferenceSession, InferenceSessionParameters, Model}; use zstd::{ stream::{read::Decoder, write::Encoder}, diff --git a/llm-base/src/util.rs b/llm-base/src/util.rs index 757cc5fb..5d851e2f 100644 --- a/llm-base/src/util.rs +++ b/llm-base/src/util.rs @@ -1,4 +1,4 @@ -pub use ggml_format::util::*; +pub use ggml_rs::util::*; use std::path::{Path, PathBuf}; /// NOTE: The original code relies in promotion rules and automatic cast between diff --git a/llm-cli/src/cli_args.rs b/llm-cli/src/cli_args.rs index 6f8c6791..87e8986f 100644 --- a/llm-cli/src/cli_args.rs +++ b/llm-cli/src/cli_args.rs @@ -6,7 +6,7 @@ use std::{ use clap::{Parser, ValueEnum}; use color_eyre::eyre::{Result, WrapErr}; use llm::{ - ElementType, Model, InferenceParameters, InferenceSessionParameters, LoadProgress, + ElementType, InferenceParameters, InferenceSessionParameters, LoadProgress, Model, ModelKVMemoryType, TokenBias, EOT_TOKEN_ID, }; use rand::SeedableRng; diff --git a/llm/src/lib.rs b/llm/src/lib.rs index ebac4ed5..9e5a01bd 100644 --- a/llm/src/lib.rs +++ b/llm/src/lib.rs @@ -1,7 +1,7 @@ pub use llm_base::{ - load, snapshot, ElementType, Model, FileType, InferenceError, InferenceParameters, - InferenceSession, InferenceSessionParameters, InferenceSnapshot, LoadError, LoadProgress, - KnownModel, ModelKVMemoryType, SnapshotError, TokenBias, TokenId, TokenUtf8Buffer, Vocabulary, + load, snapshot, ElementType, FileType, InferenceError, InferenceParameters, InferenceSession, + InferenceSessionParameters, 
InferenceSnapshot, KnownModel, LoadError, LoadProgress, Model,
+    ModelKVMemoryType, SnapshotError, TokenBias, TokenId, TokenUtf8Buffer, Vocabulary,
     EOT_TOKEN_ID,
 };

From 8594ac8767c7321c3b6548fd24593623fdebc228 Mon Sep 17 00:00:00 2001
From: Dan Forbes
Date: Fri, 28 Apr 2023 10:39:53 -0700
Subject: [PATCH 22/35] Use latest upstream ggml with alibi

---
 ggml-rs/ggml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-rs/ggml b/ggml-rs/ggml
index 8cc06712..870ba203 160000
--- a/ggml-rs/ggml
+++ b/ggml-rs/ggml
@@ -1 +1 @@
-Subproject commit 8cc067122059864eb0fa97bf50d5dd71c5050b4b
+Subproject commit 870ba203dc228e14dcbd449cecbe2db954cb2741

From a542c98f351a631e03780153f35977cb41db8673 Mon Sep 17 00:00:00 2001
From: Dan Forbes
Date: Fri, 28 Apr 2023 11:22:00 -0700
Subject: [PATCH 23/35] Improve examples

---
 bloom/examples/bloom_inference.rs | 37 +++++++++++++++----------
 llama/examples/llama_inference.rs | 37 +++++++++++++++----------
 llm-base/src/lib.rs | 5 +++-
 llm-base/src/loader.rs | 46 +++++++++++++++++++++++++++++++
 4 files changed, 96 insertions(+), 29 deletions(-)

diff --git a/bloom/examples/bloom_inference.rs b/bloom/examples/bloom_inference.rs
index 9972cacf..8ca2435b 100644
--- a/bloom/examples/bloom_inference.rs
+++ b/bloom/examples/bloom_inference.rs
@@ -1,24 +1,31 @@
 use std::{convert::Infallible, env::args, io::Write};
 
-use llm_base::{snapshot, LoadError};
+use llm_base::{load_progress_callback, model::KnownModel};
 
 extern crate bloom;
 
-fn main() -> Result<(), LoadError> {
+fn main() {
     let args: Vec<String> = args().collect();
-    let bloom = bloom::Bloom::load(&args[1], true, 32, |_| {})?;
-    let (mut session, _) = snapshot::read_or_create_session(
-        &bloom,
-        Default::default(),
-        Default::default(),
-        Default::default(),
-    );
+    let loc = &args[1];
+    let prompt = match &args.len() {
+        3 => &args[2],
+        _ => "Rust is a cool programming language because ",
+    };
+
+    println!(" >>> Loading model from {loc}...");
+    let now = std::time::Instant::now();
+
+    let bloom = bloom::Bloom::load(loc, true, 512, load_progress_callback)
+        .unwrap_or_else(|e| panic!("Error loading model from {loc}: {e}"));
+
+    println!(" >>> Model loaded in {} ms.", now.elapsed().as_millis());
 
-    let _ = session.inference_with_prompt::<Infallible>(
+    let mut session = bloom.start_session(Default::default());
+    let res = session.inference_with_prompt::<Infallible>(
         &bloom,
         &Default::default(),
-        "The best kind of wine is ",
-        Some(32),
+        prompt,
+        None,
         &mut rand::thread_rng(),
         |t| {
             print!("{t}");
@@ -28,6 +35,8 @@ fn main() -> Result<(), LoadError> {
         },
     );
 
-    println!();
-    Ok(())
+    match res {
+        Ok(result) => println!("\n\nInference stats:\n{result}"),
+        Err(err) => println!("\n{err}"),
+    }
 }
diff --git a/llama/examples/llama_inference.rs b/llama/examples/llama_inference.rs
index b644f1f2..9b0ceb2b 100644
--- a/llama/examples/llama_inference.rs
+++ b/llama/examples/llama_inference.rs
@@ -1,24 +1,31 @@
 use std::{convert::Infallible, env::args, io::Write};
 
-use llm_base::{snapshot, LoadError};
+use llm_base::{load_progress_callback, model::KnownModel};
 
 extern crate llama;
 
-fn main() -> Result<(), LoadError> {
+fn main() {
     let args: Vec<String> = args().collect();
-    let llama = llama::Llama::load(&args[1], true, 32, |_| {})?;
-    let (mut session, _) = snapshot::read_or_create_session(
-        &llama,
-        Default::default(),
-        Default::default(),
-        Default::default(),
-    );
+    let loc = &args[1];
+    let prompt = match &args.len() {
+        3 => &args[2],
+        _ => "Rust is a cool programming language because ",
+    };
+
+    println!(" >>> Loading model from {loc}...");
+    let now = std::time::Instant::now();
+
+    let llama = llama::Llama::load(loc, true, 512, load_progress_callback)
+        .unwrap_or_else(|e| panic!("Error loading model from {loc}: {e}"));
+
+    println!(" >>> Model loaded in {} ms.", now.elapsed().as_millis());
 
-    let _ = session.inference_with_prompt::<Infallible>(
+    let mut session = llama.start_session(Default::default());
+    let res = session.inference_with_prompt::<Infallible>(
+        &llama,
         &Default::default(),
-        "The best kind of wine is ",
-        Some(32),
+        prompt,
+        None,
         &mut rand::thread_rng(),
         |t| {
             print!("{t}");
@@ -28,6 +35,8 @@ fn main() -> Result<(), LoadError> {
         },
    );
 
-    println!();
-    Ok(())
+    match res {
+        Ok(result) => println!("\n\nInference stats:\n{result}"),
+        Err(err) => println!("\n{err}"),
+    }
 }
diff --git a/llm-base/src/lib.rs b/llm-base/src/lib.rs
index 1b1bf6d8..8edc939a 100644
--- a/llm-base/src/lib.rs
+++ b/llm-base/src/lib.rs
@@ -20,7 +20,10 @@ pub use inference_session::{
     InferenceSession, InferenceSessionParameters, InferenceSnapshot, ModelKVMemoryType,
     SnapshotError,
 };
-pub use loader::{load, ContainerType, FileType, LoadError, LoadProgress, Loader, TensorLoader};
+pub use loader::{
+    load, load_progress_callback, ContainerType, FileType, LoadError, LoadProgress, Loader,
+    TensorLoader,
+};
 pub use memmap2::Mmap;
 pub use model::{Hyperparameters, KnownModel, Model};
 pub use util::TokenUtf8Buffer;
diff --git a/llm-base/src/loader.rs b/llm-base/src/loader.rs
index 402fe9cc..9fd22d99 100644
--- a/llm-base/src/loader.rs
+++ b/llm-base/src/loader.rs
@@ -508,3 +508,49 @@ impl ggml_rs::loader::LoadHandler
+
+pub fn load_progress_callback(progress: LoadProgress) {
+    match progress {
+        LoadProgress::HyperparametersLoaded(_) => println!("Loaded hyperparameters"),
+        LoadProgress::ContextSize { bytes } => println!(
+            "ggml ctx size = {:.2} MB\n",
+            bytes as f64 / (1024.0 * 1024.0)
+        ),
+        LoadProgress::PartLoading {
+            file,
+            current_part,
+            total_parts,
+        } => {
+            let current_part = current_part + 1;
+            println!(
+                "Loading model part {}/{} from '{}'\n",
+                current_part,
+                total_parts,
+                file.to_string_lossy()
+            )
+        }
+        LoadProgress::PartTensorLoaded {
+            current_tensor,
+            tensor_count,
+            ..
+ } => { + let current_tensor = current_tensor + 1; + if current_tensor % 8 == 0 { + println!("Loaded tensor {current_tensor}/{tensor_count}"); + } + } + LoadProgress::PartLoaded { + file, + byte_size, + tensor_count, + } => { + println!("Loading of '{}' complete", file.to_string_lossy()); + println!( + "Model size = {:.2} MB / num tensors = {}", + byte_size as f64 / 1024.0 / 1024.0, + tensor_count + ); + } + }; +} From 16fca15566be6af5763e24ea61ac1c69ec1c6b62 Mon Sep 17 00:00:00 2001 From: Dan Forbes Date: Fri, 28 Apr 2023 11:22:29 -0700 Subject: [PATCH 24/35] Latest upstream ggml --- ggml-rs/ggml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-rs/ggml b/ggml-rs/ggml index 870ba203..583c5a3a 160000 --- a/ggml-rs/ggml +++ b/ggml-rs/ggml @@ -1 +1 @@ -Subproject commit 870ba203dc228e14dcbd449cecbe2db954cb2741 +Subproject commit 583c5a3ad6bdb041bff5ad161a49ff4d8fa52f10 From 974d2f7c5e1c8da038018ef3f0ca58fda839a92a Mon Sep 17 00:00:00 2001 From: Dan Forbes Date: Fri, 28 Apr 2023 11:54:48 -0700 Subject: [PATCH 25/35] Cleanup README --- README.md | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 059091fd..129cb07c 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,21 @@ # LLaMA-rs - +This project is a Rust port of +[llama.cpp](https://github.com/ggerganov/llama.cpp) 🦙🦀🚀 -> Do the LLaMA thing, but now in Rust 🦀🚀🦙 - -![A llama riding a crab, AI-generated](./doc/resources/logo2.png) - -> _Image by [@darthdeus](https://github.com/darthdeus/), using Stable Diffusion_ - -[![ko-fi](https://ko-fi.com/img/githubbutton_sm.svg)](https://ko-fi.com/F1F8DNO5D) +Just like its C++ counterpart, it is powered by the +[`ggml`](https://github.com/ggerganov/ggml) tensor library, which allows running +inference for Facebook's [LLaMA](https://github.com/facebookresearch/llama) +model on a CPU with good performance using full precision, f16 or 4-bit +quantized versions of the model. [![Latest version](https://img.shields.io/crates/v/llama-rs.svg)](https://crates.io/crates/llama_rs) ![MIT/Apache2](https://shields.io/badge/license-MIT%2FApache--2.0-blue) [![Discord](https://img.shields.io/discord/1085885067601137734)](https://discord.gg/YB9WaXYAWU) -![Gif showcasing language generation using llama-rs](./doc/resources/llama_gif.gif) - -**LLaMA-rs** is a Rust port of the -[llama.cpp](https://github.com/ggerganov/llama.cpp) project. This allows running -inference for Facebook's [LLaMA](https://github.com/facebookresearch/llama) -model on a CPU with good performance using full precision, f16 or 4-bit -quantized versions of the model. +![A llama riding a crab, AI-generated](./doc/resources/logo2.png) -Just like its C++ counterpart, it is powered by the -[`ggml`](https://github.com/ggerganov/ggml) tensor library, achieving the same -performance as the original code. +> _Image by [@darthdeus](https://github.com/darthdeus/), using Stable Diffusion_ ## Getting started @@ -52,6 +43,8 @@ to install `bloom-cli` and `llama-cli` to your Cargo `bin` directory, which The CLI applications can then be run through `bloom-cli` and `llama-cli`, respectively. +![Gif showcasing language generation using llama-rs](./doc/resources/llama_gif.gif) + ### Building from repository Clone the repository, and then build it through @@ -79,7 +72,8 @@ are required. #### From Hugging Face Compatible weights - not necessarily the original LLaMA weights - can be found -on [Hugging Face by searching for GGML](https://huggingface.co/models?search=ggml). 
At present, LLaMA-architecture models are supported. +on [Hugging Face by searching for GGML](https://huggingface.co/models?search=ggml). +At present, LLaMA-architecture models are supported. #### LLaMA original weights @@ -143,9 +137,9 @@ Some additional things to try: ![Gif showcasing alpaca repl mode](./doc/resources/alpaca_repl_screencap.gif) -- Sessions can be loaded (`--load-session`) or saved (`--save-session`) to file. To automatically load - and save the same session, use `--persist-session`. This can be used to cache prompts to reduce load - time, too: +- Sessions can be loaded (`--load-session`) or saved (`--save-session`) to file. + To automatically load and save the same session, use `--persist-session`. + This can be used to cache prompts to reduce load time, too: ![Gif showcasing prompt caching](./doc/resources/prompt_caching_screencap.gif) From 1abaa41defac3c4ce268804675668f2a15652a28 Mon Sep 17 00:00:00 2001 From: Dan Forbes Date: Fri, 28 Apr 2023 18:16:26 -0700 Subject: [PATCH 26/35] Rebase fix --- README.md | 3 ++- llama/src/old_loader.rs | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 129cb07c..f19147c4 100644 --- a/README.md +++ b/README.md @@ -47,9 +47,10 @@ The CLI applications can then be run through `bloom-cli` and `llama-cli`, respec ### Building from repository -Clone the repository, and then build it through +Clone the repository and then build it with ```shell +git clone --recurse-submodules git@github.com:rustformers/llama-rs.git cargo build --release ``` diff --git a/llama/src/old_loader.rs b/llama/src/old_loader.rs index 1e1bc3e2..271279fa 100644 --- a/llama/src/old_loader.rs +++ b/llama/src/old_loader.rs @@ -13,7 +13,7 @@ use std::{ use crate::Hyperparameters; use crate::{Llama, LoadError, LoadProgress, TokenId, Vocabulary}; -use llm_base::{ggml, mulf, util, ContainerType, FileType}; +use llm_base::{ggml_rs, mulf, util, ContainerType, FileType}; pub(crate) fn load( path: impl AsRef, From f994fa8d035b766349e35ceb894ccc13cddc5fab Mon Sep 17 00:00:00 2001 From: Dan Forbes Date: Wed, 26 Apr 2023 11:13:56 -0700 Subject: [PATCH 27/35] GPT2/Cerebras loading and inference --- .vscode/launch.json | 18 ++ Cargo.lock | 10 + Cargo.toml | 1 + gpt2/Cargo.toml | 13 + gpt2/examples/gpt2_inference.rs | 33 +++ gpt2/src/lib.rs | 445 ++++++++++++++++++++++++++++++++ llm-base/src/loader.rs | 13 +- 7 files changed, 527 insertions(+), 6 deletions(-) create mode 100644 gpt2/Cargo.toml create mode 100644 gpt2/examples/gpt2_inference.rs create mode 100644 gpt2/src/lib.rs diff --git a/.vscode/launch.json b/.vscode/launch.json index d13a1828..2c455637 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -4,6 +4,24 @@ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 "version": "0.2.0", "configurations": [ + { + "type": "lldb", + "request": "launch", + "name": "Debug example 'gpt2_inference'", + "cargo": { + "args": [ + "build", + "--example=gpt2_inference", + "--package=gpt2" + ], + "filter": { + "name": "gpt2_inference", + "kind": "example" + } + }, + "args": ["${env:HOME}/.ggml-models/cerebras-gpt-13b.bin"], + "cwd": "${workspaceFolder}" + }, { "type": "lldb", "request": "launch", diff --git a/Cargo.lock b/Cargo.lock index f3aa78f9..97065c15 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -463,6 +463,16 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +[[package]] +name = "gpt2" +version 
= "0.1.0"
+dependencies = [
+ "bytemuck",
+ "ggml",
+ "llm-base",
+ "rand",
+]
+
 [[package]]
 name = "half"
 version = "2.2.1"
diff --git a/Cargo.toml b/Cargo.toml
index fc60bb3c..a8edba78 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,6 +3,7 @@ members = [
     # Crates
     "ggml-rs",
     "llm-base",
+    "gpt2",
     "llama",
     "bloom",
     "llm",
diff --git a/gpt2/Cargo.toml b/gpt2/Cargo.toml
new file mode 100644
index 00000000..021dc446
--- /dev/null
+++ b/gpt2/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "gpt2"
+version = { workspace = true }
+edition = "2021"
+
+[dependencies]
+ggml = { path = "../ggml" }
+llm-base = { path = "../llm-base" }
+
+bytemuck = { workspace = true }
+
+[dev-dependencies]
+rand = { workspace = true }
diff --git a/gpt2/examples/gpt2_inference.rs b/gpt2/examples/gpt2_inference.rs
new file mode 100644
index 00000000..bf2b5feb
--- /dev/null
+++ b/gpt2/examples/gpt2_inference.rs
@@ -0,0 +1,33 @@
+use std::{convert::Infallible, env::args, io::Write};
+
+use llm_base::{snapshot, LoadError};
+
+extern crate gpt2;
+
+fn main() -> Result<(), LoadError> {
+    let args: Vec<String> = args().collect();
+    let gpt2 = gpt2::Gpt2::load(&args[1], true, 32, |_| {})?;
+    let (mut session, _) = snapshot::read_or_create_session(
+        &gpt2,
+        Default::default(),
+        Default::default(),
+        Default::default(),
+    );
+
+    let _ = session.inference_with_prompt::<Infallible>(
+        &gpt2,
+        &Default::default(),
+        "The best kind of wine is ",
+        Some(32),
+        &mut rand::thread_rng(),
+        |t| {
+            print!("{t}");
+            std::io::stdout().flush().unwrap();
+
+            Ok(())
+        },
+    );
+
+    println!();
+    Ok(())
+}
diff --git a/gpt2/src/lib.rs b/gpt2/src/lib.rs
new file mode 100644
index 00000000..8fe7332d
--- /dev/null
+++ b/gpt2/src/lib.rs
@@ -0,0 +1,445 @@
+use std::path::Path;
+
+use ggml::Tensor;
+use llm_base::{
+    util, EvaluateOutputRequest, FileType, InferenceParameters, InferenceSession,
+    InferenceSessionParameters, KnownModel, LoadError, LoadProgress, TokenId, Vocabulary,
+};
+
+pub struct Gpt2 {
+    hyperparameters: Hyperparameters,
+    n_context_tokens: usize,
+    vocabulary: Vocabulary,
+    ln_f_g: Tensor,
+    ln_f_b: Tensor,
+    wte: Tensor,
+    wpe: Tensor,
+    lm_head: Tensor,
+    layers: Vec<Layer>,
+    _context: ggml::Context,
+}
+
+impl KnownModel for Gpt2 {
+    type Hyperparameters = Hyperparameters;
+
+    fn new(
+        hyperparameters: Self::Hyperparameters,
+        n_context_tokens: usize,
+        vocabulary: Vocabulary,
+        tensor_loader: impl llm_base::TensorLoader,
+    ) -> Result {
+        let n_embd = hyperparameters.n_embd;
+        let n_layer = hyperparameters.n_layer;
+        let n_vocab = hyperparameters.n_vocab;
+        let n_ctx = hyperparameters.n_ctx;
+
+        let mut tl = tensor_loader;
+        // prepare memory for weights
+        let ln_f_g = tl.load("model/ln_f/g", &[n_embd])?;
+        let ln_f_b = tl.load("model/ln_f/b", &[n_embd])?;
+        let wte = tl.load("model/wte", &[n_embd, n_vocab])?;
+        let wpe = tl.load("model/wpe", &[n_embd, n_ctx])?;
+        let lm_head = tl.load("model/lm_head", &[n_embd, n_vocab])?;
+
+        let mut layers = Vec::new();
+        for i in 0..n_layer {
+            let layer = Layer {
+                ln_1_g: tl.load(&format!("model/h{i}/ln_1/g"), &[n_embd])?,
+                ln_1_b: tl.load(&format!("model/h{i}/ln_1/b"), &[n_embd])?,
+                ln_2_g: tl.load(&format!("model/h{i}/ln_2/g"), &[n_embd])?,
+                ln_2_b: tl.load(&format!("model/h{i}/ln_2/b"), &[n_embd])?,
+                c_attn_attn_w: tl
+                    .load(&format!("model/h{i}/attn/c_attn/w"), &[n_embd, n_embd * 3])?,
+                c_attn_attn_b: tl.load(&format!("model/h{i}/attn/c_attn/b"), &[n_embd * 3])?,
+                c_attn_proj_w: tl.load(&format!("model/h{i}/attn/c_proj/w"), &[n_embd, n_embd])?,
+                c_attn_proj_b: tl.load(&format!("model/h{i}/attn/c_proj/b"), &[n_embd])?,
tl.load(&format!("model/h{i}/attn/c_proj/b"), &[n_embd])?, + c_mlp_fc_w: tl.load(&format!("model/h{i}/mlp/c_fc/w"), &[n_embd, n_embd * 4])?, + c_mlp_fc_b: tl.load(&format!("model/h{i}/mlp/c_fc/b"), &[n_embd * 4])?, + c_mlp_proj_w: tl + .load(&format!("model/h{i}/mlp/c_proj/w"), &[n_embd * 4, n_embd])?, + c_mlp_proj_b: tl.load(&format!("model/h{i}/mlp/c_proj/b"), &[n_embd])?, + }; + + layers.push(layer); + } + + let (_context, _, _mmap) = tl.finish(); + + Ok(Gpt2 { + hyperparameters, + n_context_tokens, + vocabulary, + layers, + ln_f_g, + ln_f_b, + wte, + wpe, + lm_head, + _context, + }) + } + + fn start_session(&self, params: InferenceSessionParameters) -> InferenceSession { + InferenceSession::new( + params, + self.hyperparameters.n_ctx, + self.hyperparameters.n_layer, + self.hyperparameters.n_embd, + self.hyperparameters.n_vocab, + ) + } + + fn evaluate( + &self, + session: &mut InferenceSession, + params: &InferenceParameters, + input_tokens: &[TokenId], + output_request: &mut EvaluateOutputRequest, + ) { + let n = input_tokens.len(); + let n_threads = params.n_threads; + + let Hyperparameters { + n_embd, + n_head, + n_vocab, + n_layer, + .. + } = self.hyperparameters; + let n_ctx = self.n_context_tokens; + + // For the first run, we need to guess a maximum buffer size so we can measure + // the actual memory consumption of the temporary ggml context. + // + // These numbers are from `llama.cpp`, and could potentially be more efficient. + let mut buf_size = { + let buf_size_mb = if n_layer >= 80 { + 1536 + } else if n_layer >= 60 { + 1280 + } else { + 1024 + }; + buf_size_mb * 1024 * 1024 + }; + if session.mem_per_token > 0 && session.mem_per_token * n > buf_size { + // add 10% to account for ggml object overhead + buf_size = (1.1f64 * session.mem_per_token as f64 * n as f64) as usize; + }; + let ctx0 = ggml::Context::init(buf_size, true); + + let mut gf = ggml::ComputationGraph::new(n_threads); + + let mut embd = ctx0.new_tensor_1d(ggml::Type::I32, n); + unsafe { embd.write_data(bytemuck::cast_slice(input_tokens)) }; + + let n_past = session.n_past; + + let mut position_buf = vec![]; + for position_idx in 0..n { + position_buf.push(n_past + position_idx); + } + + let mut position = ctx0.new_tensor_1d(ggml::Type::I32, n); + unsafe { position.write_data(bytemuck::cast_slice(&position_buf)) }; + + let mut input_layer = ctx0.op_add( + &ctx0.op_get_rows(&self.wte, &embd), + &ctx0.op_get_rows(&self.wpe, &position), + ); + + let memory_k = &session.memory_k; + let memory_k_size = memory_k.element_size(); + + let memory_v = &session.memory_v; + let memory_v_size = memory_v.element_size(); + + for il in 0..n_layer { + // norm + let mut current = ctx0.op_norm(&input_layer); + current = ctx0.op_add( + &ctx0.op_mul(&ctx0.op_repeat(&self.layers[il].ln_1_g, ¤t), ¤t), + &ctx0.op_repeat(&self.layers[il].ln_1_b, ¤t), + ); + + // attn + current = ctx0.op_mul_mat(&self.layers[il].c_attn_attn_w, ¤t); + current = ctx0.op_add( + &ctx0.op_repeat(&self.layers[il].c_attn_attn_b, ¤t), + ¤t, + ); + + // self-attn + let nb = current.get_nb()[1]; + let f32_size = std::mem::size_of::(); + let qcur = ctx0.op_view_2d(¤t, (n_embd, n), nb, 0); + let kcur = ctx0.op_view_2d(¤t, (n_embd, n), nb, f32_size * n_embd); + let vcur = ctx0.op_view_2d(¤t, (n_embd, n), nb, f32_size * n_embd * 2); + + if n >= 1 { + let k = ctx0.op_view_1d( + memory_k, + n * n_embd, + (memory_k_size * n_embd) * (il * n_ctx + n_past), + ); + let v = ctx0.op_view_1d( + memory_v, + n * n_embd, + (memory_v_size * n_embd) * (il * n_ctx + n_past), + ); + 
+ gf.build_forward_expand(&ctx0.op_cpy(&kcur, &k)); + gf.build_forward_expand(&ctx0.op_cpy(&vcur, &v)); + } + + let q = ctx0.op_permute( + &ctx0.op_cpy( + &qcur, + &ctx0.new_tensor_3d(ggml::Type::F32, n_embd / n_head, n_head, n), + ), + 0, + 2, + 1, + 3, + ); + + let k = ctx0.op_permute( + &ctx0.op_reshape_3d( + &ctx0.op_view_1d( + &session.memory_k, + (n_past + n) * n_embd, + il * n_ctx * memory_k_size * n_embd, + ), + n_embd / n_head, + n_head, + n_past + n, + ), + 0, + 2, + 1, + 3, + ); + + let kq = ctx0.op_mul_mat(&k, &q); + let kq_scaled = ctx0.op_scale( + &kq, + &ctx0.new_f32(1f32 / f32::sqrt(n_embd as f32 / n_head as f32)), + ); + + let kq_masked = ctx0.op_diag_mask_inf(&kq_scaled, n_past); + let kq_softmax = ctx0.op_soft_max(&kq_masked); + + let v_trans = ctx0.op_cpy( + &ctx0.op_permute( + &ctx0.op_reshape_3d( + &ctx0.op_view_1d( + memory_v, + (n_past + n) * n_embd, + il * n_ctx * memory_v_size * n_embd, + ), + n_embd / n_head, + n_head, + n_past + n, + ), + 1, + 2, + 0, + 3, + ), + &ctx0.new_tensor_3d(memory_v.get_type(), n_past + n, n_embd / n_head, n_head), + ); + + let kqv = ctx0.op_mul_mat(&v_trans, &kq_softmax); + let kqv_merged = ctx0.op_permute(&kqv, 0, 2, 1, 3); + + current = ctx0.op_cpy(&kqv_merged, &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, n)); + + // projection + current = ctx0.op_mul_mat(&self.layers[il].c_attn_proj_w, ¤t); + current = ctx0.op_add( + &ctx0.op_repeat(&self.layers[il].c_attn_proj_b, ¤t), + ¤t, + ); + + // add input + current = ctx0.op_add(¤t, &input_layer); + + // feed-forward + let ff_in = current.share(); + + // feed-forward normalization + current = ctx0.op_norm(&ff_in); + current = ctx0.op_add( + &ctx0.op_mul(&ctx0.op_repeat(&self.layers[il].ln_2_g, ¤t), ¤t), + &ctx0.op_repeat(&self.layers[il].ln_2_b, ¤t), + ); + + // feed-forward fully connected + current = ctx0.op_mul_mat(&self.layers[il].c_mlp_fc_w, ¤t); + current = ctx0.op_add( + &ctx0.op_repeat(&self.layers[il].c_mlp_fc_b, ¤t), + ¤t, + ); + + // feed-forward activation + current = ctx0.op_gelu(¤t); + + // feed-forward projection + current = ctx0.op_mul_mat(&self.layers[il].c_mlp_proj_w, ¤t); + current = ctx0.op_add( + &ctx0.op_repeat(&self.layers[il].c_mlp_proj_b, ¤t), + ¤t, + ); + + // input for next layer + input_layer = ctx0.op_add(¤t, &ff_in); + } + + // normalization + input_layer = ctx0.op_norm(&input_layer); + input_layer = ctx0.op_add( + &ctx0.op_mul(&ctx0.op_repeat(&self.ln_f_g, &input_layer), &input_layer), + &ctx0.op_repeat(&self.ln_f_b, &input_layer), + ); + + input_layer = ctx0.op_mul_mat(&self.lm_head, &input_layer); + + // run the computation + gf.build_forward_expand(&input_layer); + ctx0.graph_compute(&mut gf); + + // return result for just the last token + // SAFETY: yolo + assert_eq!(session.last_logits.len(), n_vocab); + unsafe { + input_layer.read_data( + n_vocab * (n - 1) * std::mem::size_of::(), + bytemuck::cast_slice_mut(&mut session.last_logits), + ) + }; + + // Extract logits + if let Some(all_logits) = &mut output_request.all_logits { + all_logits.resize(n_vocab * n, 0.0); + // SAFETY: Tensor data can be read (properly aligned, initialized, + // data will not be mutated or otherwise aliased during the copy), + // and we're not reading past the end of the tensor data. 
+ assert_eq!(input_layer.nelements(), n_vocab * n); + unsafe { + input_layer.read_data(0, bytemuck::cast_slice_mut(all_logits)); + } + } + + // Extract embeddings + if let Some(embeddings) = &mut output_request.embeddings { + embeddings.resize(n_embd * n, 0.0); + // SAFETY: Same rationale as for the "Extract logits" section applies. + assert_eq!(embd.nelements(), n_embd * n); + unsafe { + embd.read_data(0, bytemuck::cast_slice_mut(embeddings)); + } + } + + // Adjust the required memory per token if we didn't know that already + if session.mem_per_token == 0 { + session.mem_per_token = ctx0.used_mem() / n; + } + + // Adjust n_past to new length. + session.n_past += input_tokens.len(); + } + + fn vocabulary(&self) -> &Vocabulary { + &self.vocabulary + } + + fn n_ctx(&self) -> usize { + self.hyperparameters.n_ctx + } +} + +impl Gpt2 { + /// Load the model from `path` with `n_context_tokens` context tokens. + /// + /// The status of the loading process will be reported through `load_progress_callback`. + pub fn load( + path: impl AsRef, + prefer_mmap: bool, + n_context_tokens: usize, + load_progress_callback: impl FnMut(LoadProgress), + ) -> Result { + llm_base::load(path, prefer_mmap, n_context_tokens, load_progress_callback) + } +} + +/// The hyperparameters of the model. +#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)] +pub struct Hyperparameters { + /// n_vocab + n_vocab: usize, + /// n_ctx + n_ctx: usize, + /// n_embd + n_embd: usize, + /// n_head + n_head: usize, + /// n_layer + n_layer: usize, + /// file type + file_type: FileType, +} +impl llm_base::Hyperparameters for Hyperparameters { + fn read(reader: &mut dyn std::io::BufRead) -> Result { + let hyperparameters = Hyperparameters { + n_vocab: util::read_i32(reader)?.try_into()?, + n_ctx: util::read_i32(reader)?.try_into()?, + n_embd: util::read_i32(reader)?.try_into()?, + n_head: util::read_i32(reader)?.try_into()?, + n_layer: util::read_i32(reader)?.try_into()?, + file_type: { + let ftype = util::read_i32(reader)?; + FileType::try_from(ftype).map_err(|_| LoadError::UnsupportedFileType(ftype))? + }, + }; + + let n_vocab = util::read_i32(reader)? as usize; + if hyperparameters.n_vocab != n_vocab { + return Err(LoadError::InvariantBroken { + path: None, + invariant: format!( + "GPT2 model expected n_vocab {} found {}", + hyperparameters.n_vocab, n_vocab + ), + }); + } + + Ok(hyperparameters) + } + + fn n_vocabulary(&self) -> usize { + self.n_vocab + } +} + +struct Layer { + // normalization + ln_1_g: Tensor, + ln_1_b: Tensor, + + ln_2_g: Tensor, + ln_2_b: Tensor, + + // attention + c_attn_attn_w: Tensor, + c_attn_attn_b: Tensor, + + c_attn_proj_w: Tensor, + c_attn_proj_b: Tensor, + + // mlp + c_mlp_fc_w: Tensor, + c_mlp_fc_b: Tensor, + + c_mlp_proj_w: Tensor, + c_mlp_proj_b: Tensor, +} diff --git a/llm-base/src/loader.rs b/llm-base/src/loader.rs index 9fd22d99..f67c700c 100644 --- a/llm-base/src/loader.rs +++ b/llm-base/src/loader.rs @@ -213,7 +213,7 @@ pub enum LoadError { #[error("invariant broken: {invariant} in {path:?}")] InvariantBroken { /// The path that failed. - path: PathBuf, + path: Option, /// The invariant that was broken. 
invariant: String, }, @@ -269,9 +269,10 @@ impl LoadError { ftype, } } - FormatLoadError::InvariantBroken(invariant) => { - LoadError::InvariantBroken { path, invariant } - } + FormatLoadError::InvariantBroken(invariant) => LoadError::InvariantBroken { + path: Some(path), + invariant, + }, } } } @@ -368,7 +369,7 @@ pub fn load( let dims = ne.len(); if dims != info.n_dims { return Err(LoadError::InvariantBroken { - path: self.path.clone(), + path: Some(self.path.clone()), invariant: format!( "the tensor {name} should have {} dimensions, not {dims}", info.n_dims @@ -383,7 +384,7 @@ pub fn load( 3 => ctx.new_tensor_3d(info.element_type, ne[0], ne[1], ne[2]), _ => { return Err(LoadError::InvariantBroken { - path: self.path.clone(), + path: Some(self.path.clone()), invariant: format!( "the tensor {name} had an unsupported dimension count: {ne:?}" ), From ff99a80ccae929e4c8a4435b952b1cb8a922cbe6 Mon Sep 17 00:00:00 2001 From: Dan Forbes Date: Sun, 30 Apr 2023 07:48:40 -0700 Subject: [PATCH 28/35] Rebase & remove BLOOM --- .vscode/launch.json | 18 -- Cargo.lock | 14 +- Cargo.toml | 1 - README.md | 34 +- bloom/Cargo.toml | 15 - bloom/examples/bloom_inference.rs | 42 --- bloom/src/lib.rs | 516 ------------------------------ gpt2/Cargo.toml | 2 +- gpt2/examples/gpt2_inference.rs | 39 ++- gpt2/src/lib.rs | 19 +- llm-cli/src/cli_args.rs | 6 +- llm/Cargo.toml | 6 +- llm/src/lib.rs | 4 +- 13 files changed, 62 insertions(+), 654 deletions(-) delete mode 100644 bloom/Cargo.toml delete mode 100644 bloom/examples/bloom_inference.rs delete mode 100644 bloom/src/lib.rs diff --git a/.vscode/launch.json b/.vscode/launch.json index 2c455637..f746153f 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -39,24 +39,6 @@ }, "args": ["${env:HOME}/.ggml-models/gpt4all-7b.bin"], "cwd": "${workspaceFolder}" - }, - { - "type": "lldb", - "request": "launch", - "name": "Debug example 'bloom_inference'", - "cargo": { - "args": [ - "build", - "--example=bloom_inference", - "--package=bloom" - ], - "filter": { - "name": "bloom_inference", - "kind": "example" - } - }, - "args": ["${env:HOME}/.ggml-models/bloom-7b.bin"], - "cwd": "${workspaceFolder}" } ] } \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 97065c15..dea7d177 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -130,16 +130,6 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" -[[package]] -name = "bloom" -version = "0.1.0" -dependencies = [ - "bytemuck", - "ggml-rs", - "llm-base", - "rand", -] - [[package]] name = "bytemuck" version = "1.13.1" @@ -468,7 +458,7 @@ name = "gpt2" version = "0.1.0" dependencies = [ "bytemuck", - "ggml", + "ggml-rs", "llm-base", "rand", ] @@ -628,7 +618,7 @@ dependencies = [ name = "llm" version = "0.1.0" dependencies = [ - "bloom", + "gpt2", "llama", "llm-base", ] diff --git a/Cargo.toml b/Cargo.toml index a8edba78..67a108a5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,7 +5,6 @@ members = [ "llm-base", "gpt2", "llama", - "bloom", "llm", "llm-cli", ] diff --git a/README.md b/README.md index f19147c4..67672172 100644 --- a/README.md +++ b/README.md @@ -21,11 +21,11 @@ quantized versions of the model. Make sure you have a Rust 1.65.0 or above and C toolchain[^1] set up. -`llm-base`, `bloom`, and `llama` are Rust libraries, while `bloom-cli` and -`llama-cli` are a CLI applications that wrap `bloom` and `llama`, respectively, -and offer basic inference capabilities. 
+`llm-base`, `gpt2`, and `llama` are Rust libraries, while `llm-cli` is a CLI
+application that wraps `gpt2` and `llama` and offers basic inference
+capabilities.

-The following instructions explain how to build the CLI applications.
+The following instructions explain how to build the CLI application.

**NOTE**: For best results, make sure to build and run in
release mode. Debug builds are going to be very slow.
@@ -35,13 +35,13 @@ Debug builds are going to be very slow.

Run

```shell
-cargo install --git https://github.com/rustformers/llama-rs bloom-cli llama-cli
+cargo install --git https://github.com/rustformers/llama-rs llm-cli
```

-to install `bloom-cli` and `llama-cli` to your Cargo `bin` directory, which
-`rustup` is likely to have added to your `PATH`.
+to install `llm-cli` to your Cargo `bin` directory, which `rustup` is likely to
+have added to your `PATH`.

-The CLI applications can then be run through `bloom-cli` and `llama-cli`, respectively.
+The CLI application can then be run through `llm-cli`.

![Gif showcasing language generation using llama-rs](./doc/resources/llama_gif.gif)

@@ -54,13 +54,12 @@ git clone --recurse-submodules git@github.com:rustformers/llama-rs.git
cargo build --release
```

-The resulting binaries will be at `target/release/bloom-cli[.exe]` and
-`target/release/llama-cli[.exe]`, respectively.
+The resulting binary will be at `target/release/llm-cli[.exe]`.

-They can also be run directly through Cargo, using
+It can also be run directly through Cargo, using

```shell
-cargo run --release --bin {bloom,llama}-cli --
+cargo run --release --bin llm-cli --
```

This is useful for development.

@@ -104,13 +103,12 @@ cargo run -p llama-cli quantize /path/to/your/models/7B/ggml-model-f16.bin /path

> The [llama.cpp repository](https://github.com/ggerganov/llama.cpp) has
> additional information on how to obtain and run specific models.

-### BLOOM
+### GPT2

-The open-source [BLOOM](https://bigscience.huggingface.co/blog/bloom) model is
-also supported.
-[More information](https://huggingface.co/docs/transformers/model_doc/bloom)
-about BLOOM is available on HuggingFace, as are some
-[quantized models](https://huggingface.co/models?search=bloom%20ggml).
+OpenAI's [GPT-2](https://jalammar.github.io/illustrated-gpt2/) architecture is
+also supported. The open-source family of
+[Cerebras](https://www.cerebras.net/blog/cerebras-gpt-a-family-of-open-compute-efficient-large-language-models/)
+models is built on this architecture.

_Support for other open source models is currently planned.
For models where weights can be legally distributed, this section will be updated with scripts to diff --git a/bloom/Cargo.toml b/bloom/Cargo.toml deleted file mode 100644 index 884ab244..00000000 --- a/bloom/Cargo.toml +++ /dev/null @@ -1,15 +0,0 @@ -[package] -name = "bloom" -version = { workspace = true } -edition = "2021" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -ggml-rs = { path = "../ggml-rs" } -llm-base = { path = "../llm-base" } - -bytemuck = { workspace = true } - -[dev-dependencies] -rand = { workspace = true } diff --git a/bloom/examples/bloom_inference.rs b/bloom/examples/bloom_inference.rs deleted file mode 100644 index 8ca2435b..00000000 --- a/bloom/examples/bloom_inference.rs +++ /dev/null @@ -1,42 +0,0 @@ -use std::{convert::Infallible, env::args, io::Write}; - -use llm_base::{load_progress_callback, model::KnownModel}; - -extern crate bloom; - -fn main() { - let args: Vec = args().collect(); - let loc = &args[1]; - let prompt = match &args.len() { - 3 => &args[2], - _ => "Rust is a cool programming language because ", - }; - - println!(" >>> Loading model from {loc}..."); - let now = std::time::Instant::now(); - - let bloom = bloom::Bloom::load(loc, true, 512, load_progress_callback) - .unwrap_or_else(|e| panic!("Error loading model from {loc}: {e}")); - - println!(" >>> Model loaded in {} ms.", now.elapsed().as_millis()); - - let mut session = bloom.start_session(Default::default()); - let res = session.inference_with_prompt::( - &bloom, - &Default::default(), - prompt, - None, - &mut rand::thread_rng(), - |t| { - print!("{t}"); - std::io::stdout().flush().unwrap(); - - Ok(()) - }, - ); - - match res { - Ok(result) => println!("\n\nInference stats:\n{result}"), - Err(err) => println!("\n{err}"), - } -} diff --git a/bloom/src/lib.rs b/bloom/src/lib.rs deleted file mode 100644 index a0b58a1f..00000000 --- a/bloom/src/lib.rs +++ /dev/null @@ -1,516 +0,0 @@ -use std::path::Path; - -// use ggml_loader::{LoadError, LoadProgress}; -use llm_base::{ - util, EvaluateOutputRequest, FileType, InferenceParameters, InferenceSession, - InferenceSessionParameters, KnownModel, LoadError, LoadProgress, Mmap, TokenId, Vocabulary, -}; - -/// The weights for the BLOOM model. All the mutable state is split into a -/// separate struct `InferenceSession`. -pub struct Bloom { - hyperparameters: Hyperparameters, - n_context_tokens: usize, - - vocabulary: Vocabulary, - tok_embeddings: ggml_rs::Tensor, - norm: ggml_rs::Tensor, - norm_b: ggml_rs::Tensor, - output_norm: ggml_rs::Tensor, - output_norm_b: ggml_rs::Tensor, - output: ggml_rs::Tensor, - layers: Vec, - - // Must be kept alive for the model - _context: ggml_rs::context::Context, - _mmap: Option, -} - -impl Bloom { - /// Load the model from `path` with `n_context_tokens` context tokens. - /// - /// The status of the loading process will be reported through `load_progress_callback`. 
- pub fn load( - path: impl AsRef, - prefer_mmap: bool, - n_context_tokens: usize, - load_progress_callback: impl FnMut(LoadProgress), - ) -> Result { - llm_base::load(path, prefer_mmap, n_context_tokens, load_progress_callback) - } -} - -impl KnownModel for Bloom { - type Hyperparameters = Hyperparameters; - - fn new( - hyperparameters: Self::Hyperparameters, - n_context_tokens: usize, - vocabulary: Vocabulary, - tensor_loader: impl llm_base::TensorLoader, - ) -> Result { - let n_embd = hyperparameters.n_embd; - let n_layer = hyperparameters.n_layer; - let n_vocab = hyperparameters.n_vocab; - let n_mult = hyperparameters.n_mult; - let n_ff = ((4 * n_embd + n_mult - 1) / n_mult) * n_mult; - - let mut tl = tensor_loader; - - let tok_embeddings = tl.load("tok_embeddings.weight", &[n_embd, n_vocab])?; - - let norm = tl.load("norm.weight", &[n_embd])?; - let norm_b = tl.load("norm.bias", &[n_embd])?; - - let output_norm = tl.load("output_norm.weight", &[n_embd])?; - let output_norm_b = tl.load("output_norm.bias", &[n_embd])?; - - let output = tl.load("output.weight", &[n_embd, n_vocab])?; - - let mut layers = Vec::new(); - for i in 0..n_layer { - let layer = Layer { - attention_norm: tl.load(&format!("layers.{i}.attention_norm.weight"), &[n_embd])?, - attention_norm_b: tl.load(&format!("layers.{i}.attention_norm.bias"), &[n_embd])?, - - query_key_value: tl.load( - &format!("layers.{i}.attention.query_key_value.weight"), - &[n_embd, 3 * n_embd], - )?, - query_key_value_b: tl.load( - &format!("layers.{i}.attention.query_key_value.bias"), - &[3 * n_embd], - )?, - - wo: tl.load( - &format!("layers.{i}.attention.wo.weight"), - &[n_embd, n_embd], - )?, - wo_b: tl.load(&format!("layers.{i}.attention.wo.bias"), &[n_embd])?, - - ffn_norm: tl.load(&format!("layers.{i}.ffn_norm.weight"), &[n_embd])?, - ffn_norm_b: tl.load(&format!("layers.{i}.ffn_norm.bias"), &[n_embd])?, - - w1: tl.load( - &format!("layers.{i}.feed_forward.w1.weight"), - &[n_embd, n_ff], - )?, - w1_b: tl.load(&format!("layers.{i}.feed_forward.w1.bias"), &[n_ff])?, - w2: tl.load( - &format!("layers.{i}.feed_forward.w2.weight"), - &[n_ff, n_embd], - )?, - w2_b: tl.load(&format!("layers.{i}.feed_forward.w2.bias"), &[n_embd])?, - }; - - layers.push(layer); - } - - let (_context, _, _mmap) = tl.finish(); - - Ok(Bloom { - hyperparameters, - n_context_tokens, - vocabulary, - tok_embeddings, - norm, - norm_b, - output_norm, - output_norm_b, - output, - layers, - _context, - _mmap, - }) - } - - fn start_session(&self, params: InferenceSessionParameters) -> InferenceSession { - InferenceSession::new( - params, - self.n_context_tokens, - self.hyperparameters.n_layer, - self.hyperparameters.n_embd, - self.hyperparameters.n_vocab, - ) - } - - fn evaluate( - &self, - session: &mut InferenceSession, - params: &InferenceParameters, - input_tokens: &[TokenId], - output_request: &mut EvaluateOutputRequest, - ) { - let n = input_tokens.len(); - let n_past = session.n_past; - let n_threads = params.n_threads; - - let Hyperparameters { - n_vocab, - n_embd, - n_mult: _, - n_head, - n_layer, - file_type: _, - } = self.hyperparameters; - let n_ctx = self.n_context_tokens; - - // For the first run, we need to guess a maximum buffer size so we can measure - // the actual memory consumption of the temporary ggml context. 
- let mut buf_size = 1024 * 1024 * 1024; - if session.mem_per_token > 0 && session.mem_per_token * n > buf_size { - // add 10% to account for ggml object overhead - buf_size = (1.1f64 * session.mem_per_token as f64 * n as f64) as usize; - }; - let ctx0 = ggml_rs::context::Context::init(buf_size, true); - - // TODO: REMAKE THIS AFTER CHECKING GGML GRAPH - let mut gf = ggml_rs::ComputationGraph::new(n_threads); - - let mut embd = ctx0.new_tensor_1d(ggml_rs::Type::I32, n); - unsafe { embd.write_data(bytemuck::cast_slice(input_tokens)) }; - - let mut input_layer = ctx0.op_get_rows(&self.tok_embeddings, &embd); - - // word embeddings norm, - { - input_layer = ctx0.op_norm(&input_layer); - input_layer = ctx0.op_mul(&ctx0.op_repeat(&self.norm, &input_layer), &input_layer); - input_layer = ctx0.op_add(&ctx0.op_repeat(&self.norm_b, &input_layer), &input_layer); - } - - for il in 0..n_layer { - let input_self_attention = input_layer.share(); - let mut current: ggml_rs::Tensor; - - // norm - { - current = ctx0.op_norm(&input_layer); - - // cur = attention_norm * cur - current = ctx0.op_mul( - &ctx0.op_repeat(&self.layers[il].attention_norm, ¤t), - ¤t, - ); - current = ctx0.op_add( - &ctx0.op_repeat(&self.layers[il].attention_norm_b, ¤t), - ¤t, - ); - } - - //attention - { - current = ctx0.op_mul_mat(&self.layers[il].query_key_value, ¤t); - current = ctx0.op_add( - &ctx0.op_repeat(&self.layers[il].query_key_value_b, ¤t), - ¤t, - ); - } - - // self-attention - { - let nb = current.get_nb()[1]; - let q_current = ctx0.op_view_2d( - ¤t, - (n_embd, n), - nb, - //0 * std::mem::size_of::() * n_embd as usize, - 0, - ); - let k_current = ctx0.op_view_2d( - ¤t, - (n_embd, n), - nb, - std::mem::size_of::() * n_embd, - ); - let v_current = ctx0.op_view_2d( - ¤t, - (n_embd, n), - nb, - 2 * std::mem::size_of::() * n_embd, - ); - - // store key and value to memory - if n >= 1 { - let k = ctx0.op_view_1d( - &session.memory_k, - n * n_embd, - (session.memory_k.element_size() * n_embd) * (il * n_ctx + n_past), - ); - - let v = ctx0.op_view_1d( - &session.memory_v, - n * n_embd, - (session.memory_v.element_size() * n_embd) * (il * n_ctx + n_past), - ); - - gf.build_forward_expand(&ctx0.op_cpy(&k_current, &k)); - gf.build_forward_expand(&ctx0.op_cpy(&v_current, &v)); - } - - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - let q = ctx0.op_permute( - &ctx0.op_cpy( - &q_current, - &ctx0.new_tensor_3d(ggml_rs::Type::F32, n_embd / n_head, n_head, n), - ), - 0, - 2, - 1, - 3, - ); - - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - let k = ctx0.op_permute( - &ctx0.op_reshape_3d( - &ctx0.op_view_1d( - &session.memory_k, - (n_past + n) * n_embd, - il * n_ctx * session.memory_k.element_size() * n_embd, - ), - n_embd / n_head, - n_head, - n_past + n, - ), - 0, - 2, - 1, - 3, - ); - - // K * Q - let k_q = ctx0.op_mul_mat(&k, &q); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - let k_q_scaled = ctx0.op_scale( - &k_q, - &ctx0.new_f32(1.0 / f32::sqrt(n_embd as f32 / n_head as f32)), - ); - - //alibi - // KQ_scaled_alibi = KQ_scaled + alibi_bias - let k_q_scaled_alibi = ctx0.op_alibi(&k_q_scaled, n_past, n_head); - - // KQ_masked = mask_past(KQ_scaled) - let k_q_masked = ctx0.op_diag_mask_inf(&k_q_scaled_alibi, n_past); - - // KQ = soft_max(KQ_masked) - let k_q_soft_max = ctx0.op_soft_max(&k_q_masked); - - let memv_elsize = session.memory_v.element_size(); - - // let v_trans = ctx0.op_permute( - // &ctx0.op_reshape_3d( - // &ctx0.op_view_1d( - // &session.memory_v, - // (n_past + n) 
* n_embd, - // il * n_ctx * memv_elsize * n_embd, - // ), - // n_embd / n_head, - // n_head, - // n_past + n, - // ), - // 1, - // 2, - // 0, - // 3, - // ); - - // // GGML_ASSERT: ggml/ggml.c:4899: !ggml_is_transposed(a) - // let k_q_v = ctx0.op_mul_mat(&v_trans, &k_q_soft_max); - - // split cached V into n_head heads - let v = ctx0.op_view_3d( - &session.memory_v, - (n_past + n, n_embd / n_head, n_head), - (n_ctx * memv_elsize, n_ctx * memv_elsize * n_embd / n_head), - il * n_ctx * memv_elsize * n_embd, - ); - - // KQV = transpose(V) * KQ_soft_max - let k_q_v = ctx0.op_mul_mat(&v, &k_q_soft_max); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - let k_q_v_merged = ctx0.op_permute(&k_q_v, 0, 2, 1, 3); - - // cur = KQV_merged.contiguous().view(n_embd, N) - current = ctx0.op_cpy( - &k_q_v_merged, - &ctx0.new_tensor_2d(ggml_rs::Type::F32, n_embd, n), - ); - - // projection - current = ctx0.op_mul_mat(&self.layers[il].wo, ¤t); - current = ctx0.op_add(&ctx0.op_repeat(&self.layers[il].wo_b, ¤t), ¤t); - } - - let input_feed_forward = ctx0.op_add(¤t, &input_self_attention); - - // feed-forward network - { - // norm - { - current = ctx0.op_norm(&input_feed_forward); - - // cur = ffn_norm*cur + ffn_norm_b - current = ctx0.op_mul( - &ctx0.op_repeat(&self.layers[il].ffn_norm, ¤t), - ¤t, - ); - - current = ctx0.op_add( - &ctx0.op_repeat(&self.layers[il].ffn_norm_b, ¤t), - ¤t, - ); - } - - current = ctx0.op_mul_mat(&self.layers[il].w1, ¤t); - - current = ctx0.op_add(&ctx0.op_repeat(&self.layers[il].w1_b, ¤t), ¤t); - - // SILU activation - - current = ctx0.op_gelu(¤t); - - current = ctx0.op_mul_mat(&self.layers[il].w2, ¤t); - - current = ctx0.op_add(&ctx0.op_repeat(&self.layers[il].w2_b, ¤t), ¤t); - } - - current = ctx0.op_add(¤t, &input_feed_forward); - - // input for next layer - input_layer = current; - } - - // Used at the end to optionally extract the embeddings. - let embeddings_tensor; - - // norm - { - input_layer = ctx0.op_norm(&input_layer); - - // inpL = norm*inpL - input_layer = ctx0.op_mul( - &ctx0.op_repeat(&self.output_norm, &input_layer), - &input_layer, - ); - - input_layer = ctx0.op_add( - &ctx0.op_repeat(&self.output_norm_b, &input_layer), - &input_layer, - ); - - embeddings_tensor = input_layer.share(); //TODO: CHECK if this is still necessary, (not in BLOOM C implementation) - } - - // lm_head - { - input_layer = ctx0.op_mul_mat(&self.output, &input_layer); - } - - // logits -> probs - // inpL = ctx0.op_soft_max(&inpL); - - // run the computation - gf.build_forward_expand(&input_layer); - ctx0.graph_compute(&mut gf); - - // return result for just the last token - // SAFETY: yolo - assert_eq!(session.last_logits.len(), { n_vocab }); - unsafe { - input_layer.read_data( - n_vocab * (n - 1) * std::mem::size_of::(), - bytemuck::cast_slice_mut(&mut session.last_logits), - ) - }; - - // Extract logits - if let Some(all_logits) = &mut output_request.all_logits { - all_logits.resize(n_vocab * n, 0.0); - // SAFETY: Tensor data can be read (properly aligned, initialized, - // data will not be mutated or otherwise aliased during the copy), - // and we're not reading past the end of the tensor data. - assert_eq!(input_layer.nelements(), n_vocab * n); - unsafe { - input_layer.read_data(0, bytemuck::cast_slice_mut(all_logits)); - } - } - - // Extract embeddings - if let Some(embeddings) = &mut output_request.embeddings { - embeddings.resize(n_embd * n, 0.0); - // SAFETY: Same rationale as for the "Extract logits" section applies. 
- assert_eq!(embeddings_tensor.nelements(), n_embd * n); - unsafe { - embeddings_tensor.read_data(0, bytemuck::cast_slice_mut(embeddings)); - } - } - - // Adjust the required memory per token if we didn't know that already - if session.mem_per_token == 0 { - session.mem_per_token = ctx0.used_mem() / n; - } - - // Adjust n_past to new length. - session.n_past += input_tokens.len(); - } - - /// Returns the vocabulary used by this model. - fn vocabulary(&self) -> &Vocabulary { - &self.vocabulary - } - - fn n_ctx(&self) -> usize { - self.n_context_tokens - } -} - -// NOTE: Field order matters! Data is laid out in the file exactly -// in this order. -#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)] -pub struct Hyperparameters { - pub n_vocab: usize, - pub n_embd: usize, - pub n_mult: usize, - pub n_head: usize, - pub n_layer: usize, - pub file_type: FileType, -} -impl llm_base::Hyperparameters for Hyperparameters { - fn read(reader: &mut dyn std::io::BufRead) -> Result { - Ok(Hyperparameters { - n_vocab: util::read_i32(reader)?.try_into()?, - n_embd: util::read_i32(reader)?.try_into()?, - n_mult: util::read_i32(reader)?.try_into()?, - n_head: util::read_i32(reader)?.try_into()?, - n_layer: util::read_i32(reader)?.try_into()?, - file_type: { - let ftype = util::read_i32(reader)?; - FileType::try_from(ftype).map_err(|_| LoadError::UnsupportedFileType(ftype))? - }, - }) - } - - fn n_vocabulary(&self) -> usize { - self.n_vocab - } -} - -struct Layer { - pub attention_norm: ggml_rs::Tensor, - pub attention_norm_b: ggml_rs::Tensor, - pub wo: ggml_rs::Tensor, - pub wo_b: ggml_rs::Tensor, - pub query_key_value: ggml_rs::Tensor, - pub query_key_value_b: ggml_rs::Tensor, - // normalization - pub ffn_norm: ggml_rs::Tensor, - pub ffn_norm_b: ggml_rs::Tensor, - // ff - pub w1: ggml_rs::Tensor, - pub w1_b: ggml_rs::Tensor, - pub w2: ggml_rs::Tensor, - pub w2_b: ggml_rs::Tensor, -} diff --git a/gpt2/Cargo.toml b/gpt2/Cargo.toml index 021dc446..8efe5b13 100644 --- a/gpt2/Cargo.toml +++ b/gpt2/Cargo.toml @@ -4,7 +4,7 @@ version = { workspace = true } edition = "2021" [dependencies] -ggml = { path = "../ggml" } +ggml-rs = { path = "../ggml-rs" } llm-base = { path = "../llm-base" } bytemuck = { workspace = true } diff --git a/gpt2/examples/gpt2_inference.rs b/gpt2/examples/gpt2_inference.rs index bf2b5feb..8db4e97b 100644 --- a/gpt2/examples/gpt2_inference.rs +++ b/gpt2/examples/gpt2_inference.rs @@ -1,24 +1,31 @@ use std::{convert::Infallible, env::args, io::Write}; -use llm_base::{snapshot, LoadError}; +use llm_base::{load_progress_callback, model::KnownModel}; extern crate gpt2; -fn main() -> Result<(), LoadError> { +fn main() { let args: Vec = args().collect(); - let bloom = gpt2::Gpt2::load(&args[1], true, 32, |_| {})?; - let (mut session, _) = snapshot::read_or_create_session( - &bloom, - Default::default(), - Default::default(), - Default::default(), - ); + let loc = &args[1]; + let prompt = match &args.len() { + 3 => &args[2], + _ => "Rust is a cool programming language because ", + }; + + println!(" >>> Loading model from {loc}..."); + let now = std::time::Instant::now(); + + let gpt2 = gpt2::Gpt2::load(loc, true, 512, load_progress_callback) + .unwrap_or_else(|e| panic!("Error loading model from {loc}: {e}")); + + println!(" >>> Model loaded in {} ms.", now.elapsed().as_millis()); - let _ = session.inference_with_prompt::( - &bloom, + let mut session = gpt2.start_session(Default::default()); + let res = session.inference_with_prompt::( + &gpt2, &Default::default(), - "The best kind of wine is ", - 
Some(32), + prompt, + None, &mut rand::thread_rng(), |t| { print!("{t}"); @@ -28,6 +35,8 @@ fn main() -> Result<(), LoadError> { }, ); - println!(); - Ok(()) + match res { + Ok(result) => println!("\n\nInference stats:\n{result}"), + Err(err) => println!("\n{err}"), + } } diff --git a/gpt2/src/lib.rs b/gpt2/src/lib.rs index 8fe7332d..6efce87c 100644 --- a/gpt2/src/lib.rs +++ b/gpt2/src/lib.rs @@ -1,6 +1,6 @@ use std::path::Path; -use ggml::Tensor; +use ggml_rs::Tensor; use llm_base::{ util, EvaluateOutputRequest, FileType, InferenceParameters, InferenceSession, InferenceSessionParameters, KnownModel, LoadError, LoadProgress, TokenId, Vocabulary, @@ -16,7 +16,7 @@ pub struct Gpt2 { wpe: Tensor, lm_head: Tensor, layers: Vec, - _context: ggml::Context, + _context: ggml_rs::context::Context, } impl KnownModel for Gpt2 { @@ -126,11 +126,11 @@ impl KnownModel for Gpt2 { // add 10% to account for ggml object overhead buf_size = (1.1f64 * session.mem_per_token as f64 * n as f64) as usize; }; - let ctx0 = ggml::Context::init(buf_size, true); + let ctx0 = ggml_rs::context::Context::init(buf_size, true); - let mut gf = ggml::ComputationGraph::new(n_threads); + let mut gf = ggml_rs::ComputationGraph::new(n_threads); - let mut embd = ctx0.new_tensor_1d(ggml::Type::I32, n); + let mut embd = ctx0.new_tensor_1d(ggml_rs::Type::I32, n); unsafe { embd.write_data(bytemuck::cast_slice(input_tokens)) }; let n_past = session.n_past; @@ -140,7 +140,7 @@ impl KnownModel for Gpt2 { position_buf.push(n_past + position_idx); } - let mut position = ctx0.new_tensor_1d(ggml::Type::I32, n); + let mut position = ctx0.new_tensor_1d(ggml_rs::Type::I32, n); unsafe { position.write_data(bytemuck::cast_slice(&position_buf)) }; let mut input_layer = ctx0.op_add( @@ -195,7 +195,7 @@ impl KnownModel for Gpt2 { let q = ctx0.op_permute( &ctx0.op_cpy( &qcur, - &ctx0.new_tensor_3d(ggml::Type::F32, n_embd / n_head, n_head, n), + &ctx0.new_tensor_3d(ggml_rs::Type::F32, n_embd / n_head, n_head, n), ), 0, 2, @@ -252,7 +252,10 @@ impl KnownModel for Gpt2 { let kqv = ctx0.op_mul_mat(&v_trans, &kq_softmax); let kqv_merged = ctx0.op_permute(&kqv, 0, 2, 1, 3); - current = ctx0.op_cpy(&kqv_merged, &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, n)); + current = ctx0.op_cpy( + &kqv_merged, + &ctx0.new_tensor_2d(ggml_rs::Type::F32, n_embd, n), + ); // projection current = ctx0.op_mul_mat(&self.layers[il].c_attn_proj_w, ¤t); diff --git a/llm-cli/src/cli_args.rs b/llm-cli/src/cli_args.rs index 87e8986f..84eedc30 100644 --- a/llm-cli/src/cli_args.rs +++ b/llm-cli/src/cli_args.rs @@ -280,8 +280,8 @@ pub enum ModelArchitecture { /// Meta's LLaMA model and derivatives (Vicuna, etc). #[default] Llama, - /// The BigScience Large Open-science Open-access Multilingual Language Model (BLOOM). - Bloom, + /// OpenAI's GPT2 architecture and derivatives (Cerebras, etc). 
+ Gpt2, } impl ModelLoad { pub fn load(&self) -> Result> { @@ -363,7 +363,7 @@ impl ModelLoad { n_context_tokens, load_progress_callback, )?), - ModelArchitecture::Bloom => Box::new(llm::load::( + ModelArchitecture::Gpt2 => Box::new(llm::load::( path, prefer_mmap, n_context_tokens, diff --git a/llm/Cargo.toml b/llm/Cargo.toml index 080de0ed..063df3a7 100644 --- a/llm/Cargo.toml +++ b/llm/Cargo.toml @@ -6,9 +6,9 @@ edition = "2021" [dependencies] llm-base = { path = "../llm-base" } llama = { path = "../llama", features = ["convert", "quantize"], optional = true } -bloom = { path = "../bloom", optional = true } +gpt2 = { path = "../gpt2", optional = true } [features] -default = ["llama", "bloom"] +default = ["llama", "gpt2"] llama = ["dep:llama"] -bloom = ["dep:bloom"] \ No newline at end of file +gpt2 = ["dep:gpt2"] \ No newline at end of file diff --git a/llm/src/lib.rs b/llm/src/lib.rs index 9e5a01bd..53431dda 100644 --- a/llm/src/lib.rs +++ b/llm/src/lib.rs @@ -5,7 +5,7 @@ pub use llm_base::{ EOT_TOKEN_ID, }; -#[cfg(feature = "bloom")] -pub use bloom::{self, Bloom}; +#[cfg(feature = "gpt2")] +pub use gpt2::{self, Gpt2}; #[cfg(feature = "llama")] pub use llama::{self, Llama}; From 454f3a9315a148943ac2940bdb6fd008b1d19961 Mon Sep 17 00:00:00 2001 From: Dan Forbes Date: Sun, 30 Apr 2023 07:58:58 -0700 Subject: [PATCH 29/35] GitHub Action should support Git submodules --- .github/workflows/rust.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 1697ea4e..bd2da4b9 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -19,6 +19,8 @@ jobs: runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v3 + with: + submodules: recursive - uses: dtolnay/rust-toolchain@stable - name: Check run: cargo check --verbose From e69d4872ae2dcdc86e043c3dc19e33623d435922 Mon Sep 17 00:00:00 2001 From: Dan Forbes Date: Sun, 30 Apr 2023 08:01:14 -0700 Subject: [PATCH 30/35] Fix binary file name in README --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 67672172..6916fd51 100644 --- a/README.md +++ b/README.md @@ -35,13 +35,13 @@ Debug builds are going to be very slow. Run ```shell -cargo install --git https://github.com/rustformers/llama-rs llm-cli +cargo install --git https://github.com/rustformers/llama-rs llm ``` -to install `llm-cli` to your Cargo `bin` directory, which `rustup` is likely to +to install `llm` to your Cargo `bin` directory, which `rustup` is likely to have added to your `PATH`. -The CLI application can then be run through `llm-cli`. +The CLI application can then be run through `llm`. ![Gif showcasing language generation using llama-rs](./doc/resources/llama_gif.gif) @@ -54,12 +54,12 @@ git clone --recurse-submodules git@github.com:rustformers/llama-rs.git cargo build --release ``` -The resulting binary will be at `target/release/llm-cli[.exe]`. +The resulting binary will be at `target/release/llm[.exe]`. It can also be run directly through Cargo, using ```shell -cargo run --release --bin llm-cli -- +cargo run --release --bin llm -- ``` This is useful for development. 
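With BLOOM removed and GPT-2 wired into the `llm` facade, the CLI now selects an architecture through `ModelArchitecture` and calls `llm::load::<llm::Gpt2>`, as shown in the `llm-cli` diff above. A minimal sketch of driving that same path from library code follows; it is not part of these patches, the model path is a placeholder, and it assumes the `llm`, `llm-base`, and `rand` crates are available as dependencies with the `KnownModel` trait in scope (exact re-exports may differ).

```rust
use std::{convert::Infallible, io::Write};

// Provides `start_session` / `inference_with_prompt`; re-exported from `llm-base` in these patches.
use llm_base::model::KnownModel;

fn main() {
    // Placeholder path to any GGML-format GPT-2/Cerebras model.
    let path = "/path/to/cerebras-gpt.bin";

    // Load through the unified `llm` facade, mirroring what the CLI does for
    // `ModelArchitecture::Gpt2`: (path, prefer_mmap, n_context_tokens, progress callback).
    let model = llm::load::<llm::Gpt2>(path, true, 512, |_| {})
        .unwrap_or_else(|e| panic!("Error loading model from {path}: {e}"));

    // Start an inference session and stream tokens for a prompt.
    let mut session = model.start_session(Default::default());
    let res = session.inference_with_prompt::<Infallible>(
        &model,
        &Default::default(),
        "Rust is a cool programming language because ",
        None, // no limit on generated tokens
        &mut rand::thread_rng(),
        |t| {
            print!("{t}");
            std::io::stdout().flush().unwrap();
            Ok(())
        },
    );

    match res {
        Ok(stats) => println!("\n\nInference stats:\n{stats}"),
        Err(err) => println!("\n{err}"),
    }
}
```

The same sketch works for the LLaMA path by swapping `llm::Gpt2` for `llm::Llama`, which is the point of routing both model types through `KnownModel` and `llm::load`.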
From 608090b329e9bdde71b2c8efc96d5cbe3185e4d4 Mon Sep 17 00:00:00 2001 From: Dan Forbes Date: Sun, 30 Apr 2023 08:14:11 -0700 Subject: [PATCH 31/35] ggml-rs -> ggml --- .gitmodules | 4 +- Cargo.lock | 8 ++-- Cargo.toml | 2 +- {ggml-rs => ggml}/Cargo.toml | 2 +- {ggml-rs => ggml}/build.rs | 0 {ggml-rs => ggml}/ggml | 0 {ggml-rs => ggml}/src/context.rs | 0 {ggml-rs => ggml}/src/lib.rs | 4 +- {ggml-rs => ggml}/src/loader.rs | 0 {ggml-rs => ggml}/src/saver.rs | 0 {ggml-rs => ggml}/src/tensor.rs | 0 {ggml-rs => ggml}/src/tests.rs | 0 {ggml-rs => ggml}/src/util.rs | 0 gpt2/Cargo.toml | 2 +- gpt2/src/lib.rs | 19 ++++---- llama/Cargo.toml | 4 +- llama/src/lib.rs | 40 ++++++++-------- llama/src/old_loader.rs | 78 +++++++++++++++---------------- llama/src/quantize.rs | 36 +++++++------- llm-base/Cargo.toml | 2 +- llm-base/src/inference_session.rs | 28 +++++------ llm-base/src/lib.rs | 4 +- llm-base/src/loader.rs | 22 ++++----- llm-base/src/util.rs | 2 +- 24 files changed, 127 insertions(+), 130 deletions(-) rename {ggml-rs => ggml}/Cargo.toml (91%) rename {ggml-rs => ggml}/build.rs (100%) rename {ggml-rs => ggml}/ggml (100%) rename {ggml-rs => ggml}/src/context.rs (100%) rename {ggml-rs => ggml}/src/lib.rs (97%) rename {ggml-rs => ggml}/src/loader.rs (100%) rename {ggml-rs => ggml}/src/saver.rs (100%) rename {ggml-rs => ggml}/src/tensor.rs (100%) rename {ggml-rs => ggml}/src/tests.rs (100%) rename {ggml-rs => ggml}/src/util.rs (100%) diff --git a/.gitmodules b/.gitmodules index 5f5d5012..4a7cb543 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ -[submodule "ggml-rs/ggml"] - path = ggml-rs/ggml +[submodule "ggml/ggml"] + path = ggml/ggml url = git@github.com:ggerganov/ggml.git diff --git a/Cargo.lock b/Cargo.lock index dea7d177..a9feb475 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -432,7 +432,7 @@ dependencies = [ ] [[package]] -name = "ggml-rs" +name = "ggml" version = "0.1.0" dependencies = [ "bindgen", @@ -458,7 +458,7 @@ name = "gpt2" version = "0.1.0" dependencies = [ "bytemuck", - "ggml-rs", + "ggml", "llm-base", "rand", ] @@ -601,7 +601,7 @@ name = "llama" version = "0.1.0" dependencies = [ "bytemuck", - "ggml-rs", + "ggml", "half", "llm-base", "partial_sort", @@ -629,7 +629,7 @@ version = "0.1.0" dependencies = [ "bincode", "bytemuck", - "ggml-rs", + "ggml", "log", "memmap2", "partial_sort", diff --git a/Cargo.toml b/Cargo.toml index 67a108a5..ac67ad16 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [workspace] members = [ # Crates - "ggml-rs", + "ggml", "llm-base", "gpt2", "llama", diff --git a/ggml-rs/Cargo.toml b/ggml/Cargo.toml similarity index 91% rename from ggml-rs/Cargo.toml rename to ggml/Cargo.toml index e2e59a65..43d64758 100644 --- a/ggml-rs/Cargo.toml +++ b/ggml/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "ggml-rs" +name = "ggml" version = { workspace = true } edition = "2021" diff --git a/ggml-rs/build.rs b/ggml/build.rs similarity index 100% rename from ggml-rs/build.rs rename to ggml/build.rs diff --git a/ggml-rs/ggml b/ggml/ggml similarity index 100% rename from ggml-rs/ggml rename to ggml/ggml diff --git a/ggml-rs/src/context.rs b/ggml/src/context.rs similarity index 100% rename from ggml-rs/src/context.rs rename to ggml/src/context.rs diff --git a/ggml-rs/src/lib.rs b/ggml/src/lib.rs similarity index 97% rename from ggml-rs/src/lib.rs rename to ggml/src/lib.rs index 5950de13..5ba74df9 100644 --- a/ggml-rs/src/lib.rs +++ b/ggml/src/lib.rs @@ -3,12 +3,12 @@ #![allow(non_snake_case)] #![allow(unused)] -//! 
`ggml-rs` is a semi-idiomatic wrapper for the `ggml` C library. +//! `ggml` is a semi-idiomatic wrapper for the `ggml` C library. //! //! It exposes a subset of operations (currently used to implement the [llama-rs](https://crates.io/crates/llama-rs) library). //! Note that it does not expose a fully-idiomatic safe Rust interface; operations that could be potentially unsafe are marked as such. //! -//! `ggml-rs` operates on a computational graph; no values will be computed until [Context::graph_compute] is executed. +//! `ggml` operates on a computational graph; no values will be computed until [Context::graph_compute] is executed. //! All [Tensor]s are nodes in this computational graph, and values cannot be retrieved until computation is completed. use std::{ diff --git a/ggml-rs/src/loader.rs b/ggml/src/loader.rs similarity index 100% rename from ggml-rs/src/loader.rs rename to ggml/src/loader.rs diff --git a/ggml-rs/src/saver.rs b/ggml/src/saver.rs similarity index 100% rename from ggml-rs/src/saver.rs rename to ggml/src/saver.rs diff --git a/ggml-rs/src/tensor.rs b/ggml/src/tensor.rs similarity index 100% rename from ggml-rs/src/tensor.rs rename to ggml/src/tensor.rs diff --git a/ggml-rs/src/tests.rs b/ggml/src/tests.rs similarity index 100% rename from ggml-rs/src/tests.rs rename to ggml/src/tests.rs diff --git a/ggml-rs/src/util.rs b/ggml/src/util.rs similarity index 100% rename from ggml-rs/src/util.rs rename to ggml/src/util.rs diff --git a/gpt2/Cargo.toml b/gpt2/Cargo.toml index 8efe5b13..021dc446 100644 --- a/gpt2/Cargo.toml +++ b/gpt2/Cargo.toml @@ -4,7 +4,7 @@ version = { workspace = true } edition = "2021" [dependencies] -ggml-rs = { path = "../ggml-rs" } +ggml = { path = "../ggml" } llm-base = { path = "../llm-base" } bytemuck = { workspace = true } diff --git a/gpt2/src/lib.rs b/gpt2/src/lib.rs index 6efce87c..b0f20c95 100644 --- a/gpt2/src/lib.rs +++ b/gpt2/src/lib.rs @@ -1,6 +1,6 @@ use std::path::Path; -use ggml_rs::Tensor; +use ggml::Tensor; use llm_base::{ util, EvaluateOutputRequest, FileType, InferenceParameters, InferenceSession, InferenceSessionParameters, KnownModel, LoadError, LoadProgress, TokenId, Vocabulary, @@ -16,7 +16,7 @@ pub struct Gpt2 { wpe: Tensor, lm_head: Tensor, layers: Vec, - _context: ggml_rs::context::Context, + _context: ggml::context::Context, } impl KnownModel for Gpt2 { @@ -126,11 +126,11 @@ impl KnownModel for Gpt2 { // add 10% to account for ggml object overhead buf_size = (1.1f64 * session.mem_per_token as f64 * n as f64) as usize; }; - let ctx0 = ggml_rs::context::Context::init(buf_size, true); + let ctx0 = ggml::context::Context::init(buf_size, true); - let mut gf = ggml_rs::ComputationGraph::new(n_threads); + let mut gf = ggml::ComputationGraph::new(n_threads); - let mut embd = ctx0.new_tensor_1d(ggml_rs::Type::I32, n); + let mut embd = ctx0.new_tensor_1d(ggml::Type::I32, n); unsafe { embd.write_data(bytemuck::cast_slice(input_tokens)) }; let n_past = session.n_past; @@ -140,7 +140,7 @@ impl KnownModel for Gpt2 { position_buf.push(n_past + position_idx); } - let mut position = ctx0.new_tensor_1d(ggml_rs::Type::I32, n); + let mut position = ctx0.new_tensor_1d(ggml::Type::I32, n); unsafe { position.write_data(bytemuck::cast_slice(&position_buf)) }; let mut input_layer = ctx0.op_add( @@ -195,7 +195,7 @@ impl KnownModel for Gpt2 { let q = ctx0.op_permute( &ctx0.op_cpy( &qcur, - &ctx0.new_tensor_3d(ggml_rs::Type::F32, n_embd / n_head, n_head, n), + &ctx0.new_tensor_3d(ggml::Type::F32, n_embd / n_head, n_head, n), ), 0, 2, @@ -252,10 +252,7 @@ 
impl KnownModel for Gpt2 { let kqv = ctx0.op_mul_mat(&v_trans, &kq_softmax); let kqv_merged = ctx0.op_permute(&kqv, 0, 2, 1, 3); - current = ctx0.op_cpy( - &kqv_merged, - &ctx0.new_tensor_2d(ggml_rs::Type::F32, n_embd, n), - ); + current = ctx0.op_cpy(&kqv_merged, &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, n)); // projection current = ctx0.op_mul_mat(&self.layers[il].c_attn_proj_w, ¤t); diff --git a/llama/Cargo.toml b/llama/Cargo.toml index 03b717c5..3a916b3a 100644 --- a/llama/Cargo.toml +++ b/llama/Cargo.toml @@ -23,11 +23,11 @@ rust_tokenizers = { version = "3.1.2", optional = true } # Used for the `quantize` feature half = { version = "2.2.1", optional = true } -ggml-rs = { path = "../ggml-rs", optional = true } +ggml = { path = "../ggml", optional = true } [features] convert = ["dep:serde_json", "dep:protobuf", "dep:rust_tokenizers"] -quantize = ["dep:half", "dep:ggml-rs"] +quantize = ["dep:half", "dep:ggml"] [dev-dependencies] rand = { workspace = true } diff --git a/llama/src/lib.rs b/llama/src/lib.rs index 50cc93e3..133cdaff 100644 --- a/llama/src/lib.rs +++ b/llama/src/lib.rs @@ -12,7 +12,7 @@ pub mod quantize; mod old_loader; -pub use llm_base::{ggml_rs, util::TokenUtf8Buffer, TokenBias, TokenId, Vocabulary}; +pub use llm_base::{ggml, util::TokenUtf8Buffer, TokenBias, TokenId, Vocabulary}; /// The weights for the LLaMA model. All the mutable state is split into a /// separate struct `InferenceSession`. @@ -25,10 +25,10 @@ pub struct Llama { vocabulary: Vocabulary, - tok_embeddings: ggml_rs::Tensor, + tok_embeddings: ggml::Tensor, - norm: ggml_rs::Tensor, - output: ggml_rs::Tensor, + norm: ggml::Tensor, + output: ggml::Tensor, layers: Vec, @@ -36,7 +36,7 @@ pub struct Llama { _mmap: Option, // Must be kept alive for the model - _context: ggml_rs::context::Context, + _context: ggml::context::Context, } unsafe impl Send for Llama {} unsafe impl Sync for Llama {} @@ -183,18 +183,18 @@ impl KnownModel for Llama { // add 10% to account for ggml object overhead buf_size = (1.1f64 * session.mem_per_token as f64 * n as f64) as usize; }; - let ctx0 = ggml_rs::context::Context::init(buf_size, true); + let ctx0 = ggml::context::Context::init(buf_size, true); - let mut gf = ggml_rs::ComputationGraph::new(n_threads); + let mut gf = ggml::ComputationGraph::new(n_threads); - let mut embd = ctx0.new_tensor_1d(ggml_rs::Type::I32, n); + let mut embd = ctx0.new_tensor_1d(ggml::Type::I32, n); unsafe { embd.write_data(bytemuck::cast_slice(input_tokens)) }; let mut input_layer = ctx0.op_get_rows(&self.tok_embeddings, &embd); for il in 0..n_layer { let input_self_attention = input_layer.share(); - let mut current: ggml_rs::Tensor; + let mut current: ggml::Tensor; ctx0.use_scratch(Some(&mut session.scratch[0])); @@ -312,7 +312,7 @@ impl KnownModel for Llama { // cur = KQV_merged.contiguous().view(n_embd, N) current = ctx0.op_cpy( &k_q_v_merged, - &ctx0.new_tensor_2d(ggml_rs::Type::F32, n_embd, n), + &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, n), ); // projection (no bias) @@ -437,7 +437,7 @@ impl Llama { /// This does *not* construct a valid model. All of the tensors are entirely /// empty. However, it can be used to determine if some code will compile. 
fn new_empty() -> Self { - let context = ggml_rs::context::Context::init(1024 * 1024, true); + let context = ggml::context::Context::init(1024 * 1024, true); let tok_embeddings = context.new_f32(0.0); let norm = context.new_f32(0.0); let output = context.new_f32(0.0); @@ -496,20 +496,20 @@ impl llm_base::Hyperparameters for Hyperparameters { } struct Layer { - attention_norm: ggml_rs::Tensor, + attention_norm: ggml::Tensor, - wq: ggml_rs::Tensor, - wk: ggml_rs::Tensor, - wv: ggml_rs::Tensor, - wo: ggml_rs::Tensor, + wq: ggml::Tensor, + wk: ggml::Tensor, + wv: ggml::Tensor, + wo: ggml::Tensor, // normalization - ffn_norm: ggml_rs::Tensor, + ffn_norm: ggml::Tensor, // ff - w1: ggml_rs::Tensor, - w2: ggml_rs::Tensor, - w3: ggml_rs::Tensor, + w1: ggml::Tensor, + w2: ggml::Tensor, + w3: ggml::Tensor, } #[cfg(test)] diff --git a/llama/src/old_loader.rs b/llama/src/old_loader.rs index 271279fa..fd38bcd0 100644 --- a/llama/src/old_loader.rs +++ b/llama/src/old_loader.rs @@ -13,7 +13,7 @@ use std::{ use crate::Hyperparameters; use crate::{Llama, LoadError, LoadProgress, TokenId, Vocabulary}; -use llm_base::{ggml_rs, mulf, util, ContainerType, FileType}; +use llm_base::{ggml, mulf, util, ContainerType, FileType}; pub(crate) fn load( path: impl AsRef, @@ -35,9 +35,9 @@ pub(crate) fn load( // Verify magic let magic = util::read_u32(&mut reader)?; let model_type: ContainerType = match magic { - ggml_rs::FILE_MAGIC_GGMF => ContainerType::Ggmf, - ggml_rs::FILE_MAGIC_GGJT => ContainerType::Ggjt, - ggml_rs::FILE_MAGIC_UNVERSIONED => ContainerType::Ggml, + ggml::FILE_MAGIC_GGMF => ContainerType::Ggmf, + ggml::FILE_MAGIC_GGJT => ContainerType::Ggjt, + ggml::FILE_MAGIC_UNVERSIONED => ContainerType::Ggml, _ => { return Err(LoadError::InvalidMagic { path: main_path.to_owned(), @@ -50,7 +50,7 @@ pub(crate) fn load( match model_type { ContainerType::Ggmf | ContainerType::Ggjt => { let _version: u32 = match util::read_u32(&mut reader)? 
{ - ggml_rs::FORMAT_VERSION => ggml_rs::FORMAT_VERSION, + ggml::FORMAT_VERSION => ggml::FORMAT_VERSION, version => { return Err(LoadError::InvalidFormatVersion { container_type: model_type, @@ -115,10 +115,10 @@ pub(crate) fn load( // floats or quantized in order to save memory and also to speed up the // computation let wtype = match hparams.file_type { - FileType::F32 => ggml_rs::Type::F32, - FileType::MostlyF16 => ggml_rs::Type::F16, - FileType::MostlyQ4_0 => ggml_rs::Type::Q4_0, - FileType::MostlyQ4_1 => ggml_rs::Type::Q4_1, + FileType::F32 => ggml::Type::F32, + FileType::MostlyF16 => ggml::Type::F16, + FileType::MostlyQ4_0 => ggml::Type::Q4_0, + FileType::MostlyQ4_1 => ggml::Type::Q4_1, _ => unimplemented!(), }; @@ -135,22 +135,22 @@ pub(crate) fn load( if alloc { let mut model_size: usize = 0; - ctx_size += mulf!(n_embd, n_vocab, ggml_rs::type_sizef(wtype)); // tok_embeddings - ctx_size += mulf!(n_embd, ggml_rs::type_sizef(ggml_rs::Type::F32)); // norm - ctx_size += mulf!(n_embd, n_vocab, ggml_rs::type_sizef(wtype)); // output + ctx_size += mulf!(n_embd, n_vocab, ggml::type_sizef(wtype)); // tok_embeddings + ctx_size += mulf!(n_embd, ggml::type_sizef(ggml::Type::F32)); // norm + ctx_size += mulf!(n_embd, n_vocab, ggml::type_sizef(wtype)); // output - model_size += mulf!(n_layer, n_embd, ggml_rs::type_sizef(ggml_rs::Type::F32)); // attention_norm + model_size += mulf!(n_layer, n_embd, ggml::type_sizef(ggml::Type::F32)); // attention_norm - model_size += mulf!(n_layer, n_embd, n_embd, ggml_rs::type_sizef(wtype)); // wq - model_size += mulf!(n_layer, n_embd, n_embd, ggml_rs::type_sizef(wtype)); // wk - model_size += mulf!(n_layer, n_embd, n_embd, ggml_rs::type_sizef(wtype)); // wv - model_size += mulf!(n_layer, n_embd, n_embd, ggml_rs::type_sizef(wtype)); // wo + model_size += mulf!(n_layer, n_embd, n_embd, ggml::type_sizef(wtype)); // wq + model_size += mulf!(n_layer, n_embd, n_embd, ggml::type_sizef(wtype)); // wk + model_size += mulf!(n_layer, n_embd, n_embd, ggml::type_sizef(wtype)); // wv + model_size += mulf!(n_layer, n_embd, n_embd, ggml::type_sizef(wtype)); // wo - model_size += mulf!(n_layer, n_embd, ggml_rs::type_sizef(ggml_rs::Type::F32)); // ffn_norm + model_size += mulf!(n_layer, n_embd, ggml::type_sizef(ggml::Type::F32)); // ffn_norm - model_size += mulf!(n_layer, n_ff, n_embd, ggml_rs::type_sizef(wtype)); // w1 - model_size += mulf!(n_layer, n_ff, n_embd, ggml_rs::type_sizef(wtype)); // w2 - model_size += mulf!(n_layer, n_ff, n_embd, ggml_rs::type_sizef(wtype)); // w3 + model_size += mulf!(n_layer, n_ff, n_embd, ggml::type_sizef(wtype)); // w1 + model_size += mulf!(n_layer, n_ff, n_embd, ggml::type_sizef(wtype)); // w2 + model_size += mulf!(n_layer, n_ff, n_embd, ggml::type_sizef(wtype)); // w3 ctx_size += model_size; } @@ -161,7 +161,7 @@ pub(crate) fn load( }; // Initialize the context - let context = ggml_rs::context::Context::init(ctx_size, alloc); + let context = ggml::context::Context::init(ctx_size, alloc); let (mmap, mmap_ptr) = if prefer_mmap && model_type.support_mmap() { let mmap = util::mmap_populate(&file)?; @@ -217,7 +217,7 @@ fn load_weights_ggmf_or_unversioned( file_offset: u64, main_path: &Path, mut load_progress_callback: impl FnMut(LoadProgress), - tensors: &mut HashMap, + tensors: &mut HashMap, ) -> Result<(), LoadError> { use std::{fs::File, io::BufReader}; @@ -269,7 +269,7 @@ fn load_weights_ggmf_or_unversioned( )?; if n_dims == 1 || n_parts == 1 { - if (nelements * bpe) / ggml_rs::blck_size(tensor.get_type()) != tensor.nbytes() { + if (nelements * 
bpe) / ggml::blck_size(tensor.get_type()) != tensor.nbytes() { return Err(LoadError::TensorWrongSize { tensor_name, path: part_path, @@ -289,7 +289,7 @@ fn load_weights_ggmf_or_unversioned( total_size += tensor.nbytes(); } else { - if (nelements * bpe) / ggml_rs::blck_size(tensor.get_type()) + if (nelements * bpe) / ggml::blck_size(tensor.get_type()) != tensor.nbytes() / n_parts { return Err(LoadError::TensorWrongSize { @@ -301,16 +301,16 @@ fn load_weights_ggmf_or_unversioned( if split_type == 0 { let np0 = ne[0]; let row_size = (usize::try_from(tensor.get_ne()[0])? - / ggml_rs::blck_size(tensor.get_type())) - * ggml_rs::type_size(tensor.get_type()); + / ggml::blck_size(tensor.get_type())) + * ggml::type_size(tensor.get_type()); assert_eq!(row_size, tensor.get_nb()[1]); for i1 in 0..ne[1] { let offset_row = i1 as usize * row_size; let offset = offset_row - + ((part_id * np0 as usize) / ggml_rs::blck_size(tensor.get_type())) - * ggml_rs::type_size(tensor.get_type()); + + ((part_id * np0 as usize) / ggml::blck_size(tensor.get_type())) + * ggml::type_size(tensor.get_type()); // SAFETY: yolo, same as original code unsafe { let ptr = tensor.data().add(offset); @@ -322,8 +322,8 @@ fn load_weights_ggmf_or_unversioned( } else { let np1 = ne[1]; let row_size = (usize::try_from(tensor.get_ne()[0])? - / ggml_rs::blck_size(tensor.get_type())) - * ggml_rs::type_size(tensor.get_type()); + / ggml::blck_size(tensor.get_type())) + * ggml::type_size(tensor.get_type()); for i1 in 0..ne[1] { let offset_row = (i1 as usize + part_id * np1 as usize) * row_size; @@ -360,7 +360,7 @@ struct TensorHeaderGgmf<'a> { nelements: usize, ne: [i64; 2], tensor_name: String, - tensor: &'a mut ggml_rs::Tensor, + tensor: &'a mut ggml::Tensor, split_type: i32, bpe: usize, } @@ -368,7 +368,7 @@ fn load_tensor_header_ggmf<'a>( n_dims: usize, reader: &mut impl BufRead, length: i32, - tensors: &'a mut HashMap, + tensors: &'a mut HashMap, path: &Path, n_parts: usize, ftype: u32, @@ -456,14 +456,14 @@ fn load_tensor_header_ggmf<'a>( } fn tensor_type_size(ftype: u32, ne: [i64; 2]) -> Option { - let ftype = ggml_rs::Type::try_from(ftype).ok()?; + let ftype = ggml::Type::try_from(ftype).ok()?; match ftype { - ggml_rs::Type::Q4_0 | ggml_rs::Type::Q4_1 => { + ggml::Type::Q4_0 | ggml::Type::Q4_1 => { assert_eq!(ne[0] % 64, 0); } _ => {} } - Some(ggml_rs::type_size(ftype)) + Some(ggml::type_size(ftype)) } fn load_weights_ggjt( @@ -471,7 +471,7 @@ fn load_weights_ggjt( mmap_base: Option<*const u8>, path: &Path, mut load_progress_callback: impl FnMut(LoadProgress), - tensors: &mut HashMap, + tensors: &mut HashMap, ) -> Result<(), LoadError> // where R: std::io::Read { @@ -561,7 +561,7 @@ fn load_weights_ggjt( fn load_tensor_ggjt_mmap( reader: &mut (impl BufRead + Seek), mmap_base: *const u8, - tensor: &mut ggml_rs::Tensor, + tensor: &mut ggml::Tensor, ) -> Result<(), LoadError> { let offset_curr = reader.stream_position()?; let offset_aligned: u64 = (offset_curr + 31) & !31; @@ -575,7 +575,7 @@ fn load_tensor_ggjt_mmap( fn load_tensor_ggjt_copy<'a>( reader: &mut (impl BufRead + Seek), - tensor: &'a mut ggml_rs::Tensor, + tensor: &'a mut ggml::Tensor, ) -> Result<(), LoadError> { let offset_curr = reader.stream_position()?; let offset_aligned: u64 = (offset_curr + 31) & !31; diff --git a/llama/src/quantize.rs b/llama/src/quantize.rs index 8a50bdb6..055def3f 100644 --- a/llama/src/quantize.rs +++ b/llama/src/quantize.rs @@ -1,12 +1,12 @@ //! Implements quantization of weights. 
use crate::{Hyperparameters, LoadError, LoadProgress}; -use ggml_rs::{ +use ggml::{ loader::TensorInfo, saver::{SaveError, SaveHandler, TensorData}, }; use half::f16; -use llm_base::{ggml_rs, util, Loader}; +use llm_base::{ggml, util, Loader}; use std::{ collections::HashMap, fs::File, @@ -29,7 +29,7 @@ pub enum QuantizeProgress<'a> { /// Size of the tensor. dims: [usize; 2], /// Type of the tensor. - element_type: ggml_rs::Type, + element_type: ggml::Type, /// Number of elements in the tensor. n_elements: usize, }, @@ -104,13 +104,13 @@ pub enum QuantizeError { #[error("invalid quantization target {element_type:?}")] InvalidQuantizationTarget { /// The quantization target. - element_type: ggml_rs::Type, + element_type: ggml::Type, }, /// The quantization process encountered an unsupported element type. #[error("unsupported element type {element_type:?}")] UnsupportedElementType { /// The element type. - element_type: ggml_rs::Type, + element_type: ggml::Type, }, } impl QuantizeError { @@ -130,11 +130,11 @@ impl QuantizeError { pub fn quantize( path_in: impl AsRef, path_out: impl AsRef, - desired_type: ggml_rs::Type, + desired_type: ggml::Type, progress_callback: impl Fn(QuantizeProgress), ) -> Result<(), QuantizeError> { // Sanity check - if !matches!(desired_type, ggml_rs::Type::Q4_0 | ggml_rs::Type::Q4_1) { + if !matches!(desired_type, ggml::Type::Q4_0 | ggml::Type::Q4_1) { return Err(QuantizeError::InvalidQuantizationTarget { element_type: desired_type, }); @@ -157,7 +157,7 @@ pub fn quantize( } } }); - ggml_rs::loader::load_model(&mut reader, &mut loader) + ggml::loader::load_model(&mut reader, &mut loader) .map_err(|err| LoadError::from_format_error(err, path_in.to_owned()))?; // Save the quantized model, quantizing as we go @@ -184,7 +184,7 @@ pub fn quantize( &mut file_in, |p| progress_callback(p), ); - ggml_rs::saver::save_model( + ggml::saver::save_model( &mut writer, &mut saver, &vocabulary, @@ -209,7 +209,7 @@ pub fn quantize( struct QuantizeSaver<'a, F: Fn(QuantizeProgress)> { // Input - quantization_type: ggml_rs::Type, + quantization_type: ggml::Type, hyperparameters: &'a Hyperparameters, tensors: &'a HashMap, source_file: &'a mut File, @@ -222,7 +222,7 @@ struct QuantizeSaver<'a, F: Fn(QuantizeProgress)> { } impl<'a, F: Fn(QuantizeProgress)> QuantizeSaver<'a, F> { fn new( - quantization_type: ggml_rs::Type, + quantization_type: ggml::Type, hyperparameters: &'a Hyperparameters, tensors: &'a HashMap, source_file: &'a mut File, @@ -270,7 +270,7 @@ impl SaveHandler for QuantizeSaver<'_, F let quantize = tensor_name.contains("weight") && tensor.n_dims == 2; let raw_data = tensor.read_data(&mut BufReader::new(&mut self.source_file))?; - if quantize && !matches!(tensor.element_type, ggml_rs::Type::F32 | ggml_rs::Type::F16) { + if quantize && !matches!(tensor.element_type, ggml::Type::F32 | ggml::Type::F16) { return Err(QuantizeError::UnsupportedElementType { element_type: tensor.element_type, }); @@ -282,11 +282,11 @@ impl SaveHandler for QuantizeSaver<'_, F (self.progress_callback)(QuantizeProgress::TensorQuantizing { name: tensor_name }); let data_f32: Vec = match tensor.element_type { - ggml_rs::Type::F32 => raw_data + ggml::Type::F32 => raw_data .chunks_exact(4) .map(|chunk| f32::from_le_bytes(chunk.try_into().unwrap())) .collect(), - ggml_rs::Type::F16 => raw_data + ggml::Type::F16 => raw_data .chunks_exact(2) .map(|chunk| { f16::from_bits(u16::from_le_bytes(chunk.try_into().unwrap())).to_f32() @@ -296,11 +296,11 @@ impl SaveHandler for QuantizeSaver<'_, F }; let result = 
match self.quantization_type { - ggml_rs::Type::Q4_0 => { - ggml_rs::quantize_q4_0(&data_f32, tensor.n_elements, tensor.dims[0]) + ggml::Type::Q4_0 => { + ggml::quantize_q4_0(&data_f32, tensor.n_elements, tensor.dims[0]) } - ggml_rs::Type::Q4_1 => { - ggml_rs::quantize_q4_1(&data_f32, tensor.n_elements, tensor.dims[0]) + ggml::Type::Q4_1 => { + ggml::quantize_q4_1(&data_f32, tensor.n_elements, tensor.dims[0]) } _ => unreachable!(), }; diff --git a/llm-base/Cargo.toml b/llm-base/Cargo.toml index 55dec81e..9e758251 100644 --- a/llm-base/Cargo.toml +++ b/llm-base/Cargo.toml @@ -7,7 +7,7 @@ rust-version = "1.65" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -ggml-rs = { path = "../ggml-rs" } +ggml = { path = "../ggml" } bytemuck = { workspace = true } rand = { workspace = true } diff --git a/llm-base/src/inference_session.rs b/llm-base/src/inference_session.rs index e3c60d29..96e14338 100644 --- a/llm-base/src/inference_session.rs +++ b/llm-base/src/inference_session.rs @@ -27,7 +27,7 @@ const SCRATCH_SIZE: usize = 512 * 1024 * 1024; /// to use it from multiple threads. pub struct InferenceSession { // Must be kept alive for the model - pub(crate) _session_ctx: ggml_rs::context::Context, + pub(crate) _session_ctx: ggml::context::Context, // Original size of the memory used to create this context. pub(crate) memory_size: usize, @@ -36,10 +36,10 @@ pub struct InferenceSession { pub(crate) params: InferenceSessionParameters, /// Memory K - pub memory_k: ggml_rs::Tensor, + pub memory_k: ggml::Tensor, /// Memory M - pub memory_v: ggml_rs::Tensor, + pub memory_v: ggml::Tensor, /// How many tokens have been fed into the model's working memory so far. pub n_past: usize, @@ -58,7 +58,7 @@ pub struct InferenceSession { /// /// The number of scratch buffers was copied from `llama.cpp`. /// There is no specific reason for this number, but one is insufficient. - pub scratch: [ggml_rs::Buffer; 2], + pub scratch: [ggml::Buffer; 2], } unsafe impl Send for InferenceSession {} impl InferenceSession { @@ -365,19 +365,19 @@ impl InferenceSession { n_ctx, n_layer, n_embd, - ggml_rs::type_sizef(params.memory_k_type.into()) + ggml::type_sizef(params.memory_k_type.into()) ); // memory_k ctx_size += mulf!( n_ctx, n_layer, n_embd, - ggml_rs::type_sizef(params.memory_v_type.into()) + ggml::type_sizef(params.memory_v_type.into()) ); // memory_v ctx_size += (5 + 10 * n_layer) * 256; // object overhead ctx_size }; - let session_ctx = ggml_rs::context::Context::init(ctx_size, true); + let session_ctx = ggml::context::Context::init(ctx_size, true); // Initialize key + value memory tensors let n_mem = n_layer * n_ctx; @@ -409,7 +409,7 @@ impl InferenceSession { } impl Clone for InferenceSession { fn clone(&self) -> Self { - let context = ggml_rs::context::Context::init(self.memory_size, true); + let context = ggml::context::Context::init(self.memory_size, true); let memory_k = context.new_tensor_1d(self.memory_k.get_type(), self.memory_k.nelements()); let memory_v = context.new_tensor_1d(self.memory_v.get_type(), self.memory_v.nelements()); @@ -569,18 +569,18 @@ pub enum ModelKVMemoryType { /// 32-bit float. 
Float32, } -impl From for ggml_rs::Type { +impl From for ggml::Type { fn from(value: ModelKVMemoryType) -> Self { match value { - ModelKVMemoryType::Float16 => ggml_rs::Type::F16, - ModelKVMemoryType::Float32 => ggml_rs::Type::F32, + ModelKVMemoryType::Float16 => ggml::Type::F16, + ModelKVMemoryType::Float32 => ggml::Type::F32, } } } -fn scratch_buffers() -> [ggml_rs::Buffer; 2] { +fn scratch_buffers() -> [ggml::Buffer; 2] { [ - ggml_rs::Buffer::new(SCRATCH_SIZE), - ggml_rs::Buffer::new(SCRATCH_SIZE), + ggml::Buffer::new(SCRATCH_SIZE), + ggml::Buffer::new(SCRATCH_SIZE), ] } diff --git a/llm-base/src/lib.rs b/llm-base/src/lib.rs index 8edc939a..871bc8db 100644 --- a/llm-base/src/lib.rs +++ b/llm-base/src/lib.rs @@ -14,8 +14,8 @@ mod inference_session; mod loader; mod vocabulary; -pub use ggml_rs; -pub use ggml_rs::Type as ElementType; +pub use ggml; +pub use ggml::Type as ElementType; pub use inference_session::{ InferenceSession, InferenceSessionParameters, InferenceSnapshot, ModelKVMemoryType, SnapshotError, diff --git a/llm-base/src/loader.rs b/llm-base/src/loader.rs index f67c700c..f3ebc082 100644 --- a/llm-base/src/loader.rs +++ b/llm-base/src/loader.rs @@ -10,8 +10,8 @@ use crate::{ util::{self, FindAllModelFilesError}, Hyperparameters, KnownModel, TokenId, Vocabulary, }; -pub use ggml_rs::ContainerType; -use ggml_rs::{ +pub use ggml::ContainerType; +use ggml::{ context::Context, loader::{LoadError as FormatLoadError, PartialHyperparameters, TensorInfo}, }; @@ -280,9 +280,9 @@ impl LoadError { /// Used by models to fetch tensors from a loader. pub trait TensorLoader { /// Loads a tensor from the loader. - fn load(&mut self, name: &str, ne: &[usize]) -> Result; + fn load(&mut self, name: &str, ne: &[usize]) -> Result; /// Finish loading the model, and extract all of the state from the loader. - fn finish(self) -> (Context, HashMap, Option); + fn finish(self) -> (Context, HashMap, Option); } /// Load an arbitrary GGML model. 
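The `load` function below is generic over any `KnownModel`: it drives a `Loader` over the file, sizes a ggml `Context` (near zero extra memory when mmap is used), and hands a `TensorLoader` to the model's `new`. A hedged calling sketch, with the model type, path, and context length as placeholders:

    use std::path::Path;

    use llm_base::model::KnownModel;

    fn example() -> Result<(), llm_base::LoadError> {
        // Placeholder path and context length; the annotation on `model`
        // selects which architecture `load` instantiates.
        let model: llama::Llama = llm_base::load(
            Path::new("./models/7B/ggml-model-q4_0.bin"),
            true, // prefer_mmap
            2048, // n_context_tokens
            |_progress| {}, // ignore LoadProgress updates
        )?;
        let _session = model.start_session(Default::default());
        Ok(())
    }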
@@ -315,7 +315,7 @@ pub fn load( let mut loader = Loader::new(load_progress_callback); - ggml_rs::loader::load_model(&mut reader, &mut loader) + ggml::loader::load_model(&mut reader, &mut loader) .map_err(|err| LoadError::from_format_error(err, path.clone()))?; let Loader { @@ -332,8 +332,8 @@ pub fn load( let ctx_size = tensors .values() .map(|ti| { - ggml_rs::Tensor::C_TYPE_SIZE - + ggml_rs::OBJECT_SIZE + ggml::Tensor::C_TYPE_SIZE + + ggml::OBJECT_SIZE + if use_mmap { 0 } else { ti.calc_size() } }) .sum::(); @@ -354,10 +354,10 @@ pub fn load( context: Context, mmap: Option, load_progress_callback: &'a mut dyn FnMut(LoadProgress), - loaded_tensors: HashMap, + loaded_tensors: HashMap, } impl TensorLoader for MmapCompatibleLoader<'_> { - fn load(&mut self, name: &str, ne: &[usize]) -> Result { + fn load(&mut self, name: &str, ne: &[usize]) -> Result { let info = self .tensors .get(name) @@ -416,7 +416,7 @@ pub fn load( Ok(tensor) } - fn finish(self) -> (Context, HashMap, Option) { + fn finish(self) -> (Context, HashMap, Option) { (self.context, self.loaded_tensors, self.mmap) } } @@ -471,7 +471,7 @@ impl Loader { } } } -impl ggml_rs::loader::LoadHandler +impl ggml::loader::LoadHandler for Loader { fn container_type(&mut self, container_type: ContainerType) -> Result<(), LoadError> { diff --git a/llm-base/src/util.rs b/llm-base/src/util.rs index 5d851e2f..90dda24a 100644 --- a/llm-base/src/util.rs +++ b/llm-base/src/util.rs @@ -1,4 +1,4 @@ -pub use ggml_rs::util::*; +pub use ggml::util::*; use std::path::{Path, PathBuf}; /// NOTE: The original code relies in promotion rules and automatic cast between From 78db42cdf62f363f415bd9fc74c9c101e05ef2b0 Mon Sep 17 00:00:00 2001 From: Dan Forbes Date: Sun, 30 Apr 2023 08:36:31 -0700 Subject: [PATCH 32/35] Add back BLOOM Co-authored-by: @hhamud <53880692+hhamud@users.noreply.github.com> --- .vscode/launch.json | 18 ++ Cargo.lock | 10 + Cargo.toml | 3 +- README.md | 14 +- bloom/Cargo.toml | 15 + bloom/examples/bloom_inference.rs | 42 +++ bloom/src/lib.rs | 516 ++++++++++++++++++++++++++++++ 7 files changed, 614 insertions(+), 4 deletions(-) create mode 100644 bloom/Cargo.toml create mode 100644 bloom/examples/bloom_inference.rs create mode 100644 bloom/src/lib.rs diff --git a/.vscode/launch.json b/.vscode/launch.json index f746153f..64c08765 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -4,6 +4,24 @@ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 "version": "0.2.0", "configurations": [ + { + "type": "lldb", + "request": "launch", + "name": "Debug example 'bloom_inference'", + "cargo": { + "args": [ + "build", + "--example=bloom_inference", + "--package=bloom" + ], + "filter": { + "name": "bloom_inference", + "kind": "example" + } + }, + "args": ["${env:HOME}/.ggml-models/bloom-7b.bin"], + "cwd": "${workspaceFolder}" + }, { "type": "lldb", "request": "launch", diff --git a/Cargo.lock b/Cargo.lock index a9feb475..7bd50f09 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -130,6 +130,16 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" +[[package]] +name = "bloom" +version = "0.1.0" +dependencies = [ + "bytemuck", + "ggml", + "llm-base", + "rand", +] + [[package]] name = "bytemuck" version = "1.13.1" diff --git a/Cargo.toml b/Cargo.toml index ac67ad16..bf235bbc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,11 +1,12 @@ [workspace] members = [ # Crates + "bloom", "ggml", - "llm-base", "gpt2", "llama", 
"llm", + "llm-base", "llm-cli", ] resolver = "2" diff --git a/README.md b/README.md index 6916fd51..031a55dc 100644 --- a/README.md +++ b/README.md @@ -21,9 +21,9 @@ quantized versions of the model. Make sure you have a Rust 1.65.0 or above and C toolchain[^1] set up. -`llm-base`, `gpt2`, and `llama` are Rust libraries, while `llm-cli` is a CLI -applications that wraps `gpt2` and `llama` and offer basic inference -capabilities. +`llm-base`, and the model crates (e.g. `bloom`, `gpt2` `llama`) are Rust +libraries, while `llm-cli` is a CLI applications that wraps the models and offer +basic inference capabilities. The following instructions explain how to build CLI applications. @@ -103,6 +103,14 @@ cargo run -p llama-cli quantize /path/to/your/models/7B/ggml-model-f16.bin /path > The [llama.cpp repository](https://github.com/ggerganov/llama.cpp) has > additional information on how to obtain and run specific models. +### BLOOM + +The open-source [BLOOM](https://bigscience.huggingface.co/blog/bloom) model is +also supported. +[More information](https://huggingface.co/docs/transformers/model_doc/bloom) +about BLOOM is available on HuggingFace, as are some +[quantized models](https://huggingface.co/models?search=bloom%20ggml). + ### GPT2 OpenAI's [GPT-2](https://jalammar.github.io/illustrated-gpt2/) architecture is diff --git a/bloom/Cargo.toml b/bloom/Cargo.toml new file mode 100644 index 00000000..2dd9b0a9 --- /dev/null +++ b/bloom/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "bloom" +version = { workspace = true } +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +ggml = { path = "../ggml" } +llm-base = { path = "../llm-base" } + +bytemuck = { workspace = true } + +[dev-dependencies] +rand = { workspace = true } diff --git a/bloom/examples/bloom_inference.rs b/bloom/examples/bloom_inference.rs new file mode 100644 index 00000000..8ca2435b --- /dev/null +++ b/bloom/examples/bloom_inference.rs @@ -0,0 +1,42 @@ +use std::{convert::Infallible, env::args, io::Write}; + +use llm_base::{load_progress_callback, model::KnownModel}; + +extern crate bloom; + +fn main() { + let args: Vec = args().collect(); + let loc = &args[1]; + let prompt = match &args.len() { + 3 => &args[2], + _ => "Rust is a cool programming language because ", + }; + + println!(" >>> Loading model from {loc}..."); + let now = std::time::Instant::now(); + + let bloom = bloom::Bloom::load(loc, true, 512, load_progress_callback) + .unwrap_or_else(|e| panic!("Error loading model from {loc}: {e}")); + + println!(" >>> Model loaded in {} ms.", now.elapsed().as_millis()); + + let mut session = bloom.start_session(Default::default()); + let res = session.inference_with_prompt::( + &bloom, + &Default::default(), + prompt, + None, + &mut rand::thread_rng(), + |t| { + print!("{t}"); + std::io::stdout().flush().unwrap(); + + Ok(()) + }, + ); + + match res { + Ok(result) => println!("\n\nInference stats:\n{result}"), + Err(err) => println!("\n{err}"), + } +} diff --git a/bloom/src/lib.rs b/bloom/src/lib.rs new file mode 100644 index 00000000..562c7a04 --- /dev/null +++ b/bloom/src/lib.rs @@ -0,0 +1,516 @@ +use std::path::Path; + +// use ggml_loader::{LoadError, LoadProgress}; +use llm_base::{ + util, EvaluateOutputRequest, FileType, InferenceParameters, InferenceSession, + InferenceSessionParameters, LoadError, LoadProgress, Mmap, KnownModel, TokenId, Vocabulary, +}; + +/// The weights for the BLOOM model. 
All the mutable state is split into a +/// separate struct `InferenceSession`. +pub struct Bloom { + hyperparameters: Hyperparameters, + n_context_tokens: usize, + + vocabulary: Vocabulary, + tok_embeddings: ggml::Tensor, + norm: ggml::Tensor, + norm_b: ggml::Tensor, + output_norm: ggml::Tensor, + output_norm_b: ggml::Tensor, + output: ggml::Tensor, + layers: Vec, + + // Must be kept alive for the model + _context: ggml::context::Context, + _mmap: Option, +} + +impl Bloom { + /// Load the model from `path` with `n_context_tokens` context tokens. + /// + /// The status of the loading process will be reported through `load_progress_callback`. + pub fn load( + path: impl AsRef, + prefer_mmap: bool, + n_context_tokens: usize, + load_progress_callback: impl FnMut(LoadProgress), + ) -> Result { + llm_base::load(path, prefer_mmap, n_context_tokens, load_progress_callback) + } +} + +impl KnownModel for Bloom { + type Hyperparameters = Hyperparameters; + + fn new( + hyperparameters: Self::Hyperparameters, + n_context_tokens: usize, + vocabulary: Vocabulary, + tensor_loader: impl llm_base::TensorLoader, + ) -> Result { + let n_embd = hyperparameters.n_embd; + let n_layer = hyperparameters.n_layer; + let n_vocab = hyperparameters.n_vocab; + let n_mult = hyperparameters.n_mult; + let n_ff = ((4 * n_embd + n_mult - 1) / n_mult) * n_mult; + + let mut tl = tensor_loader; + + let tok_embeddings = tl.load("tok_embeddings.weight", &[n_embd, n_vocab])?; + + let norm = tl.load("norm.weight", &[n_embd])?; + let norm_b = tl.load("norm.bias", &[n_embd])?; + + let output_norm = tl.load("output_norm.weight", &[n_embd])?; + let output_norm_b = tl.load("output_norm.bias", &[n_embd])?; + + let output = tl.load("output.weight", &[n_embd, n_vocab])?; + + let mut layers = Vec::new(); + for i in 0..n_layer { + let layer = Layer { + attention_norm: tl.load(&format!("layers.{i}.attention_norm.weight"), &[n_embd])?, + attention_norm_b: tl.load(&format!("layers.{i}.attention_norm.bias"), &[n_embd])?, + + query_key_value: tl.load( + &format!("layers.{i}.attention.query_key_value.weight"), + &[n_embd, 3 * n_embd], + )?, + query_key_value_b: tl.load( + &format!("layers.{i}.attention.query_key_value.bias"), + &[3 * n_embd], + )?, + + wo: tl.load( + &format!("layers.{i}.attention.wo.weight"), + &[n_embd, n_embd], + )?, + wo_b: tl.load(&format!("layers.{i}.attention.wo.bias"), &[n_embd])?, + + ffn_norm: tl.load(&format!("layers.{i}.ffn_norm.weight"), &[n_embd])?, + ffn_norm_b: tl.load(&format!("layers.{i}.ffn_norm.bias"), &[n_embd])?, + + w1: tl.load( + &format!("layers.{i}.feed_forward.w1.weight"), + &[n_embd, n_ff], + )?, + w1_b: tl.load(&format!("layers.{i}.feed_forward.w1.bias"), &[n_ff])?, + w2: tl.load( + &format!("layers.{i}.feed_forward.w2.weight"), + &[n_ff, n_embd], + )?, + w2_b: tl.load(&format!("layers.{i}.feed_forward.w2.bias"), &[n_embd])?, + }; + + layers.push(layer); + } + + let (_context, _, _mmap) = tl.finish(); + + Ok(Bloom { + hyperparameters, + n_context_tokens, + vocabulary, + tok_embeddings, + norm, + norm_b, + output_norm, + output_norm_b, + output, + layers, + _context, + _mmap, + }) + } + + fn start_session(&self, params: InferenceSessionParameters) -> InferenceSession { + InferenceSession::new( + params, + self.n_context_tokens, + self.hyperparameters.n_layer, + self.hyperparameters.n_embd, + self.hyperparameters.n_vocab, + ) + } + + fn evaluate( + &self, + session: &mut InferenceSession, + params: &InferenceParameters, + input_tokens: &[TokenId], + output_request: &mut EvaluateOutputRequest, + ) { + let 
n = input_tokens.len(); + let n_past = session.n_past; + let n_threads = params.n_threads; + + let Hyperparameters { + n_vocab, + n_embd, + n_mult: _, + n_head, + n_layer, + file_type: _, + } = self.hyperparameters; + let n_ctx = self.n_context_tokens; + + // For the first run, we need to guess a maximum buffer size so we can measure + // the actual memory consumption of the temporary ggml context. + let mut buf_size = 1024 * 1024 * 1024; + if session.mem_per_token > 0 && session.mem_per_token * n > buf_size { + // add 10% to account for ggml object overhead + buf_size = (1.1f64 * session.mem_per_token as f64 * n as f64) as usize; + }; + let ctx0 = ggml::context::Context::init(buf_size, true); + + // TODO: REMAKE THIS AFTER CHECKING GGML GRAPH + let mut gf = ggml::ComputationGraph::new(n_threads); + + let mut embd = ctx0.new_tensor_1d(ggml::Type::I32, n); + unsafe { embd.write_data(bytemuck::cast_slice(input_tokens)) }; + + let mut input_layer = ctx0.op_get_rows(&self.tok_embeddings, &embd); + + // word embeddings norm, + { + input_layer = ctx0.op_norm(&input_layer); + input_layer = ctx0.op_mul(&ctx0.op_repeat(&self.norm, &input_layer), &input_layer); + input_layer = ctx0.op_add(&ctx0.op_repeat(&self.norm_b, &input_layer), &input_layer); + } + + for il in 0..n_layer { + let input_self_attention = input_layer.share(); + let mut current: ggml::Tensor; + + // norm + { + current = ctx0.op_norm(&input_layer); + + // cur = attention_norm * cur + current = ctx0.op_mul( + &ctx0.op_repeat(&self.layers[il].attention_norm, ¤t), + ¤t, + ); + current = ctx0.op_add( + &ctx0.op_repeat(&self.layers[il].attention_norm_b, ¤t), + ¤t, + ); + } + + //attention + { + current = ctx0.op_mul_mat(&self.layers[il].query_key_value, ¤t); + current = ctx0.op_add( + &ctx0.op_repeat(&self.layers[il].query_key_value_b, ¤t), + ¤t, + ); + } + + // self-attention + { + let nb = current.get_nb()[1]; + let q_current = ctx0.op_view_2d( + ¤t, + (n_embd, n), + nb, + //0 * std::mem::size_of::() * n_embd as usize, + 0, + ); + let k_current = ctx0.op_view_2d( + ¤t, + (n_embd, n), + nb, + std::mem::size_of::() * n_embd, + ); + let v_current = ctx0.op_view_2d( + ¤t, + (n_embd, n), + nb, + 2 * std::mem::size_of::() * n_embd, + ); + + // store key and value to memory + if n >= 1 { + let k = ctx0.op_view_1d( + &session.memory_k, + n * n_embd, + (session.memory_k.element_size() * n_embd) * (il * n_ctx + n_past), + ); + + let v = ctx0.op_view_1d( + &session.memory_v, + n * n_embd, + (session.memory_v.element_size() * n_embd) * (il * n_ctx + n_past), + ); + + gf.build_forward_expand(&ctx0.op_cpy(&k_current, &k)); + gf.build_forward_expand(&ctx0.op_cpy(&v_current, &v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + let q = ctx0.op_permute( + &ctx0.op_cpy( + &q_current, + &ctx0.new_tensor_3d(ggml::Type::F32, n_embd / n_head, n_head, n), + ), + 0, + 2, + 1, + 3, + ); + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + let k = ctx0.op_permute( + &ctx0.op_reshape_3d( + &ctx0.op_view_1d( + &session.memory_k, + (n_past + n) * n_embd, + il * n_ctx * session.memory_k.element_size() * n_embd, + ), + n_embd / n_head, + n_head, + n_past + n, + ), + 0, + 2, + 1, + 3, + ); + + // K * Q + let k_q = ctx0.op_mul_mat(&k, &q); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + let k_q_scaled = ctx0.op_scale( + &k_q, + &ctx0.new_f32(1.0 / f32::sqrt(n_embd as f32 / n_head as f32)), + ); + + //alibi + // KQ_scaled_alibi = KQ_scaled + alibi_bias + let k_q_scaled_alibi = ctx0.op_alibi(&k_q_scaled, 
n_past, n_head); + + // KQ_masked = mask_past(KQ_scaled) + let k_q_masked = ctx0.op_diag_mask_inf(&k_q_scaled_alibi, n_past); + + // KQ = soft_max(KQ_masked) + let k_q_soft_max = ctx0.op_soft_max(&k_q_masked); + + let memv_elsize = session.memory_v.element_size(); + + // let v_trans = ctx0.op_permute( + // &ctx0.op_reshape_3d( + // &ctx0.op_view_1d( + // &session.memory_v, + // (n_past + n) * n_embd, + // il * n_ctx * memv_elsize * n_embd, + // ), + // n_embd / n_head, + // n_head, + // n_past + n, + // ), + // 1, + // 2, + // 0, + // 3, + // ); + + // // GGML_ASSERT: ggml/ggml.c:4899: !ggml_is_transposed(a) + // let k_q_v = ctx0.op_mul_mat(&v_trans, &k_q_soft_max); + + // split cached V into n_head heads + let v = ctx0.op_view_3d( + &session.memory_v, + (n_past + n, n_embd / n_head, n_head), + (n_ctx * memv_elsize, n_ctx * memv_elsize * n_embd / n_head), + il * n_ctx * memv_elsize * n_embd, + ); + + // KQV = transpose(V) * KQ_soft_max + let k_q_v = ctx0.op_mul_mat(&v, &k_q_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + let k_q_v_merged = ctx0.op_permute(&k_q_v, 0, 2, 1, 3); + + // cur = KQV_merged.contiguous().view(n_embd, N) + current = ctx0.op_cpy( + &k_q_v_merged, + &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, n), + ); + + // projection + current = ctx0.op_mul_mat(&self.layers[il].wo, ¤t); + current = ctx0.op_add(&ctx0.op_repeat(&self.layers[il].wo_b, ¤t), ¤t); + } + + let input_feed_forward = ctx0.op_add(¤t, &input_self_attention); + + // feed-forward network + { + // norm + { + current = ctx0.op_norm(&input_feed_forward); + + // cur = ffn_norm*cur + ffn_norm_b + current = ctx0.op_mul( + &ctx0.op_repeat(&self.layers[il].ffn_norm, ¤t), + ¤t, + ); + + current = ctx0.op_add( + &ctx0.op_repeat(&self.layers[il].ffn_norm_b, ¤t), + ¤t, + ); + } + + current = ctx0.op_mul_mat(&self.layers[il].w1, ¤t); + + current = ctx0.op_add(&ctx0.op_repeat(&self.layers[il].w1_b, ¤t), ¤t); + + // SILU activation + + current = ctx0.op_gelu(¤t); + + current = ctx0.op_mul_mat(&self.layers[il].w2, ¤t); + + current = ctx0.op_add(&ctx0.op_repeat(&self.layers[il].w2_b, ¤t), ¤t); + } + + current = ctx0.op_add(¤t, &input_feed_forward); + + // input for next layer + input_layer = current; + } + + // Used at the end to optionally extract the embeddings. + let embeddings_tensor; + + // norm + { + input_layer = ctx0.op_norm(&input_layer); + + // inpL = norm*inpL + input_layer = ctx0.op_mul( + &ctx0.op_repeat(&self.output_norm, &input_layer), + &input_layer, + ); + + input_layer = ctx0.op_add( + &ctx0.op_repeat(&self.output_norm_b, &input_layer), + &input_layer, + ); + + embeddings_tensor = input_layer.share(); //TODO: CHECK if this is still necessary, (not in BLOOM C implementation) + } + + // lm_head + { + input_layer = ctx0.op_mul_mat(&self.output, &input_layer); + } + + // logits -> probs + // inpL = ctx0.op_soft_max(&inpL); + + // run the computation + gf.build_forward_expand(&input_layer); + ctx0.graph_compute(&mut gf); + + // return result for just the last token + // SAFETY: yolo + assert_eq!(session.last_logits.len(), { n_vocab }); + unsafe { + input_layer.read_data( + n_vocab * (n - 1) * std::mem::size_of::(), + bytemuck::cast_slice_mut(&mut session.last_logits), + ) + }; + + // Extract logits + if let Some(all_logits) = &mut output_request.all_logits { + all_logits.resize(n_vocab * n, 0.0); + // SAFETY: Tensor data can be read (properly aligned, initialized, + // data will not be mutated or otherwise aliased during the copy), + // and we're not reading past the end of the tensor data. 
+ assert_eq!(input_layer.nelements(), n_vocab * n); + unsafe { + input_layer.read_data(0, bytemuck::cast_slice_mut(all_logits)); + } + } + + // Extract embeddings + if let Some(embeddings) = &mut output_request.embeddings { + embeddings.resize(n_embd * n, 0.0); + // SAFETY: Same rationale as for the "Extract logits" section applies. + assert_eq!(embeddings_tensor.nelements(), n_embd * n); + unsafe { + embeddings_tensor.read_data(0, bytemuck::cast_slice_mut(embeddings)); + } + } + + // Adjust the required memory per token if we didn't know that already + if session.mem_per_token == 0 { + session.mem_per_token = ctx0.used_mem() / n; + } + + // Adjust n_past to new length. + session.n_past += input_tokens.len(); + } + + /// Returns the vocabulary used by this model. + fn vocabulary(&self) -> &Vocabulary { + &self.vocabulary + } + + fn n_ctx(&self) -> usize { + self.n_context_tokens + } +} + +// NOTE: Field order matters! Data is laid out in the file exactly +// in this order. +#[derive(Debug, Default, PartialEq, Eq, Clone, Copy)] +pub struct Hyperparameters { + pub n_vocab: usize, + pub n_embd: usize, + pub n_mult: usize, + pub n_head: usize, + pub n_layer: usize, + pub file_type: FileType, +} +impl llm_base::Hyperparameters for Hyperparameters { + fn read(reader: &mut dyn std::io::BufRead) -> Result { + Ok(Hyperparameters { + n_vocab: util::read_i32(reader)?.try_into()?, + n_embd: util::read_i32(reader)?.try_into()?, + n_mult: util::read_i32(reader)?.try_into()?, + n_head: util::read_i32(reader)?.try_into()?, + n_layer: util::read_i32(reader)?.try_into()?, + file_type: { + let ftype = util::read_i32(reader)?; + FileType::try_from(ftype).map_err(|_| LoadError::UnsupportedFileType(ftype))? + }, + }) + } + + fn n_vocabulary(&self) -> usize { + self.n_vocab + } +} + +struct Layer { + pub attention_norm: ggml::Tensor, + pub attention_norm_b: ggml::Tensor, + pub wo: ggml::Tensor, + pub wo_b: ggml::Tensor, + pub query_key_value: ggml::Tensor, + pub query_key_value_b: ggml::Tensor, + // normalization + pub ffn_norm: ggml::Tensor, + pub ffn_norm_b: ggml::Tensor, + // ff + pub w1: ggml::Tensor, + pub w1_b: ggml::Tensor, + pub w2: ggml::Tensor, + pub w2_b: ggml::Tensor, +} From 1eb2e115eeb277aee409b4f41202e25a023e9e96 Mon Sep 17 00:00:00 2001 From: Philpax Date: Sun, 30 Apr 2023 22:56:03 +0200 Subject: [PATCH 33/35] feat: re-enable BLOOM for now --- .gitignore | 2 ++ Cargo.lock | 1 + bloom/src/lib.rs | 7 +++++-- llm-cli/src/cli_args.rs | 8 ++++++++ llm/Cargo.toml | 5 +++-- llm/src/lib.rs | 2 ++ 6 files changed, 21 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index ea8c4bf7..5dbf6776 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ /target +/models +.DS_Store \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 7bd50f09..33d0b49d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -628,6 +628,7 @@ dependencies = [ name = "llm" version = "0.1.0" dependencies = [ + "bloom", "gpt2", "llama", "llm-base", diff --git a/bloom/src/lib.rs b/bloom/src/lib.rs index 562c7a04..a5a4ed2f 100644 --- a/bloom/src/lib.rs +++ b/bloom/src/lib.rs @@ -1,9 +1,12 @@ +//! An implementation of BLOOM (BigScience Large Open-science Open-access Multilingual Language Model). +//! +//! This implementation of BLOOM may not be fully correct. More work may be required. 
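One distinguishing detail of this architecture is that BLOOM uses ALiBi (attention with linear biases) rather than rotary embeddings, which is why the attention code above calls `ctx0.op_alibi(&k_q_scaled, n_past, n_head)` instead of `op_rope`. ggml computes the per-head bias slopes internally; for reference, they follow a geometric sequence, roughly as in the sketch below (an outline of the published ALiBi formula, assuming the head count is a power of two as in the released BLOOM checkpoints):

    /// Per-head ALiBi slopes m_h = 2^(-8h / n_head) for h = 1..=n_head.
    /// Each attention score then has `m_h * -(query/key distance)` added to it.
    fn alibi_slopes(n_head: usize) -> Vec<f32> {
        let base = 2f32.powf(-8.0 / n_head as f32);
        (1..=n_head).map(|h| base.powi(h as i32)).collect()
    }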
+ use std::path::Path; -// use ggml_loader::{LoadError, LoadProgress}; use llm_base::{ util, EvaluateOutputRequest, FileType, InferenceParameters, InferenceSession, - InferenceSessionParameters, LoadError, LoadProgress, Mmap, KnownModel, TokenId, Vocabulary, + InferenceSessionParameters, KnownModel, LoadError, LoadProgress, Mmap, TokenId, Vocabulary, }; /// The weights for the BLOOM model. All the mutable state is split into a diff --git a/llm-cli/src/cli_args.rs b/llm-cli/src/cli_args.rs index 84eedc30..725f4451 100644 --- a/llm-cli/src/cli_args.rs +++ b/llm-cli/src/cli_args.rs @@ -282,6 +282,8 @@ pub enum ModelArchitecture { Llama, /// OpenAI's GPT2 architecture and derivatives (Cerebras, etc). Gpt2, + /// The BLOOM model. This is currently disabled as it does not work. + Bloom, } impl ModelLoad { pub fn load(&self) -> Result> { @@ -369,6 +371,12 @@ impl ModelLoad { n_context_tokens, load_progress_callback, )?), + ModelArchitecture::Bloom => Box::new(llm::load::( + path, + prefer_mmap, + n_context_tokens, + load_progress_callback, + )?), }) } } diff --git a/llm/Cargo.toml b/llm/Cargo.toml index 063df3a7..1ce45915 100644 --- a/llm/Cargo.toml +++ b/llm/Cargo.toml @@ -7,8 +7,9 @@ edition = "2021" llm-base = { path = "../llm-base" } llama = { path = "../llama", features = ["convert", "quantize"], optional = true } gpt2 = { path = "../gpt2", optional = true } +bloom = { path = "../bloom", optional = true } [features] -default = ["llama", "gpt2"] +default = ["llama", "gpt2", "bloom"] llama = ["dep:llama"] -gpt2 = ["dep:gpt2"] \ No newline at end of file +gpt2 = ["dep:gpt2"] diff --git a/llm/src/lib.rs b/llm/src/lib.rs index 53431dda..53a5a164 100644 --- a/llm/src/lib.rs +++ b/llm/src/lib.rs @@ -5,6 +5,8 @@ pub use llm_base::{ EOT_TOKEN_ID, }; +#[cfg(feature = "bloom")] +pub use bloom::{self, Bloom}; #[cfg(feature = "gpt2")] pub use gpt2::{self, Gpt2}; #[cfg(feature = "llama")] From 181d82350ca406017510770f74eb36aa23d8bd3d Mon Sep 17 00:00:00 2001 From: Philpax Date: Sun, 30 Apr 2023 23:26:16 +0200 Subject: [PATCH 34/35] refactor: reintroduce ggml-sys and bindgen tool --- .gitmodules | 2 +- CONTRIBUTING.md | 13 +- Cargo.lock | 34 +- Cargo.toml | 2 + ggml/Cargo.toml | 7 +- ggml/ggml | 1 - ggml/src/context.rs | 82 +- ggml/src/lib.rs | 89 +- ggml/src/tensor.rs | 22 +- ggml/sys/Cargo.toml | 7 + ggml/{ => sys}/build.rs | 12 +- ggml/sys/ggml | 1 + ggml/sys/src/lib.rs | 1617 ++++++++++++++++++++++ tools/generate-ggml-bindings/Cargo.toml | 8 + tools/generate-ggml-bindings/src/main.rs | 28 + 15 files changed, 1797 insertions(+), 128 deletions(-) delete mode 160000 ggml/ggml create mode 100644 ggml/sys/Cargo.toml rename ggml/{ => sys}/build.rs (90%) create mode 160000 ggml/sys/ggml create mode 100644 ggml/sys/src/lib.rs create mode 100644 tools/generate-ggml-bindings/Cargo.toml create mode 100644 tools/generate-ggml-bindings/src/main.rs diff --git a/.gitmodules b/.gitmodules index 4a7cb543..12466c24 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "ggml/ggml"] - path = ggml/ggml + path = ggml/sys/ggml url = git@github.com:ggerganov/ggml.git diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c6e025a5..a1bae158 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -9,12 +9,13 @@ When new GGML versions are pushed to llama.cpp (or one of the other repos hosting a copy of it) and we want to update our copy, the process should be as follows: -- Update the `ggml.c` and `ggml.h` inside `ggml-sys/ggml`. 
-- In that same folder, update `CREDITS.txt` to indicate the llama.cpp version - these files were taken from +- Update the submodule to the latest version of GGML: + ```shell + $ git submodule update --remote + ``` - Run the bindgen script: - ```shell - $ cargo run --bin generate-ggml-bindings ggml-sys - ``` + ```shell + $ cargo run --bin generate-ggml-bindings ggml-sys + ``` - Fix any compiler errors that pop up due to the new version of the bindings and test the changes. diff --git a/Cargo.lock b/Cargo.lock index 33d0b49d..55a64934 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -104,9 +104,9 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.64.0" +version = "0.65.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4243e6031260db77ede97ad86c27e501d646a27ab57b59a574f725d98ab1fb4" +checksum = "cfdf7b466f9a4903edc73f95d6d2bcd5baf8ae620638762244d3f60143643cc5" dependencies = [ "bitflags", "cexpr", @@ -115,12 +115,13 @@ dependencies = [ "lazycell", "log", "peeking_take_while", + "prettyplease", "proc-macro2", "quote", "regex", "rustc-hash", "shlex", - "syn 1.0.109", + "syn 2.0.13", "which", ] @@ -430,6 +431,13 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "generate-ggml-bindings" +version = "0.1.0" +dependencies = [ + "bindgen", +] + [[package]] name = "getrandom" version = "0.2.9" @@ -445,12 +453,18 @@ dependencies = [ name = "ggml" version = "0.1.0" dependencies = [ - "bindgen", - "cc", + "ggml-sys", "rand", "thiserror", ] +[[package]] +name = "ggml-sys" +version = "0.1.0" +dependencies = [ + "cc", +] + [[package]] name = "gimli" version = "0.27.2" @@ -809,6 +823,16 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "prettyplease" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ceca8aaf45b5c46ec7ed39fff75f57290368c1846d33d24a122ca81416ab058" +dependencies = [ + "proc-macro2", + "syn 2.0.13", +] + [[package]] name = "proc-macro2" version = "1.0.56" diff --git a/Cargo.toml b/Cargo.toml index bf235bbc..5d7d42ee 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,11 +3,13 @@ members = [ # Crates "bloom", "ggml", + "ggml/sys", "gpt2", "llama", "llm", "llm-base", "llm-cli", + "tools/*" ] resolver = "2" diff --git a/ggml/Cargo.toml b/ggml/Cargo.toml index 43d64758..255e3a7a 100644 --- a/ggml/Cargo.toml +++ b/ggml/Cargo.toml @@ -3,12 +3,9 @@ name = "ggml" version = { workspace = true } edition = "2021" -[build-dependencies] -bindgen = "0.64.0" -cc = "^1.0" - [dependencies] thiserror = "1.0" +ggml-sys = { path = "sys" } [dev-dependencies] -rand = "0.8" +rand = "0.8" \ No newline at end of file diff --git a/ggml/ggml b/ggml/ggml deleted file mode 160000 index 583c5a3a..00000000 --- a/ggml/ggml +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 583c5a3ad6bdb041bff5ad161a49ff4d8fa52f10 diff --git a/ggml/src/context.rs b/ggml/src/context.rs index 9aff0e38..c4ac5d68 100644 --- a/ggml/src/context.rs +++ b/ggml/src/context.rs @@ -1,26 +1,26 @@ use std::{ os::raw::{c_int, c_void}, ptr::NonNull, - sync::{Arc, Weak}, + sync::Arc, }; -use crate::{usize_to_i32, usize_to_i64, Buffer, ComputationGraph, Tensor, Type}; +use crate::{sys, usize_to_i32, usize_to_i64, Buffer, ComputationGraph, Tensor, Type}; -/// Acts as a RAII-guard over a `crate::ggml_context`, allocating via +/// Acts as a RAII-guard over a `sys::ggml_context`, allocating via /// `ggml_init` and dropping via 
`ggml_free`. pub struct Context { /// An `Arc` is used to model the relation between the context and the /// allocated tensors. Tensors are owned by the object, so a [`Tensor`] /// contains a `Weak` reference underneath and doesn't let you do anything /// with it if the underlying context has been deallocated. - ptr: Arc>, + ptr: Arc>, } impl Context { /// Creates a new [Context] with the specified `mem_size` as a working area. pub fn init(mem_size: usize, alloc: bool) -> Self { let raw = unsafe { - crate::ggml_init(crate::ggml_init_params { + sys::ggml_init(sys::ggml_init_params { mem_size, // Null here means we want ggml to own this memory. We don't // support passing an owned buffer from the Rust side. @@ -34,7 +34,7 @@ impl Context { } /// Wraps a raw tensor with a weak pointer to the context. - fn new_tensor_raw(&self, raw: *mut crate::ggml_tensor) -> Tensor { + fn new_tensor_raw(&self, raw: *mut sys::ggml_tensor) -> Tensor { Tensor { ptr: NonNull::new(raw).expect("Should not be null"), ctx: Arc::downgrade(&self.ptr), @@ -44,14 +44,14 @@ impl Context { /// Creates a new 1D tensor. pub fn new_tensor_1d(&self, typ: Type, ne0: usize) -> Tensor { let raw = - unsafe { crate::ggml_new_tensor_1d(self.ptr.as_ptr(), typ.into(), usize_to_i64(ne0)) }; + unsafe { sys::ggml_new_tensor_1d(self.ptr.as_ptr(), typ.into(), usize_to_i64(ne0)) }; self.new_tensor_raw(raw) } /// Creates a new 2D tensor. pub fn new_tensor_2d(&self, typ: Type, ne0: usize, ne1: usize) -> Tensor { let raw = unsafe { - crate::ggml_new_tensor_2d( + sys::ggml_new_tensor_2d( self.ptr.as_ptr(), typ.into(), usize_to_i64(ne0), @@ -64,7 +64,7 @@ impl Context { /// Creates a new 3D tensor. pub fn new_tensor_3d(&self, typ: Type, ne0: usize, ne1: usize, ne2: usize) -> Tensor { let raw = unsafe { - crate::ggml_new_tensor_3d( + sys::ggml_new_tensor_3d( self.ptr.as_ptr(), typ.into(), usize_to_i64(ne0), @@ -77,45 +77,44 @@ impl Context { /// Creates a new 1D tensor with the specified value. pub fn new_f32(&self, x: f32) -> Tensor { - let raw = unsafe { crate::ggml_new_f32(self.ptr.as_ptr(), x) }; + let raw = unsafe { sys::ggml_new_f32(self.ptr.as_ptr(), x) }; self.new_tensor_raw(raw) } /// Unknown, aside from the obvious. It's transposing something! pub fn op_transpose(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { crate::ggml_transpose(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_transpose(self.ptr.as_ptr(), a.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// Unknown. pub fn op_get_rows(&self, a: &Tensor, b: &Tensor) -> Tensor { let tensor = - unsafe { crate::ggml_get_rows(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + unsafe { sys::ggml_get_rows(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// Creates a new tensor with the values of `a`, but normalized. pub fn op_norm(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { crate::ggml_norm(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_norm(self.ptr.as_ptr(), a.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// Creates a new tensor with the values of `a`, but normalized using RMSNorm. pub fn op_rms_norm(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { crate::ggml_rms_norm(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_rms_norm(self.ptr.as_ptr(), a.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// Creates a new tensor with the multiplication of `a` and `b`. 
pub fn op_mul(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = unsafe { crate::ggml_mul(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_mul(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// Unknown. pub fn op_repeat(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = - unsafe { crate::ggml_repeat(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_repeat(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } @@ -128,40 +127,39 @@ impl Context { /// Result is m columns, p rows pub fn op_mul_mat(&self, a: &Tensor, b: &Tensor) -> Tensor { let tensor = - unsafe { crate::ggml_mul_mat(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + unsafe { sys::ggml_mul_mat(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// Creates a new tensor with the addition of `a` and `b`. pub fn op_add(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = unsafe { crate::ggml_add(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_add(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// Creates a new tensor with the [SiLU](https://pytorch.org/docs/stable/generated/torch.nn.SiLU.html) activation function applied to `a`. pub fn op_silu(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { crate::ggml_silu(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_silu(self.ptr.as_ptr(), a.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// In-place, scales `a` by the 1D tensor `b`. pub fn op_scale(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = - unsafe { crate::ggml_scale(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_scale(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// In-place, sets the elements above the diagonal to -INF. pub fn op_diag_mask_inf(&self, a: &Tensor, n_past: usize) -> Tensor { let tensor = unsafe { - crate::ggml_diag_mask_inf(self.ptr.as_ptr(), a.ptr.as_ptr(), usize_to_i32(n_past)) + sys::ggml_diag_mask_inf(self.ptr.as_ptr(), a.ptr.as_ptr(), usize_to_i32(n_past)) }; self.new_tensor_raw(tensor) } /// In-place, applies the [Softmax function](https://en.wikipedia.org/wiki/Softmax_function) to `a`. pub fn op_soft_max(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { crate::ggml_soft_max(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_soft_max(self.ptr.as_ptr(), a.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } @@ -186,7 +184,7 @@ impl Context { fun: unsafe extern "C" fn(cnt: c_int, dst: *mut f32, src: *const f32), ) -> Tensor { let tensor = - unsafe { crate::ggml_map_unary_f32(self.ptr.as_ptr(), a.ptr.as_ptr(), Some(fun)) }; + unsafe { sys::ggml_map_unary_f32(self.ptr.as_ptr(), a.ptr.as_ptr(), Some(fun)) }; self.new_tensor_raw(tensor) } @@ -212,7 +210,7 @@ impl Context { fun: unsafe extern "C" fn(cnt: c_int, dst: *mut f32, src0: *const f32, src1: *const f32), ) -> Tensor { let tensor = unsafe { - crate::ggml_map_binary_f32(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr(), Some(fun)) + sys::ggml_map_binary_f32(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr(), Some(fun)) }; self.new_tensor_raw(tensor) } @@ -220,7 +218,7 @@ impl Context { /// Creates a 1D view over `a`. 
pub fn op_view_1d(&self, a: &Tensor, ne0: usize, offset: usize) -> Tensor { let tensor = unsafe { - crate::ggml_view_1d(self.ptr.as_ptr(), a.ptr.as_ptr(), usize_to_i64(ne0), offset) + sys::ggml_view_1d(self.ptr.as_ptr(), a.ptr.as_ptr(), usize_to_i64(ne0), offset) }; self.new_tensor_raw(tensor) } @@ -229,7 +227,7 @@ impl Context { pub fn op_view_2d(&self, a: &Tensor, ne: (usize, usize), nb1: usize, offset: usize) -> Tensor { let (ne0, ne1) = ne; let tensor = unsafe { - crate::ggml_view_2d( + sys::ggml_view_2d( self.ptr.as_ptr(), a.ptr.as_ptr(), usize_to_i64(ne0), @@ -252,7 +250,7 @@ impl Context { let (ne0, ne1, ne2) = ne; let (nb1, nb2) = nb; let tensor = unsafe { - crate::ggml_view_3d( + sys::ggml_view_3d( self.ptr.as_ptr(), a.ptr.as_ptr(), usize_to_i64(ne0), @@ -268,7 +266,7 @@ impl Context { /// Copies `a` to `b` and returns `b`. pub fn op_cpy(&self, a: &Tensor, b: &Tensor) -> Tensor { - let tensor = unsafe { crate::ggml_cpy(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_cpy(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } @@ -282,7 +280,7 @@ impl Context { axis3: usize, ) -> Tensor { let tensor = unsafe { - crate::ggml_permute( + sys::ggml_permute( self.ptr.as_ptr(), a.ptr.as_ptr(), usize_to_i32(axis0), @@ -297,14 +295,14 @@ impl Context { /// In-place; reshapes `a` in accordance with the dimensions of `b` pub fn op_reshape(&self, a: &Tensor, b: &Tensor) -> Tensor { let tensor = - unsafe { crate::ggml_reshape(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; + unsafe { sys::ggml_reshape(self.ptr.as_ptr(), a.ptr.as_ptr(), b.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } /// In-place; reshapes `a` in accordance with the specified dimensions. pub fn op_reshape_2d(&self, a: &Tensor, ne0: usize, ne1: usize) -> Tensor { let tensor = unsafe { - crate::ggml_reshape_2d( + sys::ggml_reshape_2d( self.ptr.as_ptr(), a.ptr.as_ptr(), usize_to_i64(ne0), @@ -317,7 +315,7 @@ impl Context { /// In-place; reshapes `a` in accordance with the specified dimensions. pub fn op_reshape_3d(&self, a: &Tensor, ne0: usize, ne1: usize, ne2: usize) -> Tensor { let tensor = unsafe { - crate::ggml_reshape_3d( + sys::ggml_reshape_3d( self.ptr.as_ptr(), a.ptr.as_ptr(), usize_to_i64(ne0), @@ -331,7 +329,7 @@ impl Context { /// In-place; applies ROtary Positional Encoding. pub fn op_rope(&self, a: &Tensor, npast: usize, ndims: usize, mode: i32) -> Tensor { let tensor = unsafe { - crate::ggml_rope( + sys::ggml_rope( self.ptr.as_ptr(), a.ptr.as_ptr(), usize_to_i32(npast), @@ -345,13 +343,13 @@ impl Context { /// Computes the specified graph. Must be run in order to evaluate the graph. pub fn graph_compute(&self, graph: &mut ComputationGraph) { unsafe { - crate::ggml_graph_compute(self.ptr.as_ptr(), &mut graph.inner); + sys::ggml_graph_compute(self.ptr.as_ptr(), &mut graph.inner); } } /// Retrieves the memory used by this [Context]. pub fn used_mem(&self) -> usize { - unsafe { crate::ggml_used_mem(self.ptr.as_ptr()) } + unsafe { sys::ggml_used_mem(self.ptr.as_ptr()) } } /// Sets the scratch buffer to be used by this [Context]. 
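Taken together, these wrappers mirror the deferred-execution model of the underlying C library: the `op_*` methods only add nodes to a computation graph, and nothing is evaluated until `graph_compute` runs. A minimal end-to-end sketch (buffer size, shapes, and thread count are arbitrary):

    use ggml::{context::Context, ComputationGraph, Type};

    fn tiny_matmul() {
        let ctx = Context::init(16 * 1024 * 1024, true);
        let a = ctx.new_tensor_2d(Type::F32, 2, 2);
        let b = ctx.new_tensor_2d(Type::F32, 2, 2);
        // Fill `a` and `b` here via Tensor::write_data (unsafe) before computing.
        let c = ctx.op_mul_mat(&a, &b);

        let mut graph = ComputationGraph::new(1);
        graph.build_forward_expand(&c);
        ctx.graph_compute(&mut graph);
        // `c` now holds the product; read it back with Tensor::read_data.
    }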
@@ -365,9 +363,9 @@ impl Context { }; // SAFETY: this just passes (most likely uninitialized) memory buffer to the ggml C API unsafe { - crate::ggml_set_scratch( + sys::ggml_set_scratch( self.ptr.as_ptr(), - crate::ggml_scratch { + sys::ggml_scratch { offs: 0, size, data, @@ -379,7 +377,7 @@ impl Context { /// TODO: something something pub fn op_alibi(&self, a: &Tensor, n_past: usize, n_head: usize) -> Tensor { let tensor = unsafe { - crate::ggml_alibi( + sys::ggml_alibi( self.ptr.as_ptr(), a.ptr.as_ptr(), usize_to_i32(n_past), @@ -392,7 +390,7 @@ impl Context { /// Gaussian Error Linear Units pub fn op_gelu(&self, a: &Tensor) -> Tensor { - let tensor = unsafe { crate::ggml_gelu(self.ptr.as_ptr(), a.ptr.as_ptr()) }; + let tensor = unsafe { sys::ggml_gelu(self.ptr.as_ptr(), a.ptr.as_ptr()) }; self.new_tensor_raw(tensor) } } @@ -402,7 +400,7 @@ impl Drop for Context { // SAFETY: The only non-weak copy of ptr is no longer accessible after // this drop call. unsafe { - crate::ggml_free(self.ptr.as_ptr()); + sys::ggml_free(self.ptr.as_ptr()); } } } diff --git a/ggml/src/lib.rs b/ggml/src/lib.rs index 5ba74df9..09ae4b95 100644 --- a/ggml/src/lib.rs +++ b/ggml/src/lib.rs @@ -1,8 +1,3 @@ -#![allow(non_upper_case_globals)] -#![allow(non_camel_case_types)] -#![allow(non_snake_case)] -#![allow(unused)] - //! `ggml` is a semi-idiomatic wrapper for the `ggml` C library. //! //! It exposes a subset of operations (currently used to implement the [llama-rs](https://crates.io/crates/llama-rs) library). @@ -11,13 +6,7 @@ //! `ggml` operates on a computational graph; no values will be computed until [Context::graph_compute] is executed. //! All [Tensor]s are nodes in this computational graph, and values cannot be retrieved until computation is completed. -use std::{ - os::raw::{c_int, c_void}, - ptr::NonNull, - sync::{Arc, Weak}, -}; - -include!(concat!(env!("OUT_DIR"), "/bindings.rs")); +use std::os::raw::{c_int, c_void}; pub use tensor::Tensor; @@ -31,6 +20,8 @@ pub mod saver; pub mod context; mod tensor; +pub(crate) use ggml_sys as sys; + #[cfg(test)] mod tests; @@ -69,7 +60,7 @@ pub const FILE_MAGIC_UNVERSIONED: u32 = 0x67676d6c; pub const FORMAT_VERSION: u32 = 1; /// The size of a `ggml` object. -pub const OBJECT_SIZE: usize = crate::GGML_OBJECT_SIZE; +pub const OBJECT_SIZE: usize = sys::GGML_OBJECT_SIZE; #[derive(Debug, Copy, Clone, PartialEq, Eq, Default)] /// The type of a value in `ggml`. @@ -81,10 +72,14 @@ pub enum Type { Q4_1, /// Quantized 4-bit (type 2). Q4_2, - /// Quantized 4-bit (type 3). - Q4_3, + /// Quantized 5-bit (type 0). + Q5_0, + /// Quantized 5-bit (type 1). + Q5_1, /// Quantized 8-bit (type 0). Q8_0, + /// Quantized 8-bit (type 1). + Q8_1, /// Integer 32-bit. I32, /// Float 16-bit. @@ -92,32 +87,36 @@ pub enum Type { /// Float 32-bit. 
F32, } -impl From for crate::ggml_type { +impl From for sys::ggml_type { fn from(t: Type) -> Self { match t { - Type::Q4_0 => crate::ggml_type_GGML_TYPE_Q4_0, - Type::Q4_1 => crate::ggml_type_GGML_TYPE_Q4_1, - Type::Q4_2 => crate::ggml_type_GGML_TYPE_Q4_2, - Type::Q4_3 => crate::ggml_type_GGML_TYPE_Q4_3, - Type::Q8_0 => crate::ggml_type_GGML_TYPE_Q8_0, - Type::I32 => crate::ggml_type_GGML_TYPE_I32, - Type::F16 => crate::ggml_type_GGML_TYPE_F16, - Type::F32 => crate::ggml_type_GGML_TYPE_F32, + Type::Q4_0 => sys::ggml_type_GGML_TYPE_Q4_0, + Type::Q4_1 => sys::ggml_type_GGML_TYPE_Q4_1, + Type::Q4_2 => sys::ggml_type_GGML_TYPE_Q4_2, + Type::Q5_0 => sys::ggml_type_GGML_TYPE_Q5_0, + Type::Q5_1 => sys::ggml_type_GGML_TYPE_Q5_1, + Type::Q8_0 => sys::ggml_type_GGML_TYPE_Q8_0, + Type::Q8_1 => sys::ggml_type_GGML_TYPE_Q8_1, + Type::I32 => sys::ggml_type_GGML_TYPE_I32, + Type::F16 => sys::ggml_type_GGML_TYPE_F16, + Type::F32 => sys::ggml_type_GGML_TYPE_F32, } } } -impl TryFrom for Type { +impl TryFrom for Type { type Error = (); - fn try_from(t: crate::ggml_type) -> Result { + fn try_from(t: sys::ggml_type) -> Result { match t { - crate::ggml_type_GGML_TYPE_Q4_0 => Ok(Type::Q4_0), - crate::ggml_type_GGML_TYPE_Q4_1 => Ok(Type::Q4_1), - crate::ggml_type_GGML_TYPE_Q4_2 => Ok(Type::Q4_2), - crate::ggml_type_GGML_TYPE_Q4_3 => Ok(Type::Q4_3), - crate::ggml_type_GGML_TYPE_Q8_0 => Ok(Type::Q8_0), - crate::ggml_type_GGML_TYPE_I32 => Ok(Type::I32), - crate::ggml_type_GGML_TYPE_F16 => Ok(Type::F16), - crate::ggml_type_GGML_TYPE_F32 => Ok(Type::F32), + sys::ggml_type_GGML_TYPE_Q4_0 => Ok(Type::Q4_0), + sys::ggml_type_GGML_TYPE_Q4_1 => Ok(Type::Q4_1), + sys::ggml_type_GGML_TYPE_Q4_2 => Ok(Type::Q4_2), + sys::ggml_type_GGML_TYPE_Q5_0 => Ok(Type::Q5_0), + sys::ggml_type_GGML_TYPE_Q5_1 => Ok(Type::Q5_1), + sys::ggml_type_GGML_TYPE_Q8_0 => Ok(Type::Q8_0), + sys::ggml_type_GGML_TYPE_Q8_1 => Ok(Type::Q8_1), + sys::ggml_type_GGML_TYPE_I32 => Ok(Type::I32), + sys::ggml_type_GGML_TYPE_F16 => Ok(Type::F16), + sys::ggml_type_GGML_TYPE_F32 => Ok(Type::F32), _ => Err(()), } } @@ -128,8 +127,10 @@ impl std::fmt::Display for Type { Type::Q4_0 => write!(f, "q4_0"), Type::Q4_1 => write!(f, "q4_1"), Type::Q4_2 => write!(f, "q4_2"), - Type::Q4_3 => write!(f, "q4_3"), + Type::Q5_0 => write!(f, "q5_0"), + Type::Q5_1 => write!(f, "q5_1"), Type::Q8_0 => write!(f, "q8_0"), + Type::Q8_1 => write!(f, "q8_1"), Type::I32 => write!(f, "i32"), Type::F16 => write!(f, "f16"), Type::F32 => write!(f, "f32"), @@ -164,41 +165,41 @@ impl Buffer { /// A `ggml` computation graph. Keeps track of all state during computation. pub struct ComputationGraph { - inner: crate::ggml_cgraph, + inner: sys::ggml_cgraph, } impl ComputationGraph { /// Create a new [ComputationGraph] with the specified `n_threads`. pub fn new(n_threads: usize) -> Self { Self { - inner: crate::ggml_cgraph { + inner: sys::ggml_cgraph { n_threads: usize_to_i32(n_threads), // SAFETY: This should be safe to zero. The original C++ impl // just leaves it uninitialized - ..unsafe { std::mem::zeroed::() } + ..unsafe { std::mem::zeroed::() } }, } } /// Build this computational graph in the forward direction in preparation for computation. pub fn build_forward_expand(&mut self, tensor: &Tensor) { - unsafe { crate::ggml_build_forward_expand(&mut self.inner, tensor.ptr.as_ptr()) } + unsafe { sys::ggml_build_forward_expand(&mut self.inner, tensor.ptr.as_ptr()) } } } /// The size of `t` as bytes. 
pub fn type_size(t: Type) -> usize { - unsafe { crate::ggml_type_size(t.into()) } + unsafe { sys::ggml_type_size(t.into()) } } /// [type_size]/[blck_size] as float. pub fn type_sizef(x: Type) -> f64 { - (unsafe { crate::ggml_type_sizef(x.into()) }) as f64 + (unsafe { sys::ggml_type_sizef(x.into()) }) as f64 } /// The size of a block for `t`. Only relevant for quantized types. pub fn blck_size(t: Type) -> usize { - i32_to_usize(unsafe { crate::ggml_blck_size(t.into()) }) + i32_to_usize(unsafe { sys::ggml_blck_size(t.into()) }) } fn usize_to_i32(val: usize) -> i32 { @@ -230,7 +231,7 @@ pub struct QuantizationResult { /// You must ensure that `src.len() == n_elements`, and `n_elements_0` /// is the first dimension of `src`. pub fn quantize_q4_0(src: &[f32], n_elements: usize, n_elements_0: usize) -> QuantizationResult { - quantize_impl(src, n_elements, n_elements_0, crate::ggml_quantize_q4_0) + quantize_impl(src, n_elements, n_elements_0, sys::ggml_quantize_q4_0) } /// Quantizes `src` into `dst` using `q4_1` quantization. @@ -238,7 +239,7 @@ pub fn quantize_q4_0(src: &[f32], n_elements: usize, n_elements_0: usize) -> Qua /// You must ensure that `src.len() == n_elements`, and `n_elements_0` /// is the first dimension of `src`. pub fn quantize_q4_1(src: &[f32], n_elements: usize, n_elements_0: usize) -> QuantizationResult { - quantize_impl(src, n_elements, n_elements_0, crate::ggml_quantize_q4_1) + quantize_impl(src, n_elements, n_elements_0, sys::ggml_quantize_q4_1) } fn quantize_impl( diff --git a/ggml/src/tensor.rs b/ggml/src/tensor.rs index 6e426940..0a5edd82 100644 --- a/ggml/src/tensor.rs +++ b/ggml/src/tensor.rs @@ -1,23 +1,19 @@ -use std::{ - os::raw::{c_int, c_void}, - ptr::NonNull, - sync::{Arc, Weak}, -}; +use std::{os::raw::c_void, ptr::NonNull, sync::Weak}; -use crate::{i64_to_usize, Type}; +use crate::{i64_to_usize, sys, Type}; /// Tensors are owned by the context. A tensor is alive as long as the /// underlying context it was created with is alive. pub struct Tensor { - pub(crate) ptr: NonNull, - pub(crate) ctx: Weak>, + pub(crate) ptr: NonNull, + pub(crate) ctx: Weak>, } impl Tensor { /// Size of the `ggml_tensor` struct in bytes. /// /// Exposed for purposes of determining context size. - pub const C_TYPE_SIZE: usize = std::mem::size_of::(); + pub const C_TYPE_SIZE: usize = std::mem::size_of::(); /// Creates a shared copy of this tensor pointer. pub fn share(&self) -> Self { @@ -47,7 +43,7 @@ impl Tensor { pub fn nbytes(&self) -> usize { self.with_alive_ctx(|| { // SAFETY: The with_alive_call guarantees the context is alive - unsafe { crate::ggml_nbytes(self.ptr.as_ptr()) } + unsafe { sys::ggml_nbytes(self.ptr.as_ptr()) } }) } @@ -80,7 +76,7 @@ impl Tensor { pub fn nelements(&self) -> usize { self.with_alive_ctx(|| { // SAFETY: The with_alive_call guarantees the context is alive - i64_to_usize(unsafe { crate::ggml_nelements(self.ptr.as_ptr()) }) + i64_to_usize(unsafe { sys::ggml_nelements(self.ptr.as_ptr()) }) }) } @@ -101,7 +97,7 @@ impl Tensor { /// The size of the element type in bytes. pub fn element_size(&self) -> usize { - self.with_alive_ctx(|| unsafe { crate::ggml_element_size(self.ptr.as_ptr()) }) + self.with_alive_ctx(|| unsafe { sys::ggml_element_size(self.ptr.as_ptr()) }) } /// Writes `src` to this tensor. @@ -124,7 +120,7 @@ impl Tensor { /// /// This tensor must not be written to or read by from any other code. 
pub unsafe fn read_data(&self, offset: usize, dst: &mut [u8]) { - let data = unsafe { crate::ggml_get_data(self.ptr.as_ptr()).add(offset) }; + let data = unsafe { sys::ggml_get_data(self.ptr.as_ptr()).add(offset) }; std::ptr::copy_nonoverlapping(data, dst as *mut _ as _, dst.len()) } } diff --git a/ggml/sys/Cargo.toml b/ggml/sys/Cargo.toml new file mode 100644 index 00000000..793d25b3 --- /dev/null +++ b/ggml/sys/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "ggml-sys" +version = { workspace = true } +edition = "2021" + +[build-dependencies] +cc = "^1.0" diff --git a/ggml/build.rs b/ggml/sys/build.rs similarity index 90% rename from ggml/build.rs rename to ggml/sys/build.rs index 02b38258..0b99fa0f 100644 --- a/ggml/build.rs +++ b/ggml/sys/build.rs @@ -1,4 +1,4 @@ -use std::{env, path::PathBuf}; +use std::env; // By default, this crate will attempt to compile ggml with the features of your host system if // the host and target are the same. If they are not, it will turn off auto-feature-detection, @@ -89,16 +89,6 @@ fn main() { } build.warnings(false); build.compile("ggml"); - - let header_path = "./ggml/include/ggml/ggml.h"; - bindgen::Builder::default() - .header(String::from(header_path)) - .allowlist_file(header_path) - .parse_callbacks(Box::new(bindgen::CargoCallbacks)) - .generate() - .expect("Unable to generate bindings.") - .write_to_file(PathBuf::from(env::var("OUT_DIR").unwrap()).join("bindings.rs")) - .expect("Unable to write generated bindings to file."); } fn get_supported_target_features() -> std::collections::HashSet { diff --git a/ggml/sys/ggml b/ggml/sys/ggml new file mode 160000 index 00000000..43dfb439 --- /dev/null +++ b/ggml/sys/ggml @@ -0,0 +1 @@ +Subproject commit 43dfb439fbf03eaf2db34a511f9e60d8338493e7 diff --git a/ggml/sys/src/lib.rs b/ggml/sys/src/lib.rs new file mode 100644 index 00000000..be3d5d3b --- /dev/null +++ b/ggml/sys/src/lib.rs @@ -0,0 +1,1617 @@ +/* automatically generated by rust-bindgen 0.65.1 */ + +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] +#![allow(unused)] + +pub const GGML_FILE_MAGIC: u32 = 1734831468; +pub const GGML_FILE_VERSION: u32 = 1; +pub const GGML_MAX_DIMS: u32 = 4; +pub const GGML_MAX_NODES: u32 = 4096; +pub const GGML_MAX_PARAMS: u32 = 16; +pub const GGML_MAX_CONTEXTS: u32 = 64; +pub const GGML_MAX_OPT: u32 = 4; +pub const GGML_DEFAULT_N_THREADS: u32 = 4; +pub type ggml_fp16_t = u16; +extern "C" { + pub fn ggml_fp16_to_fp32(x: ggml_fp16_t) -> f32; +} +extern "C" { + pub fn ggml_fp32_to_fp16(x: f32) -> ggml_fp16_t; +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct ggml_context { + _unused: [u8; 0], +} +pub const ggml_type_GGML_TYPE_F32: ggml_type = 0; +pub const ggml_type_GGML_TYPE_F16: ggml_type = 1; +pub const ggml_type_GGML_TYPE_Q4_0: ggml_type = 2; +pub const ggml_type_GGML_TYPE_Q4_1: ggml_type = 3; +pub const ggml_type_GGML_TYPE_Q4_2: ggml_type = 4; +pub const ggml_type_GGML_TYPE_Q5_0: ggml_type = 6; +pub const ggml_type_GGML_TYPE_Q5_1: ggml_type = 7; +pub const ggml_type_GGML_TYPE_Q8_0: ggml_type = 8; +pub const ggml_type_GGML_TYPE_Q8_1: ggml_type = 9; +pub const ggml_type_GGML_TYPE_I8: ggml_type = 10; +pub const ggml_type_GGML_TYPE_I16: ggml_type = 11; +pub const ggml_type_GGML_TYPE_I32: ggml_type = 12; +pub const ggml_type_GGML_TYPE_COUNT: ggml_type = 13; +pub type ggml_type = ::std::os::raw::c_uint; +pub const ggml_ftype_GGML_FTYPE_UNKNOWN: ggml_ftype = -1; +pub const ggml_ftype_GGML_FTYPE_ALL_F32: ggml_ftype = 0; +pub const ggml_ftype_GGML_FTYPE_MOSTLY_F16: 
ggml_ftype = 1; +pub const ggml_ftype_GGML_FTYPE_MOSTLY_Q4_0: ggml_ftype = 2; +pub const ggml_ftype_GGML_FTYPE_MOSTLY_Q4_1: ggml_ftype = 3; +pub const ggml_ftype_GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: ggml_ftype = 4; +pub const ggml_ftype_GGML_FTYPE_MOSTLY_Q4_2: ggml_ftype = 5; +pub const ggml_ftype_GGML_FTYPE_MOSTLY_Q8_0: ggml_ftype = 7; +pub const ggml_ftype_GGML_FTYPE_MOSTLY_Q5_0: ggml_ftype = 8; +pub const ggml_ftype_GGML_FTYPE_MOSTLY_Q5_1: ggml_ftype = 9; +pub type ggml_ftype = ::std::os::raw::c_int; +pub const ggml_op_GGML_OP_NONE: ggml_op = 0; +pub const ggml_op_GGML_OP_DUP: ggml_op = 1; +pub const ggml_op_GGML_OP_ADD: ggml_op = 2; +pub const ggml_op_GGML_OP_SUB: ggml_op = 3; +pub const ggml_op_GGML_OP_MUL: ggml_op = 4; +pub const ggml_op_GGML_OP_DIV: ggml_op = 5; +pub const ggml_op_GGML_OP_SQR: ggml_op = 6; +pub const ggml_op_GGML_OP_SQRT: ggml_op = 7; +pub const ggml_op_GGML_OP_SUM: ggml_op = 8; +pub const ggml_op_GGML_OP_MEAN: ggml_op = 9; +pub const ggml_op_GGML_OP_REPEAT: ggml_op = 10; +pub const ggml_op_GGML_OP_ABS: ggml_op = 11; +pub const ggml_op_GGML_OP_SGN: ggml_op = 12; +pub const ggml_op_GGML_OP_NEG: ggml_op = 13; +pub const ggml_op_GGML_OP_STEP: ggml_op = 14; +pub const ggml_op_GGML_OP_RELU: ggml_op = 15; +pub const ggml_op_GGML_OP_GELU: ggml_op = 16; +pub const ggml_op_GGML_OP_SILU: ggml_op = 17; +pub const ggml_op_GGML_OP_NORM: ggml_op = 18; +pub const ggml_op_GGML_OP_RMS_NORM: ggml_op = 19; +pub const ggml_op_GGML_OP_MUL_MAT: ggml_op = 20; +pub const ggml_op_GGML_OP_SCALE: ggml_op = 21; +pub const ggml_op_GGML_OP_CPY: ggml_op = 22; +pub const ggml_op_GGML_OP_CONT: ggml_op = 23; +pub const ggml_op_GGML_OP_RESHAPE: ggml_op = 24; +pub const ggml_op_GGML_OP_VIEW: ggml_op = 25; +pub const ggml_op_GGML_OP_PERMUTE: ggml_op = 26; +pub const ggml_op_GGML_OP_TRANSPOSE: ggml_op = 27; +pub const ggml_op_GGML_OP_GET_ROWS: ggml_op = 28; +pub const ggml_op_GGML_OP_DIAG_MASK_INF: ggml_op = 29; +pub const ggml_op_GGML_OP_SOFT_MAX: ggml_op = 30; +pub const ggml_op_GGML_OP_ROPE: ggml_op = 31; +pub const ggml_op_GGML_OP_ALIBI: ggml_op = 32; +pub const ggml_op_GGML_OP_CONV_1D_1S: ggml_op = 33; +pub const ggml_op_GGML_OP_CONV_1D_2S: ggml_op = 34; +pub const ggml_op_GGML_OP_FLASH_ATTN: ggml_op = 35; +pub const ggml_op_GGML_OP_FLASH_FF: ggml_op = 36; +pub const ggml_op_GGML_OP_MAP_UNARY: ggml_op = 37; +pub const ggml_op_GGML_OP_MAP_BINARY: ggml_op = 38; +pub const ggml_op_GGML_OP_COUNT: ggml_op = 39; +pub type ggml_op = ::std::os::raw::c_uint; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct ggml_object { + pub offs: usize, + pub size: usize, + pub next: *mut ggml_object, + pub padding: [::std::os::raw::c_char; 8usize], +} +#[test] +fn bindgen_test_layout_ggml_object() { + const UNINIT: ::std::mem::MaybeUninit = ::std::mem::MaybeUninit::uninit(); + let ptr = UNINIT.as_ptr(); + assert_eq!( + ::std::mem::size_of::(), + 32usize, + concat!("Size of: ", stringify!(ggml_object)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(ggml_object)) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).offs) as usize - ptr as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(ggml_object), + "::", + stringify!(offs) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).size) as usize - ptr as usize }, + 8usize, + concat!( + "Offset of field: ", + stringify!(ggml_object), + "::", + stringify!(size) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).next) as usize - ptr as usize }, + 16usize, + concat!( + "Offset of field: 
", + stringify!(ggml_object), + "::", + stringify!(next) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).padding) as usize - ptr as usize }, + 24usize, + concat!( + "Offset of field: ", + stringify!(ggml_object), + "::", + stringify!(padding) + ) + ); +} +pub const GGML_OBJECT_SIZE: usize = 32; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct ggml_tensor { + pub type_: ggml_type, + pub n_dims: ::std::os::raw::c_int, + pub ne: [i64; 4usize], + pub nb: [usize; 4usize], + pub op: ggml_op, + pub is_param: bool, + pub grad: *mut ggml_tensor, + pub src0: *mut ggml_tensor, + pub src1: *mut ggml_tensor, + pub opt: [*mut ggml_tensor; 4usize], + pub n_tasks: ::std::os::raw::c_int, + pub perf_runs: ::std::os::raw::c_int, + pub perf_cycles: i64, + pub perf_time_us: i64, + pub data: *mut ::std::os::raw::c_void, + pub padding: [::std::os::raw::c_char; 8usize], +} +#[test] +fn bindgen_test_layout_ggml_tensor() { + const UNINIT: ::std::mem::MaybeUninit = ::std::mem::MaybeUninit::uninit(); + let ptr = UNINIT.as_ptr(); + assert_eq!( + ::std::mem::size_of::(), + 176usize, + concat!("Size of: ", stringify!(ggml_tensor)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(ggml_tensor)) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).type_) as usize - ptr as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(ggml_tensor), + "::", + stringify!(type_) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).n_dims) as usize - ptr as usize }, + 4usize, + concat!( + "Offset of field: ", + stringify!(ggml_tensor), + "::", + stringify!(n_dims) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).ne) as usize - ptr as usize }, + 8usize, + concat!( + "Offset of field: ", + stringify!(ggml_tensor), + "::", + stringify!(ne) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).nb) as usize - ptr as usize }, + 40usize, + concat!( + "Offset of field: ", + stringify!(ggml_tensor), + "::", + stringify!(nb) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).op) as usize - ptr as usize }, + 72usize, + concat!( + "Offset of field: ", + stringify!(ggml_tensor), + "::", + stringify!(op) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).is_param) as usize - ptr as usize }, + 76usize, + concat!( + "Offset of field: ", + stringify!(ggml_tensor), + "::", + stringify!(is_param) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).grad) as usize - ptr as usize }, + 80usize, + concat!( + "Offset of field: ", + stringify!(ggml_tensor), + "::", + stringify!(grad) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).src0) as usize - ptr as usize }, + 88usize, + concat!( + "Offset of field: ", + stringify!(ggml_tensor), + "::", + stringify!(src0) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).src1) as usize - ptr as usize }, + 96usize, + concat!( + "Offset of field: ", + stringify!(ggml_tensor), + "::", + stringify!(src1) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).opt) as usize - ptr as usize }, + 104usize, + concat!( + "Offset of field: ", + stringify!(ggml_tensor), + "::", + stringify!(opt) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).n_tasks) as usize - ptr as usize }, + 136usize, + concat!( + "Offset of field: ", + stringify!(ggml_tensor), + "::", + stringify!(n_tasks) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).perf_runs) as usize - ptr as usize }, + 140usize, + concat!( + "Offset of field: ", + stringify!(ggml_tensor), 
+ "::", + stringify!(perf_runs) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).perf_cycles) as usize - ptr as usize }, + 144usize, + concat!( + "Offset of field: ", + stringify!(ggml_tensor), + "::", + stringify!(perf_cycles) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).perf_time_us) as usize - ptr as usize }, + 152usize, + concat!( + "Offset of field: ", + stringify!(ggml_tensor), + "::", + stringify!(perf_time_us) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).data) as usize - ptr as usize }, + 160usize, + concat!( + "Offset of field: ", + stringify!(ggml_tensor), + "::", + stringify!(data) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).padding) as usize - ptr as usize }, + 168usize, + concat!( + "Offset of field: ", + stringify!(ggml_tensor), + "::", + stringify!(padding) + ) + ); +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct ggml_cgraph { + pub n_nodes: ::std::os::raw::c_int, + pub n_leafs: ::std::os::raw::c_int, + pub n_threads: ::std::os::raw::c_int, + pub work_size: usize, + pub work: *mut ggml_tensor, + pub nodes: [*mut ggml_tensor; 4096usize], + pub grads: [*mut ggml_tensor; 4096usize], + pub leafs: [*mut ggml_tensor; 4096usize], + pub perf_runs: ::std::os::raw::c_int, + pub perf_cycles: i64, + pub perf_time_us: i64, +} +#[test] +fn bindgen_test_layout_ggml_cgraph() { + const UNINIT: ::std::mem::MaybeUninit = ::std::mem::MaybeUninit::uninit(); + let ptr = UNINIT.as_ptr(); + assert_eq!( + ::std::mem::size_of::(), + 98360usize, + concat!("Size of: ", stringify!(ggml_cgraph)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(ggml_cgraph)) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).n_nodes) as usize - ptr as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(ggml_cgraph), + "::", + stringify!(n_nodes) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).n_leafs) as usize - ptr as usize }, + 4usize, + concat!( + "Offset of field: ", + stringify!(ggml_cgraph), + "::", + stringify!(n_leafs) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).n_threads) as usize - ptr as usize }, + 8usize, + concat!( + "Offset of field: ", + stringify!(ggml_cgraph), + "::", + stringify!(n_threads) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).work_size) as usize - ptr as usize }, + 16usize, + concat!( + "Offset of field: ", + stringify!(ggml_cgraph), + "::", + stringify!(work_size) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).work) as usize - ptr as usize }, + 24usize, + concat!( + "Offset of field: ", + stringify!(ggml_cgraph), + "::", + stringify!(work) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).nodes) as usize - ptr as usize }, + 32usize, + concat!( + "Offset of field: ", + stringify!(ggml_cgraph), + "::", + stringify!(nodes) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).grads) as usize - ptr as usize }, + 32800usize, + concat!( + "Offset of field: ", + stringify!(ggml_cgraph), + "::", + stringify!(grads) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).leafs) as usize - ptr as usize }, + 65568usize, + concat!( + "Offset of field: ", + stringify!(ggml_cgraph), + "::", + stringify!(leafs) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).perf_runs) as usize - ptr as usize }, + 98336usize, + concat!( + "Offset of field: ", + stringify!(ggml_cgraph), + "::", + stringify!(perf_runs) + ) + ); + assert_eq!( + unsafe { 
::std::ptr::addr_of!((*ptr).perf_cycles) as usize - ptr as usize }, + 98344usize, + concat!( + "Offset of field: ", + stringify!(ggml_cgraph), + "::", + stringify!(perf_cycles) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).perf_time_us) as usize - ptr as usize }, + 98352usize, + concat!( + "Offset of field: ", + stringify!(ggml_cgraph), + "::", + stringify!(perf_time_us) + ) + ); +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct ggml_scratch { + pub offs: usize, + pub size: usize, + pub data: *mut ::std::os::raw::c_void, +} +#[test] +fn bindgen_test_layout_ggml_scratch() { + const UNINIT: ::std::mem::MaybeUninit = ::std::mem::MaybeUninit::uninit(); + let ptr = UNINIT.as_ptr(); + assert_eq!( + ::std::mem::size_of::(), + 24usize, + concat!("Size of: ", stringify!(ggml_scratch)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(ggml_scratch)) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).offs) as usize - ptr as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(ggml_scratch), + "::", + stringify!(offs) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).size) as usize - ptr as usize }, + 8usize, + concat!( + "Offset of field: ", + stringify!(ggml_scratch), + "::", + stringify!(size) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).data) as usize - ptr as usize }, + 16usize, + concat!( + "Offset of field: ", + stringify!(ggml_scratch), + "::", + stringify!(data) + ) + ); +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct ggml_init_params { + pub mem_size: usize, + pub mem_buffer: *mut ::std::os::raw::c_void, + pub no_alloc: bool, +} +#[test] +fn bindgen_test_layout_ggml_init_params() { + const UNINIT: ::std::mem::MaybeUninit = ::std::mem::MaybeUninit::uninit(); + let ptr = UNINIT.as_ptr(); + assert_eq!( + ::std::mem::size_of::(), + 24usize, + concat!("Size of: ", stringify!(ggml_init_params)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(ggml_init_params)) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).mem_size) as usize - ptr as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(ggml_init_params), + "::", + stringify!(mem_size) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).mem_buffer) as usize - ptr as usize }, + 8usize, + concat!( + "Offset of field: ", + stringify!(ggml_init_params), + "::", + stringify!(mem_buffer) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).no_alloc) as usize - ptr as usize }, + 16usize, + concat!( + "Offset of field: ", + stringify!(ggml_init_params), + "::", + stringify!(no_alloc) + ) + ); +} +extern "C" { + pub fn ggml_time_init(); +} +extern "C" { + pub fn ggml_time_ms() -> i64; +} +extern "C" { + pub fn ggml_time_us() -> i64; +} +extern "C" { + pub fn ggml_cycles() -> i64; +} +extern "C" { + pub fn ggml_cycles_per_ms() -> i64; +} +extern "C" { + pub fn ggml_print_object(obj: *const ggml_object); +} +extern "C" { + pub fn ggml_print_objects(ctx: *const ggml_context); +} +extern "C" { + pub fn ggml_nelements(tensor: *const ggml_tensor) -> i64; +} +extern "C" { + pub fn ggml_nbytes(tensor: *const ggml_tensor) -> usize; +} +extern "C" { + pub fn ggml_blck_size(type_: ggml_type) -> ::std::os::raw::c_int; +} +extern "C" { + pub fn ggml_type_size(type_: ggml_type) -> usize; +} +extern "C" { + pub fn ggml_type_sizef(type_: ggml_type) -> f32; +} +extern "C" { + pub fn ggml_type_name(type_: ggml_type) -> *const ::std::os::raw::c_char; +} +extern 
"C" { + pub fn ggml_element_size(tensor: *const ggml_tensor) -> usize; +} +extern "C" { + pub fn ggml_is_quantized(type_: ggml_type) -> bool; +} +extern "C" { + pub fn ggml_ftype_to_ggml_type(ftype: ggml_ftype) -> ggml_type; +} +extern "C" { + pub fn ggml_init(params: ggml_init_params) -> *mut ggml_context; +} +extern "C" { + pub fn ggml_free(ctx: *mut ggml_context); +} +extern "C" { + pub fn ggml_used_mem(ctx: *const ggml_context) -> usize; +} +extern "C" { + pub fn ggml_set_scratch(ctx: *mut ggml_context, scratch: ggml_scratch) -> usize; +} +extern "C" { + pub fn ggml_new_tensor( + ctx: *mut ggml_context, + type_: ggml_type, + n_dims: ::std::os::raw::c_int, + ne: *const i64, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_new_tensor_1d( + ctx: *mut ggml_context, + type_: ggml_type, + ne0: i64, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_new_tensor_2d( + ctx: *mut ggml_context, + type_: ggml_type, + ne0: i64, + ne1: i64, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_new_tensor_3d( + ctx: *mut ggml_context, + type_: ggml_type, + ne0: i64, + ne1: i64, + ne2: i64, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_new_tensor_4d( + ctx: *mut ggml_context, + type_: ggml_type, + ne0: i64, + ne1: i64, + ne2: i64, + ne3: i64, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_new_i32(ctx: *mut ggml_context, value: i32) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_new_f32(ctx: *mut ggml_context, value: f32) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_dup_tensor(ctx: *mut ggml_context, src: *const ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_view_tensor(ctx: *mut ggml_context, src: *const ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_set_zero(tensor: *mut ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_set_i32(tensor: *mut ggml_tensor, value: i32) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_set_f32(tensor: *mut ggml_tensor, value: f32) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_get_i32_1d(tensor: *const ggml_tensor, i: ::std::os::raw::c_int) -> i32; +} +extern "C" { + pub fn ggml_set_i32_1d(tensor: *const ggml_tensor, i: ::std::os::raw::c_int, value: i32); +} +extern "C" { + pub fn ggml_get_f32_1d(tensor: *const ggml_tensor, i: ::std::os::raw::c_int) -> f32; +} +extern "C" { + pub fn ggml_set_f32_1d(tensor: *const ggml_tensor, i: ::std::os::raw::c_int, value: f32); +} +extern "C" { + pub fn ggml_get_data(tensor: *const ggml_tensor) -> *mut ::std::os::raw::c_void; +} +extern "C" { + pub fn ggml_get_data_f32(tensor: *const ggml_tensor) -> *mut f32; +} +extern "C" { + pub fn ggml_dup(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_add( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_add_inplace( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_sub( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_mul( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_div( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_sqr(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_sqrt(ctx: *mut ggml_context, a: *mut 
ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_sum(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_mean(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_repeat( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_abs(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_sgn(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_neg(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_step(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_relu(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_gelu(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_silu(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_norm(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_rms_norm(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_mul_mat( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_scale( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_cpy( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_cont(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_reshape( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_reshape_2d( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + ne0: i64, + ne1: i64, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_reshape_3d( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + ne0: i64, + ne1: i64, + ne2: i64, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_view_1d( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + ne0: i64, + offset: usize, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_view_2d( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + ne0: i64, + ne1: i64, + nb1: usize, + offset: usize, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_view_3d( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + ne0: i64, + ne1: i64, + ne2: i64, + nb1: usize, + nb2: usize, + offset: usize, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_permute( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + axis0: ::std::os::raw::c_int, + axis1: ::std::os::raw::c_int, + axis2: ::std::os::raw::c_int, + axis3: ::std::os::raw::c_int, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_transpose(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_get_rows( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_diag_mask_inf( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + n_past: ::std::os::raw::c_int, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_soft_max(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_rope( + ctx: *mut ggml_context, + a: 
*mut ggml_tensor, + n_past: ::std::os::raw::c_int, + n_dims: ::std::os::raw::c_int, + mode: ::std::os::raw::c_int, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_alibi( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + n_past: ::std::os::raw::c_int, + n_head: ::std::os::raw::c_int, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_conv_1d_1s( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_conv_1d_2s( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_flash_attn( + ctx: *mut ggml_context, + q: *mut ggml_tensor, + k: *mut ggml_tensor, + v: *mut ggml_tensor, + masked: bool, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_flash_ff( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b0: *mut ggml_tensor, + b1: *mut ggml_tensor, + c0: *mut ggml_tensor, + c1: *mut ggml_tensor, + ) -> *mut ggml_tensor; +} +pub type ggml_unary_op_f32_t = ::std::option::Option< + unsafe extern "C" fn(arg1: ::std::os::raw::c_int, arg2: *mut f32, arg3: *const f32), +>; +pub type ggml_binary_op_f32_t = ::std::option::Option< + unsafe extern "C" fn( + arg1: ::std::os::raw::c_int, + arg2: *mut f32, + arg3: *const f32, + arg4: *const f32, + ), +>; +extern "C" { + pub fn ggml_map_unary_f32( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + fun: ggml_unary_op_f32_t, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_map_binary_f32( + ctx: *mut ggml_context, + a: *mut ggml_tensor, + b: *mut ggml_tensor, + fun: ggml_binary_op_f32_t, + ) -> *mut ggml_tensor; +} +extern "C" { + pub fn ggml_set_param(ctx: *mut ggml_context, tensor: *mut ggml_tensor); +} +extern "C" { + pub fn ggml_build_forward_expand(cgraph: *mut ggml_cgraph, tensor: *mut ggml_tensor); +} +extern "C" { + pub fn ggml_build_forward(tensor: *mut ggml_tensor) -> ggml_cgraph; +} +extern "C" { + pub fn ggml_build_backward( + ctx: *mut ggml_context, + gf: *mut ggml_cgraph, + keep: bool, + ) -> ggml_cgraph; +} +extern "C" { + pub fn ggml_graph_compute(ctx: *mut ggml_context, cgraph: *mut ggml_cgraph); +} +extern "C" { + pub fn ggml_graph_reset(cgraph: *mut ggml_cgraph); +} +extern "C" { + pub fn ggml_graph_print(cgraph: *const ggml_cgraph); +} +extern "C" { + pub fn ggml_graph_dump_dot( + gb: *const ggml_cgraph, + gf: *const ggml_cgraph, + filename: *const ::std::os::raw::c_char, + ); +} +pub const ggml_opt_type_GGML_OPT_ADAM: ggml_opt_type = 0; +pub const ggml_opt_type_GGML_OPT_LBFGS: ggml_opt_type = 1; +pub type ggml_opt_type = ::std::os::raw::c_uint; +pub const ggml_linesearch_GGML_LINESEARCH_DEFAULT: ggml_linesearch = 1; +pub const ggml_linesearch_GGML_LINESEARCH_BACKTRACKING_ARMIJO: ggml_linesearch = 0; +pub const ggml_linesearch_GGML_LINESEARCH_BACKTRACKING_WOLFE: ggml_linesearch = 1; +pub const ggml_linesearch_GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE: ggml_linesearch = 2; +pub type ggml_linesearch = ::std::os::raw::c_uint; +pub const ggml_opt_result_GGML_OPT_OK: ggml_opt_result = 0; +pub const ggml_opt_result_GGML_OPT_DID_NOT_CONVERGE: ggml_opt_result = 1; +pub const ggml_opt_result_GGML_OPT_NO_CONTEXT: ggml_opt_result = 2; +pub const ggml_opt_result_GGML_OPT_INVALID_WOLFE: ggml_opt_result = 3; +pub const ggml_opt_result_GGML_OPT_FAIL: ggml_opt_result = 4; +pub const ggml_opt_result_GGML_LINESEARCH_FAIL: ggml_opt_result = -128; +pub const ggml_opt_result_GGML_LINESEARCH_MINIMUM_STEP: ggml_opt_result = -127; +pub const ggml_opt_result_GGML_LINESEARCH_MAXIMUM_STEP: 
ggml_opt_result = -126; +pub const ggml_opt_result_GGML_LINESEARCH_MAXIMUM_ITERATIONS: ggml_opt_result = -125; +pub const ggml_opt_result_GGML_LINESEARCH_INVALID_PARAMETERS: ggml_opt_result = -124; +pub type ggml_opt_result = ::std::os::raw::c_int; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct ggml_opt_params { + pub type_: ggml_opt_type, + pub n_threads: ::std::os::raw::c_int, + pub past: ::std::os::raw::c_int, + pub delta: f32, + pub max_no_improvement: ::std::os::raw::c_int, + pub print_forward_graph: bool, + pub print_backward_graph: bool, + pub adam: ggml_opt_params__bindgen_ty_1, + pub lbfgs: ggml_opt_params__bindgen_ty_2, +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct ggml_opt_params__bindgen_ty_1 { + pub n_iter: ::std::os::raw::c_int, + pub alpha: f32, + pub beta1: f32, + pub beta2: f32, + pub eps: f32, + pub eps_f: f32, + pub eps_g: f32, +} +#[test] +fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() { + const UNINIT: ::std::mem::MaybeUninit = + ::std::mem::MaybeUninit::uninit(); + let ptr = UNINIT.as_ptr(); + assert_eq!( + ::std::mem::size_of::(), + 28usize, + concat!("Size of: ", stringify!(ggml_opt_params__bindgen_ty_1)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(ggml_opt_params__bindgen_ty_1)) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).n_iter) as usize - ptr as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_params__bindgen_ty_1), + "::", + stringify!(n_iter) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).alpha) as usize - ptr as usize }, + 4usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_params__bindgen_ty_1), + "::", + stringify!(alpha) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).beta1) as usize - ptr as usize }, + 8usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_params__bindgen_ty_1), + "::", + stringify!(beta1) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).beta2) as usize - ptr as usize }, + 12usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_params__bindgen_ty_1), + "::", + stringify!(beta2) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).eps) as usize - ptr as usize }, + 16usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_params__bindgen_ty_1), + "::", + stringify!(eps) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).eps_f) as usize - ptr as usize }, + 20usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_params__bindgen_ty_1), + "::", + stringify!(eps_f) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).eps_g) as usize - ptr as usize }, + 24usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_params__bindgen_ty_1), + "::", + stringify!(eps_g) + ) + ); +} +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct ggml_opt_params__bindgen_ty_2 { + pub m: ::std::os::raw::c_int, + pub n_iter: ::std::os::raw::c_int, + pub max_linesearch: ::std::os::raw::c_int, + pub eps: f32, + pub ftol: f32, + pub wolfe: f32, + pub min_step: f32, + pub max_step: f32, + pub linesearch: ggml_linesearch, +} +#[test] +fn bindgen_test_layout_ggml_opt_params__bindgen_ty_2() { + const UNINIT: ::std::mem::MaybeUninit = + ::std::mem::MaybeUninit::uninit(); + let ptr = UNINIT.as_ptr(); + assert_eq!( + ::std::mem::size_of::(), + 36usize, + concat!("Size of: ", stringify!(ggml_opt_params__bindgen_ty_2)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(ggml_opt_params__bindgen_ty_2)) 
+ ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).m) as usize - ptr as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_params__bindgen_ty_2), + "::", + stringify!(m) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).n_iter) as usize - ptr as usize }, + 4usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_params__bindgen_ty_2), + "::", + stringify!(n_iter) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).max_linesearch) as usize - ptr as usize }, + 8usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_params__bindgen_ty_2), + "::", + stringify!(max_linesearch) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).eps) as usize - ptr as usize }, + 12usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_params__bindgen_ty_2), + "::", + stringify!(eps) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).ftol) as usize - ptr as usize }, + 16usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_params__bindgen_ty_2), + "::", + stringify!(ftol) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).wolfe) as usize - ptr as usize }, + 20usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_params__bindgen_ty_2), + "::", + stringify!(wolfe) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).min_step) as usize - ptr as usize }, + 24usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_params__bindgen_ty_2), + "::", + stringify!(min_step) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).max_step) as usize - ptr as usize }, + 28usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_params__bindgen_ty_2), + "::", + stringify!(max_step) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).linesearch) as usize - ptr as usize }, + 32usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_params__bindgen_ty_2), + "::", + stringify!(linesearch) + ) + ); +} +#[test] +fn bindgen_test_layout_ggml_opt_params() { + const UNINIT: ::std::mem::MaybeUninit = ::std::mem::MaybeUninit::uninit(); + let ptr = UNINIT.as_ptr(); + assert_eq!( + ::std::mem::size_of::(), + 88usize, + concat!("Size of: ", stringify!(ggml_opt_params)) + ); + assert_eq!( + ::std::mem::align_of::(), + 4usize, + concat!("Alignment of ", stringify!(ggml_opt_params)) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).type_) as usize - ptr as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_params), + "::", + stringify!(type_) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).n_threads) as usize - ptr as usize }, + 4usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_params), + "::", + stringify!(n_threads) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).past) as usize - ptr as usize }, + 8usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_params), + "::", + stringify!(past) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).delta) as usize - ptr as usize }, + 12usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_params), + "::", + stringify!(delta) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).max_no_improvement) as usize - ptr as usize }, + 16usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_params), + "::", + stringify!(max_no_improvement) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).print_forward_graph) as usize - ptr as usize }, + 20usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_params), + "::", 
+ stringify!(print_forward_graph) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).print_backward_graph) as usize - ptr as usize }, + 21usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_params), + "::", + stringify!(print_backward_graph) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).adam) as usize - ptr as usize }, + 24usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_params), + "::", + stringify!(adam) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).lbfgs) as usize - ptr as usize }, + 52usize, + concat!( + "Offset of field: ", + stringify!(ggml_opt_params), + "::", + stringify!(lbfgs) + ) + ); +} +extern "C" { + pub fn ggml_opt_default_params(type_: ggml_opt_type) -> ggml_opt_params; +} +extern "C" { + pub fn ggml_opt( + ctx: *mut ggml_context, + params: ggml_opt_params, + f: *mut ggml_tensor, + ) -> ggml_opt_result; +} +extern "C" { + pub fn ggml_quantize_q4_0( + src: *const f32, + dst: *mut ::std::os::raw::c_void, + n: ::std::os::raw::c_int, + k: ::std::os::raw::c_int, + hist: *mut i64, + ) -> usize; +} +extern "C" { + pub fn ggml_quantize_q4_1( + src: *const f32, + dst: *mut ::std::os::raw::c_void, + n: ::std::os::raw::c_int, + k: ::std::os::raw::c_int, + hist: *mut i64, + ) -> usize; +} +extern "C" { + pub fn ggml_quantize_q4_2( + src: *const f32, + dst: *mut ::std::os::raw::c_void, + n: ::std::os::raw::c_int, + k: ::std::os::raw::c_int, + hist: *mut i64, + ) -> usize; +} +extern "C" { + pub fn ggml_quantize_q5_0( + src: *const f32, + dst: *mut ::std::os::raw::c_void, + n: ::std::os::raw::c_int, + k: ::std::os::raw::c_int, + hist: *mut i64, + ) -> usize; +} +extern "C" { + pub fn ggml_quantize_q5_1( + src: *const f32, + dst: *mut ::std::os::raw::c_void, + n: ::std::os::raw::c_int, + k: ::std::os::raw::c_int, + hist: *mut i64, + ) -> usize; +} +extern "C" { + pub fn ggml_quantize_q8_0( + src: *const f32, + dst: *mut ::std::os::raw::c_void, + n: ::std::os::raw::c_int, + k: ::std::os::raw::c_int, + hist: *mut i64, + ) -> usize; +} +extern "C" { + pub fn ggml_quantize_chunk( + type_: ggml_type, + src: *const f32, + dst: *mut ::std::os::raw::c_void, + start: ::std::os::raw::c_int, + n: ::std::os::raw::c_int, + hist: *mut i64, + ) -> usize; +} +extern "C" { + pub fn ggml_cpu_has_avx() -> ::std::os::raw::c_int; +} +extern "C" { + pub fn ggml_cpu_has_avx2() -> ::std::os::raw::c_int; +} +extern "C" { + pub fn ggml_cpu_has_avx512() -> ::std::os::raw::c_int; +} +extern "C" { + pub fn ggml_cpu_has_avx512_vbmi() -> ::std::os::raw::c_int; +} +extern "C" { + pub fn ggml_cpu_has_avx512_vnni() -> ::std::os::raw::c_int; +} +extern "C" { + pub fn ggml_cpu_has_fma() -> ::std::os::raw::c_int; +} +extern "C" { + pub fn ggml_cpu_has_neon() -> ::std::os::raw::c_int; +} +extern "C" { + pub fn ggml_cpu_has_arm_fma() -> ::std::os::raw::c_int; +} +extern "C" { + pub fn ggml_cpu_has_f16c() -> ::std::os::raw::c_int; +} +extern "C" { + pub fn ggml_cpu_has_fp16_va() -> ::std::os::raw::c_int; +} +extern "C" { + pub fn ggml_cpu_has_wasm_simd() -> ::std::os::raw::c_int; +} +extern "C" { + pub fn ggml_cpu_has_blas() -> ::std::os::raw::c_int; +} +extern "C" { + pub fn ggml_cpu_has_cublas() -> ::std::os::raw::c_int; +} +extern "C" { + pub fn ggml_cpu_has_clblast() -> ::std::os::raw::c_int; +} +extern "C" { + pub fn ggml_cpu_has_gpublas() -> ::std::os::raw::c_int; +} +extern "C" { + pub fn ggml_cpu_has_sse3() -> ::std::os::raw::c_int; +} +extern "C" { + pub fn ggml_cpu_has_vsx() -> ::std::os::raw::c_int; +} +pub type dequantize_row_q_t = 
::std::option::Option< + unsafe extern "C" fn(x: *const ::std::os::raw::c_void, y: *mut f32, k: ::std::os::raw::c_int), +>; +pub type quantize_row_q_t = ::std::option::Option< + unsafe extern "C" fn(x: *const f32, y: *mut ::std::os::raw::c_void, k: ::std::os::raw::c_int), +>; +pub type vec_dot_q_t = ::std::option::Option< + unsafe extern "C" fn( + n: ::std::os::raw::c_int, + s: *mut f32, + x: *const ::std::os::raw::c_void, + y: *const ::std::os::raw::c_void, + ), +>; +#[repr(C)] +#[derive(Debug, Copy, Clone)] +pub struct quantize_fns_t { + pub dequantize_row_q: dequantize_row_q_t, + pub quantize_row_q: quantize_row_q_t, + pub quantize_row_q_reference: quantize_row_q_t, + pub quantize_row_q_dot: quantize_row_q_t, + pub vec_dot_q: vec_dot_q_t, + pub vec_dot_type: ggml_type, +} +#[test] +fn bindgen_test_layout_quantize_fns_t() { + const UNINIT: ::std::mem::MaybeUninit = ::std::mem::MaybeUninit::uninit(); + let ptr = UNINIT.as_ptr(); + assert_eq!( + ::std::mem::size_of::(), + 48usize, + concat!("Size of: ", stringify!(quantize_fns_t)) + ); + assert_eq!( + ::std::mem::align_of::(), + 8usize, + concat!("Alignment of ", stringify!(quantize_fns_t)) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).dequantize_row_q) as usize - ptr as usize }, + 0usize, + concat!( + "Offset of field: ", + stringify!(quantize_fns_t), + "::", + stringify!(dequantize_row_q) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).quantize_row_q) as usize - ptr as usize }, + 8usize, + concat!( + "Offset of field: ", + stringify!(quantize_fns_t), + "::", + stringify!(quantize_row_q) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).quantize_row_q_reference) as usize - ptr as usize }, + 16usize, + concat!( + "Offset of field: ", + stringify!(quantize_fns_t), + "::", + stringify!(quantize_row_q_reference) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).quantize_row_q_dot) as usize - ptr as usize }, + 24usize, + concat!( + "Offset of field: ", + stringify!(quantize_fns_t), + "::", + stringify!(quantize_row_q_dot) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).vec_dot_q) as usize - ptr as usize }, + 32usize, + concat!( + "Offset of field: ", + stringify!(quantize_fns_t), + "::", + stringify!(vec_dot_q) + ) + ); + assert_eq!( + unsafe { ::std::ptr::addr_of!((*ptr).vec_dot_type) as usize - ptr as usize }, + 40usize, + concat!( + "Offset of field: ", + stringify!(quantize_fns_t), + "::", + stringify!(vec_dot_type) + ) + ); +} +extern "C" { + pub fn ggml_internal_get_quantize_fn(i: usize) -> quantize_fns_t; +} diff --git a/tools/generate-ggml-bindings/Cargo.toml b/tools/generate-ggml-bindings/Cargo.toml new file mode 100644 index 00000000..0efc3ef6 --- /dev/null +++ b/tools/generate-ggml-bindings/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "generate-ggml-bindings" +version = "0.1.0" +edition = "2021" +publish = false + +[dependencies] +bindgen = "0.65.1" diff --git a/tools/generate-ggml-bindings/src/main.rs b/tools/generate-ggml-bindings/src/main.rs new file mode 100644 index 00000000..aca90252 --- /dev/null +++ b/tools/generate-ggml-bindings/src/main.rs @@ -0,0 +1,28 @@ +//! Helper tool to generate the bindings for the ggml crate. +//! +//! Assumed to be run from the root of the workspace. 
+ +use std::path::PathBuf; + +fn main() { + const HEADER_PATH: &str = "ggml/sys/ggml/include/ggml/ggml.h"; + + let bindings = bindgen::Builder::default() + .header(HEADER_PATH) + // Suppress some warnings + .raw_line("#![allow(non_upper_case_globals)]") + .raw_line("#![allow(non_camel_case_types)]") + .raw_line("#![allow(non_snake_case)]") + .raw_line("#![allow(unused)]") + // Do not generate code for ggml's includes (stdlib) + .allowlist_file(HEADER_PATH) + .generate() + .expect("Unable to generate bindings"); + + let out_path = PathBuf::from("ggml").join("sys").join("src").join("lib.rs"); + bindings + .write_to_file(out_path) + .expect("Couldn't write bindings"); + + println!("Successfully updated bindings"); +} From 9314c68639c3b202fcae74de27c0eae80e3c8fef Mon Sep 17 00:00:00 2001 From: Philpax Date: Sun, 30 Apr 2023 23:32:26 +0200 Subject: [PATCH 35/35] fix: check out submodules for clippy CI --- .github/workflows/rust.yml | 42 ++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index bd2da4b9..af2280f3 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -2,9 +2,9 @@ name: Rust on: push: - branches: [ "main" ] + branches: ["main"] pull_request: - branches: [ "main" ] + branches: ["main"] env: CARGO_TERM_COLOR: always @@ -18,25 +18,27 @@ jobs: os: [windows-latest, ubuntu-latest, macos-latest] runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v3 - with: - submodules: recursive - - uses: dtolnay/rust-toolchain@stable - - name: Check - run: cargo check --verbose - - name: Build - run: cargo build --verbose - - name: Run tests - run: cargo test --verbose + - uses: actions/checkout@v3 + with: + submodules: recursive + - uses: dtolnay/rust-toolchain@stable + - name: Check + run: cargo check --verbose + - name: Build + run: cargo build --verbose + - name: Run tests + run: cargo test --verbose fmt: name: Clippy and formatting runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - uses: dtolnay/rust-toolchain@stable - with: - components: rustfmt, clippy - - name: Formatting - run: cargo fmt --all -- --check - - name: Clippy - run: cargo clippy -- -Dclippy::all # -Dclippy::pedantic + - uses: actions/checkout@v3 + with: + submodules: recursive + - uses: dtolnay/rust-toolchain@stable + with: + components: rustfmt, clippy + - name: Formatting + run: cargo fmt --all -- --check + - name: Clippy + run: cargo clippy -- -Dclippy::all # -Dclippy::pedantic
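
A closing note on the refactored API in this series: the safe wrappers in the `ggml` crate (`type_size`, `blck_size`, `quantize_q4_0`, `quantize_q4_1`, and the `Tensor` methods) keep their Rust-level signatures but now call into the raw FFI symbols provided by the new `ggml-sys` crate. A minimal usage sketch of the quantization wrapper, assuming these free functions are exported from the `ggml` crate root as shown in the hunks above; the fields of `QuantizationResult` are not visible in this patch, so the result is left untouched here:

    // Quantize 4 rows of 32 f32 values to Q4_0 through the safe wrapper.
    // Per the doc comment above, `src.len()` must equal `n_elements`,
    // and `n_elements_0` is the first dimension (row length) of `src`.
    fn quantize_example() {
        let src: Vec<f32> = vec![0.1; 4 * 32];
        let _result = ggml::quantize_q4_0(&src, src.len(), 32);
    }

With the bindgen step removed from ggml/sys/build.rs, the checked-in bindings in ggml/sys/src/lib.rs are regenerated by running the new tool from the workspace root after bumping the ggml submodule, e.g. `cargo run -p generate-ggml-bindings` (assuming the tools/generate-ggml-bindings package is registered as a workspace member).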