diff --git a/LICENSE b/LICENSE index 0b5e765..7757b14 100644 --- a/LICENSE +++ b/LICENSE @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright 2024 leafspark + Copyright (c) 2024-2025 leafspark Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/requirements.txt b/requirements.txt index fdafd9e..aa6ad08 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,13 @@ PyYAML~=6.0.2 -psutil~=6.1.0 +psutil~=6.1.1 pynvml~=12.0.0 PySide6~=6.8.1 -safetensors~=0.4.5 +safetensors~=0.5.0 numpy<2.0.0 torch~=2.5.1 sentencepiece~=0.2.0 -setuptools~=75.5.0 -huggingface-hub~=0.26.5 -transformers~=4.47.0 +setuptools~=75.6.0 +huggingface-hub~=0.27.0 +transformers~=4.47.1 fastapi~=0.115.6 uvicorn~=0.34.0 diff --git a/src/AutoGGUF.py b/src/AutoGGUF.py index c9d1255..d3593c0 100644 --- a/src/AutoGGUF.py +++ b/src/AutoGGUF.py @@ -500,7 +500,7 @@ def __init__(self, args: List[str]) -> None: # Timer for updating system info self.timer = QTimer() self.timer.timeout.connect(self.update_system_info) - self.timer.start(200) + self.timer.start(500) # Backend selection backend_layout = QHBoxLayout() @@ -1023,7 +1023,9 @@ def __init__(self, args: List[str]) -> None: hf_to_gguf_layout.addRow(OUTPUT_FILE, hf_outfile_layout) self.hf_outtype = QComboBox() - self.hf_outtype.addItems(["f32", "f16", "bf16", "q8_0", "auto"]) + self.hf_outtype.addItems( + ["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"] + ) hf_to_gguf_layout.addRow(OUTPUT_TYPE, self.hf_outtype) self.hf_vocab_only = QCheckBox(VOCAB_ONLY) diff --git a/src/GPUMonitor.py b/src/GPUMonitor.py index bec2827..edddcd8 100644 --- a/src/GPUMonitor.py +++ b/src/GPUMonitor.py @@ -95,7 +95,7 @@ def __init__(self, parent=None) -> None: self.timer = QTimer(self) self.timer.timeout.connect(self.update_gpu_info) - self.timer.start(200) # Update every 0.2 seconds + self.timer.start(500) # Update every 0.5 seconds self.gpu_data = [] self.vram_data = [] @@ -192,7 +192,7 @@ def update_graph_data() -> None: timer = QTimer(dialog) timer.timeout.connect(update_graph_data) - timer.start(200) # Update every 0.2 seconds + timer.start(500) # Update every 0.5 seconds dialog.exec() @@ -227,7 +227,7 @@ def update_graph_data() -> None: timer = QTimer(dialog) timer.timeout.connect(update_graph_data) - timer.start(200) # Update every 0.2 seconds + timer.start(500) # Update every 0.5 seconds tab_widget.addTab(gpu_graph, GPU_USAGE_OVER_TIME) tab_widget.addTab(vram_graph, VRAM_USAGE_OVER_TIME) diff --git a/src/convert_hf_to_gguf.py b/src/convert_hf_to_gguf.py index c1d2f97..b5b3ab6 100644 --- a/src/convert_hf_to_gguf.py +++ b/src/convert_hf_to_gguf.py @@ -23,6 +23,7 @@ TypeVar, cast, ) +from itertools import chain import math import numpy as np @@ -36,6 +37,9 @@ logger = logging.getLogger("hf-to-gguf") +###### MODEL DEFINITIONS ###### + + class SentencePieceTokenTypes(IntEnum): NORMAL = 1 UNKNOWN = 2 @@ -68,8 +72,8 @@ class Model: model_name: str | None metadata_override: Path | None dir_model_card: Path - is_lora: bool + # subclasses should define this! 
model_arch: gguf.MODEL_ARCH def __init__( @@ -86,7 +90,7 @@ def __init__( split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False, - is_lora: bool = False, + hparams: dict[str, Any] | None = None, ): if type(self) is Model: raise TypeError( @@ -110,7 +114,9 @@ def __init__( self.part_names = Model.get_model_part_names( self.dir_model, "pytorch_model", ".bin" ) - self.hparams = Model.load_hparams(self.dir_model) + self.hparams = ( + Model.load_hparams(self.dir_model) if hparams is None else hparams + ) self.block_count = self.find_hparam( ["n_layers", "num_hidden_layers", "n_layer", "num_layers"] ) @@ -118,11 +124,11 @@ def __init__( self.tensor_names = None self.metadata_override = metadata_override self.model_name = model_name - self.dir_model_card = dir_model - self.is_lora = is_lora + self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py + # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type if self.ftype == gguf.LlamaFileType.GUESSED: - + # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. _, first_tensor = next(self.get_tensors()) if first_tensor.dtype == torch.float16: logger.info( @@ -135,6 +141,7 @@ def __init__( ) self.ftype = gguf.LlamaFileType.MOSTLY_BF16 + # Configure GGUF Writer self.gguf_writer = gguf.GGUFWriter( path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], @@ -148,7 +155,8 @@ def __init__( @classmethod def __init_subclass__(cls): - + # can't use an abstract property, because overriding it without type errors + # would require using decorated functions instead of simply defining the property if "model_arch" not in cls.__dict__: raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}") @@ -219,6 +227,7 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: data = LazyTorchTensor.from_eager(data) yield name, data + # verify tensor name presence and identify potentially missing files if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0: missing = sorted(self.tensor_names.difference(tensor_names_from_parts)) extra = sorted(tensor_names_from_parts.difference(self.tensor_names)) @@ -285,9 +294,11 @@ def set_gguf_parameters(self): self.gguf_writer.add_context_length(n_ctx) logger.info(f"gguf: context length = {n_ctx}") - n_embd = self.find_hparam(["hidden_size", "n_embd"]) - self.gguf_writer.add_embedding_length(n_embd) - logger.info(f"gguf: embedding length = {n_embd}") + if ( + n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True) + ) is not None: + self.gguf_writer.add_embedding_length(n_embd) + logger.info(f"gguf: embedding length = {n_embd}") if ( n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True) @@ -295,9 +306,11 @@ def set_gguf_parameters(self): self.gguf_writer.add_feed_forward_length(n_ff) logger.info(f"gguf: feed forward length = {n_ff}") - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - self.gguf_writer.add_head_count(n_head) - logger.info(f"gguf: head count = {n_head}") + if ( + n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True) + ) is not None: + self.gguf_writer.add_head_count(n_head) + logger.info(f"gguf: head count = {n_head}") if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: self.gguf_writer.add_head_count_kv(n_head_kv) @@ -333,24 +346,30 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # 
unused return [(self.map_tensor_name(name), data_torch)] def tensor_force_quant( self, name: str, new_name: str, bid: int | None, n_dims: int ) -> gguf.GGMLQuantizationType | bool: - del name, new_name, bid, n_dims + del name, new_name, bid, n_dims # unused return False + # some models need extra generated tensors (like rope_freqs) + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + return () + def prepare_tensors(self): max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len( ".weight," ) - for name, data_torch in self.get_tensors(): - + for name, data_torch in chain( + self.generate_extra_tensors(), self.get_tensors() + ): + # we don't need these if name.endswith( (".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq") ): @@ -358,28 +377,37 @@ def prepare_tensors(self): old_dtype = data_torch.dtype + # convert any unsupported data types to float32 if data_torch.dtype not in (torch.float16, torch.float32): data_torch = data_torch.to(torch.float32) + # use the first number-like part of the tensor name as the block id bid = None for part in name.split("."): if part.isdecimal(): bid = int(part) break - for new_name, data in ( - (n, d.squeeze().numpy()) - for n, d in self.modify_tensors(data_torch, name, bid) - ): - data: np.ndarray + for new_name, data_torch in self.modify_tensors(data_torch, name, bid): + # TODO: why do we squeeze here? + # data = data_torch.squeeze().numpy() + data = data_torch.numpy() + + # if data ends up empty, it means data_torch was a scalar tensor -> restore + if len(data.shape) == 0: + data = data_torch.numpy() + n_dims = len(data.shape) data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant( name, new_name, bid, n_dims ) + # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors if n_dims <= 1 or new_name.endswith("_norm.weight"): data_qtype = gguf.GGMLQuantizationType.F32 + # Conditions should closely match those in llama_model_quantize_internal in llama.cpp + # Some tensor types are always in float32 if data_qtype is False and ( any( self.match_model_tensor_name(new_name, key, bid) @@ -393,6 +421,8 @@ def prepare_tensors(self): gguf.MODEL_TENSOR.TIME_MIX_W2, gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1, gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2, + gguf.MODEL_TENSOR.POSNET_NORM1, + gguf.MODEL_TENSOR.POSNET_NORM2, ) ) or not new_name.endswith(".weight") @@ -410,9 +440,10 @@ def prepare_tensors(self): gguf.LlamaFileType.MOSTLY_TQ1_0, gguf.LlamaFileType.MOSTLY_TQ2_0, ): - + # TODO: use Q4_K and Q6_K data_qtype = gguf.GGMLQuantizationType.F16 + # No override (data_qtype is False), or wants to be quantized (data_qtype is True) if isinstance(data_qtype, bool): if self.ftype == gguf.LlamaFileType.ALL_F32: data_qtype = gguf.GGMLQuantizationType.F32 @@ -442,8 +473,10 @@ def prepare_tensors(self): else data.shape ) + # reverse shape to make it similar to the internal ggml dimension order shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}" + # n_dims is implicit in the shape logger.info( f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}" ) @@ -463,18 +496,22 @@ def prepare_metadata(self, vocab_only: bool): self.metadata_override, self.dir_model_card, self.model_name, total_params ) + # Fallback to model directory name if metadata name is still missing if self.metadata.name is None: self.metadata.name = self.dir_model.name + # Generate parameter weight class (useful for leader boards) if not yet determined if self.metadata.size_label is 
None and total_params > 0: self.metadata.size_label = gguf.size_label( total_params, shared_params, expert_params, expert_count ) + # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0' output_type: str = self.ftype.name.partition("_")[2] + # Filename Output if self.fname_out.is_dir(): - + # Generate default filename based on model specification and available metadata if not vocab_only: fname_default: str = gguf.naming_convention( self.metadata.name, @@ -496,9 +533,14 @@ def prepare_metadata(self, vocab_only: bool): model_type="vocab", ) + # Use the default filename self.fname_out = self.fname_out / f"{fname_default}.gguf" else: + # Output path is a custom defined templated filename + # Note: `not is_dir()` is used because `.is_file()` will not detect + # file template strings as it doesn't actually exist as a file + # Process templated file name with the output ftype, useful with the "auto" ftype self.fname_out = self.fname_out.parent / gguf.fill_templated_filename( self.fname_out.name, output_type ) @@ -576,11 +618,13 @@ def does_token_look_special(self, token: str | bytes) -> bool: else: token_text = token + # Some models mark some added tokens which ought to be control tokens as not special. + # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2}) seems_special = token_text in ( - "", + "", # deepseek-coder "", "<2mass>", - "[@BOS@]", + "[@BOS@]", # gemma{,-2} ) seems_special = seems_special or ( @@ -588,14 +632,16 @@ def does_token_look_special(self, token: str | bytes) -> bool: ) seems_special = seems_special or ( token_text.startswith("<|") and token_text.endswith("|>") - ) + ) # deepseek-coder + # TODO: should these be marked as UNUSED instead? (maybe not) seems_special = seems_special or ( token_text.startswith("") - ) + ) # gemma{,-2} return seems_special + # used for GPT-2 BPE and WordPiece vocabs def get_vocab_base(self) -> tuple[list[str], list[int], str]: tokens: list[str] = [] toktypes: list[int] = [] @@ -620,12 +666,28 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: else: token: str = reverse_vocab[i] if token in added_vocab: + # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. + # To avoid unexpected issues - we make sure to normalize non-normalized tokens + if not tokenizer.added_tokens_decoder[i].normalized: + previous_token = token + token = tokenizer.decode( + tokenizer.encode(token, add_special_tokens=False) + ) + if previous_token != token: + logger.info( + f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer" + ) + if tokenizer.added_tokens_decoder[ i ].special or self.does_token_look_special(token): toktypes.append(gguf.TokenType.CONTROL) else: - token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") + # NOTE: this was added for Gemma. + # Encoding and decoding the tokens above isn't sufficient for this case. + token = token.replace( + b"\xe2\x96\x81".decode("utf-8"), " " + ) # pre-normalize user-defined spaces toktypes.append(gguf.TokenType.USER_DEFINED) else: toktypes.append(gguf.TokenType.NORMAL) @@ -633,7 +695,15 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: return tokens, toktypes, tokpre + # NOTE: this function is generated by convert_hf_to_gguf_update.py + # do not modify it manually! 
+ # ref: https://github.com/ggerganov/llama.cpp/pull/6920 + # Marker: Start get_vocab_base_pre def get_vocab_base_pre(self, tokenizer) -> str: + # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that + # is specific for the BPE pre-tokenizer used by the model + # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can + # use in llama.cpp to implement the same pre-tokenizer chktxt = "\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````\"\"\"\"......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL" @@ -645,96 +715,126 @@ def get_vocab_base_pre(self, tokenizer) -> str: res = None + # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script + # or pull the latest version of the model from Huggingface + # don't edit the hashes manually! if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": - + # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B res = "llama-bpe" if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754": - + # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base res = "deepseek-llm" if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821": - + # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base res = "deepseek-coder" if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed": - + # ref: https://huggingface.co/tiiuae/falcon-7b res = "falcon" + if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e": + # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base + res = "falcon3" if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": - + # ref: https://huggingface.co/BAAI/bge-small-en-v1.5 res = "bert-bge" + if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7": + # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5 + res = "bert-bge-large" if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": - + # ref: https://huggingface.co/mosaicml/mpt-7b res = "mpt" if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34": - + # ref: https://huggingface.co/bigcode/starcoder2-3b res = "starcoder" if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454": - + # ref: https://huggingface.co/openai-community/gpt2 res = "gpt-2" if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3": - + # ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b res = "stablelm2" if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff": - + # ref: https://huggingface.co/smallcloudai/Refact-1_6-base res = "refact" if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8": - + # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01 res = "command-r" if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea": - + # ref: https://huggingface.co/Qwen/Qwen1.5-7B res = "qwen2" if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": - + # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf res = "olmo" if chkhsh == 
"a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e": - + # ref: https://huggingface.co/databricks/dbrx-base res = "dbrx" + if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448": + # ref: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en + res = "jina-v1-en" if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": - + # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en res = "jina-v2-en" if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643": - + # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es res = "jina-v2-es" if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6": - + # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de res = "jina-v2-de" if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d": - + # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct res = "smaug-bpe" if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360": - + # ref: https://huggingface.co/LumiOpen/Poro-34B-chat res = "poro-chat" if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a": - + # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code res = "jina-v2-code" if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b": - + # ref: https://huggingface.co/THUDM/glm-4-9b-chat res = "chatglm-bpe" if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee": - + # ref: https://huggingface.co/LumiOpen/Viking-7B res = "viking" if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901": - + # ref: https://huggingface.co/core42/jais-13b res = "jais" if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f": - + # ref: https://huggingface.co/WisdomShell/CodeShell-7B res = "codeshell" if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e": - + # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407 res = "tekken" if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249": - + # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M res = "smollm" if chkhsh == "3c30d3ad1d6b64202cd222813e7736c2db6e1bd6d67197090fc1211fbc612ae7": - + # ref: https://huggingface.co/bigscience/bloom res = "bloom" if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21": - + # ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small res = "gpt3-finnish" if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae": - + # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct res = "exaone" if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085": - + # ref: https://huggingface.co/microsoft/phi-2 res = "phi-2" + if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450": + # ref: https://huggingface.co/facebook/chameleon-7b + res = "chameleon" + if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35": + # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0 + res = "minerva-7b" + if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65": + # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base + res = "roberta-bpe" + if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb": + # ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct + res = "gigachat" + if chkhsh == 
"d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1": + # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct + res = "megrez" + if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5": + # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3 + res = "deepseek-v3" if res is None: logger.warning("\n") @@ -769,6 +869,10 @@ def get_vocab_base_pre(self, tokenizer) -> str: logger.debug(f"chkhsh: {chkhsh}") return res + # Marker: End get_vocab_base_pre + + def _set_vocab_none(self) -> None: + self.gguf_writer.add_tokenizer_model("none") def _set_vocab_gpt2(self) -> None: tokens, toktypes, tokpre = self.get_vocab_base() @@ -805,6 +909,7 @@ def _set_vocab_qwen(self): assert len(merged) == 2 merges.append(" ".join(map(QwenModel.token_bytes_to_string, merged))) + # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined added_vocab = tokenizer.special_tokens reverse_vocab = { id_: encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items() @@ -828,7 +933,7 @@ def _set_vocab_qwen(self): special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) special_vocab.merges = merges - + # only add special tokens when they were not already loaded from config.json if len(special_vocab.special_token_ids) == 0: special_vocab._set_special_token( "bos", tokenizer.special_tokens["<|endoftext|>"] @@ -836,7 +941,7 @@ def _set_vocab_qwen(self): special_vocab._set_special_token( "eos", tokenizer.special_tokens["<|endoftext|>"] ) - + # this one is usually not in config.json anyway special_vocab._set_special_token( "unk", tokenizer.special_tokens["<|endoftext|>"] ) @@ -924,7 +1029,9 @@ def _create_vocab_sentencepiece(self): if token_data.get("special") or self.does_token_look_special(token): toktypes[token_id] = SentencePieceTokenTypes.CONTROL else: - token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") + token = token.replace( + b"\xe2\x96\x81".decode("utf-8"), " " + ) # pre-normalize user-defined spaces toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED scores[token_id] = -1000.0 @@ -976,7 +1083,7 @@ def _set_vocab_builtin( default_pre = "mpt" if model_name == "gpt-neox" else "default" field = vocab_reader.get_field(gguf.Keys.Tokenizer.MODEL) - assert field + assert field # tokenizer model self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8")) field = vocab_reader.get_field(gguf.Keys.Tokenizer.PRE) @@ -985,27 +1092,27 @@ def _set_vocab_builtin( ) field = vocab_reader.get_field(gguf.Keys.Tokenizer.LIST) - assert field + assert field # token list self.gguf_writer.add_token_list( [bytes(field.parts[i]) for i in field.data][:vocab_size] ) if model_name == "llama-spm": field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES) - assert field + assert field # token scores self.gguf_writer.add_token_scores( [field.parts[i].tolist()[0] for i in field.data][:vocab_size] ) field = vocab_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) - assert field + assert field # token types self.gguf_writer.add_token_types( [field.parts[i].tolist()[0] for i in field.data][:vocab_size] ) if model_name != "llama-spm": field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES) - assert field + assert field # token merges self.gguf_writer.add_token_merges( [bytes(field.parts[i]) for i in field.data] ) @@ -1050,7 +1157,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused n_head = 
self.hparams.get("n_head", self.hparams.get("num_attention_heads")) n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) @@ -1058,7 +1165,9 @@ def modify_tensors( tensors: list[tuple[str, Tensor]] = [] if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name): - + # Map bloom-style qkv_linear to gpt-style qkv_linear + # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa + # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) data_torch = torch.cat( ( @@ -1105,7 +1214,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) @@ -1115,7 +1224,9 @@ def modify_tensors( tensors: list[tuple[str, Tensor]] = [] if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): - + # Map bloom-style qkv_linear to gpt-style qkv_linear + # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa + # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) data_torch = torch.cat( ( @@ -1143,6 +1254,7 @@ def modify_tensors( if name == "word_embeddings.weight": assert self.tensor_names is not None + # TODO: tie them at runtime, don't duplicate in the model file if all( s not in self.tensor_names for s in ("lm_head.weight", "output.weight") ): @@ -1161,7 +1273,7 @@ def set_vocab(self): try: self._set_vocab_gpt2() except Exception: - + # Fallback for SEA-LION model self._set_vocab_sentencepiece() self.gguf_writer.add_add_bos_token(False) self.gguf_writer.add_pad_token_id(3) @@ -1190,7 +1302,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused if "scales" in name: new_name = self.map_tensor_name( @@ -1233,7 +1345,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) self.gguf_writer.add_head_count(head_count) self.gguf_writer.add_head_count_kv(head_count_kv) - + # note: config provides rms norm but it is actually layer norm + # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571 self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) @@ -1362,7 +1475,8 @@ def set_vocab(self): tokenizer = AutoTokenizer.from_pretrained(dir_model) vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) - + # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size, + # because vocab_size is the count of items, and indexes start at 0. 
max_vocab_index = max(tokenizer.get_vocab().values()) if max_vocab_index >= vocab_size: raise ValueError("Vocabulary size exceeds expected maximum size.") @@ -1374,12 +1488,12 @@ def set_vocab(self): for token_id in range(vocab_size): token_text = reverse_vocab[token_id].encode("utf-8") - + # replace "\x00" to string with length > 0 if token_text == b"\x00": - toktype = gguf.TokenType.BYTE + toktype = gguf.TokenType.BYTE # special token_text = f"<{token_text}>".encode("utf-8") elif re.fullmatch(rb"<0x[0-9A-Fa-f]{2}>", token_text): - toktype = gguf.TokenType.BYTE + toktype = gguf.TokenType.BYTE # special elif reverse_vocab[token_id] in added_vocab: if tokenizer.added_tokens_decoder[token_id].special: toktype = gguf.TokenType.CONTROL @@ -1440,11 +1554,12 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused head_count = self.hparams["num_attention_heads"] head_count_kv = self.hparams.get("num_key_value_heads", head_count) + # HF models permute some of the tensors, so we need to undo that if name.endswith("q_proj.weight"): data_torch = self._reverse_hf_permute(data_torch, head_count, head_count) if name.endswith("k_proj.weight"): @@ -1474,18 +1589,18 @@ class FalconModel(Model): def set_gguf_parameters(self): block_count = self.hparams.get("num_hidden_layers") if block_count is None: - block_count = self.hparams["n_layer"] + block_count = self.hparams["n_layer"] # old name n_head = self.hparams.get("num_attention_heads") if n_head is None: - n_head = self.hparams["n_head"] + n_head = self.hparams["n_head"] # old name n_head_kv = self.hparams.get("num_kv_heads") if n_head_kv is None: - n_head_kv = self.hparams.get("n_head_kv", 1) + n_head_kv = self.hparams.get("n_head_kv", 1) # old name - self.gguf_writer.add_context_length(2048) - self.gguf_writer.add_tensor_data_layout("jploski") + self.gguf_writer.add_context_length(2048) # not in config.json + self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -1497,7 +1612,17 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused + + # QKV tensor transform + # The original query_key_value tensor contains n_head_kv "kv groups", + # each consisting of n_head/n_head_kv query weights followed by one key + # and one value weight (shared by all query heads in the kv group). + # This layout makes it a big pain to work with in GGML. + # So we rearrange them here,, so that we have n_head query weights + # followed by n_head_kv key weights followed by n_head_kv value weights, + # in contiguous fashion. + # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py if "query_key_value" in name: n_head = self.find_hparam(["num_attention_heads", "n_head"]) @@ -1541,6 +1666,7 @@ class RefactModel(Model): def set_vocab(self): super().set_vocab() + # TODO: how to determine special FIM tokens automatically? 
special_vocab = gguf.SpecialVocab( self.dir_model, load_merges=False, @@ -1549,7 +1675,7 @@ def set_vocab(self): special_vocab._set_special_token("prefix", 1) special_vocab._set_special_token("suffix", 3) special_vocab._set_special_token("middle", 2) - special_vocab.chat_template = None + special_vocab.chat_template = None # do not add it twice special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): @@ -1561,6 +1687,7 @@ def set_gguf_parameters(self): block_count = self.hparams["n_layer"] + # refact uses Alibi. So this is from config.json which might be used by training. self.gguf_writer.add_context_length(self.hparams["n_positions"]) self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) @@ -1633,7 +1760,7 @@ def set_vocab(self): if (self.dir_model / "tokenizer.json").is_file(): self._set_vocab_gpt2() else: - + # StableLM 2 1.6B used to have a vocab in a similar format to Qwen's vocab self._set_vocab_qwen() def set_gguf_parameters(self): @@ -1712,7 +1839,7 @@ def _stack_qk_norm( layer_name: str = "q_layernorm", ): datas: list[Tensor] = [] - + # extract the norms in order for xid in range(n_head): ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight" datas.append(norms[ename]) @@ -1728,7 +1855,7 @@ def prepare_tensors(self): super().prepare_tensors() if self._q_norms is not None or self._k_norms is not None: - + # flatten two `list[dict[str, Tensor]]` into a single `list[str]` norms = ( [k for d in self._q_norms for k in d.keys()] if self._q_norms is not None @@ -1755,9 +1882,10 @@ def set_vocab(self): try: self._set_vocab_llama_hf() except (FileNotFoundError, TypeError): - + # Llama 3 self._set_vocab_gpt2() + # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256) if self.hparams.get("vocab_size", 32000) == 32016: special_vocab = gguf.SpecialVocab( self.dir_model, @@ -1770,6 +1898,19 @@ def set_vocab(self): special_vocab._set_special_token("eot", 32010) special_vocab.add_to_gguf(self.gguf_writer) + tokenizer_config_file = self.dir_model / "tokenizer_config.json" + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + if "add_prefix_space" in tokenizer_config_json: + self.gguf_writer.add_add_space_prefix( + tokenizer_config_json["add_prefix_space"] + ) + + # Apply to granite small models only + if self.hparams.get("vocab_size", 32000) == 49152: + self.gguf_writer.add_add_bos_token(False) + def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams @@ -1791,18 +1932,6 @@ def set_gguf_parameters(self): self.hparams["rope_scaling"]["factor"] ) - tokenizer_config_file = self.dir_model / "tokenizer_config.json" - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - if "add_prefix_space" in tokenizer_config_json: - self.gguf_writer.add_add_space_prefix( - tokenizer_config_json["add_prefix_space"] - ) - - if self.hparams.get("vocab_size", 32000) == 49152: - self.gguf_writer.add_add_bos_token(False) - @staticmethod def permute(weights: Tensor, n_head: int, n_head_kv: int | None): if n_head_kv is not None and n_head != n_head_kv: @@ -1828,6 +1957,7 @@ def modify_tensors( if name.endswith(("k_proj.weight", "k_proj.bias")): data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + # process the experts separately if name.find("block_sparse_moe.experts") != -1: n_experts = self.hparams["num_local_experts"] @@ -1841,6 +1971,7 @@ def 
modify_tensors( if len(self._experts[bid]) >= n_experts * 3: tensors: list[tuple[str, Tensor]] = [] + # merge the experts into a single 3d tensor for wid in ["w1", "w2", "w3"]: datas: list[Tensor] = [] @@ -1862,7 +1993,7 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] - def prepare_tensors(self): + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): if rope_scaling.get("rope_type", "").lower() == "llama3": base = self.hparams.get("rope_theta", 10000.0) @@ -1898,21 +2029,233 @@ def prepare_tensors(self): ) rope_factors.append(1 / ((1 - smooth) / factor + smooth)) - if not self.is_lora: - self.gguf_writer.add_tensor( - self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), - np.array(rope_factors, dtype=np.float32), - ) + yield ( + self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), + torch.tensor(rope_factors, dtype=torch.float32), + ) + def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: - + # flatten `list[dict[str, Tensor]]` into `list[str]` experts = [k for d in self._experts for k in d.keys()] if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") +@Model.register("DeciLMForCausalLM") +class DeciModel(Model): + model_arch = gguf.MODEL_ARCH.DECI + + @staticmethod + def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int: + # DeciLM-specific code + intermediate_size = int(2 * ffn_mult * n_embd / 3) + return DeciModel._find_multiple(intermediate_size, 256) + + @staticmethod + def _find_multiple(n: int, k: int) -> int: + # DeciLM-specific code + if n % k == 0: + return n + return n + k - (n % k) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B + _block_configs: list[dict[str, Any]] = self.hparams["block_configs"] + assert self.block_count == len(_block_configs) + self._num_kv_heads = list() + self._num_heads = list() + _ffn_multipliers = list() + # ***linear attention layer*** + # if n_heads_in_group is None and replace_with_linear is True + # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads + # ***attention-free layer*** + # if n_heads_in_group is None and replace_with_linear is False + # then _num_kv_heads[il] is 0 and _num_heads[il] is 0 + # ***normal attention-layer*** + # if n_heads_in_group is not None, then + # _num_kv_heads[il] is num_attention_head // n_heads_in_group and + # _num_heads[il] is num_attention_head + for il in range(len(_block_configs)): + if _block_configs[il]["attention"]["n_heads_in_group"] is None: + if _block_configs[il]["attention"]["replace_with_linear"] is True: + self._num_kv_heads.append(0) + self._num_heads.append(self.hparams["num_attention_heads"]) + else: + self._num_kv_heads.append(0) + self._num_heads.append(0) + else: + self._num_kv_heads.append( + self.hparams["num_attention_heads"] + // _block_configs[il]["attention"]["n_heads_in_group"] + ) + self._num_heads.append(self.hparams["num_attention_heads"]) + _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"]) + assert self.block_count == len(self._num_kv_heads) + assert self.block_count == len(self._num_heads) + assert self.block_count == len(_ffn_multipliers) + assert isinstance(self._num_kv_heads, list) and isinstance( + self._num_kv_heads[0], int + ) + assert isinstance(self._num_heads, list) and isinstance( + self._num_heads[0], int + ) + assert isinstance(_ffn_multipliers, list) and isinstance( + 
_ffn_multipliers[0], float + ) + self._ffn_dims: list[int] = [ + DeciModel._ffn_mult_to_intermediate_size( + multiplier, self.hparams["hidden_size"] + ) + for multiplier in _ffn_multipliers + ] + + def set_vocab(self): + # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's + # eos_token from '|eot_id|' to '|end_of_text|' + if self.hparams.get("vocab_size", 128256) == 128256: + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + else: + # DeciLM-7B + self._set_vocab_llama_hf() + + def set_gguf_parameters(self): + if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B + assert self.block_count == len(self._num_kv_heads) + assert self.block_count == len(self._num_heads) + assert self.block_count == len(self._ffn_dims) + if (rope_theta := self.hparams.get("rope_theta")) is not None: + self.gguf_writer.add_rope_freq_base(rope_theta) + self.gguf_writer.add_head_count_kv(self._num_kv_heads) + self.gguf_writer.add_head_count(self._num_heads) + self.gguf_writer.add_feed_forward_length(self._ffn_dims) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_key_length( + self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + ) + self.gguf_writer.add_value_length( + self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + ) + self.gguf_writer.add_file_type(self.ftype) + else: # DeciLM-7B + super().set_gguf_parameters() + if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B + self._num_kv_heads: list[int] = self.hparams[ + "num_key_value_heads_per_layer" + ] + assert self.block_count == len(self._num_kv_heads) + self.gguf_writer.add_head_count_kv(self._num_kv_heads) + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + if "head_dim" in hparams: + rope_dim = hparams["head_dim"] + else: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(rope_dim) + + if ( + self.hparams.get("rope_scaling") is not None + and "factor" in self.hparams["rope_scaling"] + ): + if self.hparams["rope_scaling"].get("type") == "linear": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor( + self.hparams["rope_scaling"]["factor"] + ) + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return ( + weights.reshape( + n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] + ) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + if bid is not None: + if "num_key_value_heads_per_layer" in self.hparams: + n_kv_head = self.hparams["num_key_value_heads_per_layer"][bid] + elif "block_configs" in self.hparams: + n_kv_head = self._num_kv_heads[bid] + n_head = self._num_heads[bid] + else: + n_kv_head = 
self.hparams.get("num_key_value_heads") + else: + n_kv_head = self.hparams.get("num_key_value_heads") + + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = DeciModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = DeciModel.permute(data_torch, n_head, n_kv_head) + return [(self.map_tensor_name(name), data_torch)] + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): + if rope_scaling.get("rope_type", "").lower() == "llama3": + base = self.hparams.get("rope_theta", 10000.0) + dim = self.hparams.get( + "head_dim", + self.hparams["hidden_size"] // self.hparams["num_attention_heads"], + ) + freqs = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim) + ) + + factor = rope_scaling.get("factor", 8.0) + low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) + high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) + old_context_len = self.hparams.get( + "original_max_position_embeddings", 8192 + ) + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + assert low_freq_wavelen != high_freq_wavelen + + rope_factors = [] + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + rope_factors.append(1) + elif wavelen > low_freq_wavelen: + rope_factors.append(factor) + else: + smooth = (old_context_len / wavelen - low_freq_factor) / ( + high_freq_factor - low_freq_factor + ) + rope_factors.append(1 / ((1 - smooth) / factor + smooth)) + + yield ( + self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), + torch.tensor(rope_factors, dtype=torch.float32), + ) + + def prepare_tensors(self): + super().prepare_tensors() + + @Model.register("BitnetForCausalLM") class BitnetModel(Model): model_arch = gguf.MODEL_ARCH.BITNET @@ -1930,7 +2273,9 @@ def weight_quant(self, weight: Tensor) -> Tensor: weight = weight.float() scale = weight.abs().mean().clamp(min=1e-5) iscale = 1 / scale - + # TODO: multiply by the scale directly instead of inverting it twice + # (this is also unnecessarily doubly inverted upstream) + # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10 result = (weight * iscale).round().clamp(-1, 1) / iscale return result.type(dtype) @@ -1951,7 +2296,7 @@ def modify_tensors( gguf.MODEL_TENSOR.FFN_GATE, ] ): - + # transform weight into 1/0/-1 (in fp32) data_torch = self.weight_quant(data_torch) yield (new_name, data_torch) @@ -1975,7 +2320,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - + # process the experts separately if name.find(".moe.") != -1: n_experts = self.hparams["num_local_experts"] @@ -1989,6 +2334,7 @@ def modify_tensors( if len(self._experts[bid]) >= n_experts * 3: tensors: list[tuple[str, Tensor]] = [] + # merge the experts into a single 3d tensor for wid in ["linear", "linear_1", "linear_v"]: datas: list[Tensor] = [] @@ -2044,17 +2390,26 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused n_expert = self.hparams["ffn_config"]["moe_num_experts"] n_ff = self.hparams["ffn_config"]["ffn_hidden_size"] n_embd = self.hparams["d_model"] + # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose + # original 
implementation expects (n_expert, n_ff, n_embd) for all experts weights + # But llama.cpp moe graph works differently + # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions + # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor exp_tensor_names = { - "ffn.experts.mlp.w1": None, - "ffn.experts.mlp.w2": (0, 2, 1), + "ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} + "ffn.experts.mlp.w2": ( + 0, + 2, + 1, + ), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert} "ffn.experts.mlp.v1": None, - } + } # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} experts = False for exp_tensor_name in exp_tensor_names.keys(): @@ -2065,6 +2420,12 @@ def modify_tensors( data_torch = data_torch.permute(*permute_tensor) break + # map tensor names + # In MoE models the ffn tensors are typically most of the model weights, + # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight. + # Every other model has the weight names ending in .weight, + # let's assume that is the convention which is not the case for dbrx: + # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15 new_name = self.map_tensor_name( name if not experts else name + ".weight", try_suffixes=(".weight",) ) @@ -2074,7 +2435,7 @@ def modify_tensors( def tensor_force_quant( self, name: str, new_name: str, bid: int | None, n_dims: int ) -> gguf.GGMLQuantizationType | bool: - del name, new_name, bid + del name, new_name, bid # unused return n_dims > 1 @@ -2084,48 +2445,71 @@ class MiniCPMModel(Model): model_arch = gguf.MODEL_ARCH.MINICPM def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count( - self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + super().set_gguf_parameters() + embedding_scale = float(self.hparams["scale_emb"]) + self.gguf_writer.add_embedding_scale(embedding_scale) + logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}") + residual_scale = ( + self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5 ) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_residual_scale(residual_scale) + logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}") + logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"] + self.gguf_writer.add_logit_scale(logit_scale) + logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}") + if self.hparams.get("rope_scaling") is not None: + if self.hparams["rope_scaling"].get("type") == "longrope": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE) + logger.info( + f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}" + ) - def set_vocab(self): - self._set_vocab_llama_hf() + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - def 
_reverse_hf_permute( - self, weights: Tensor, n_head: int, n_kv_head: int | None = None - ) -> Tensor: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head + rope_scaling = self.find_hparam(["rope_scaling"], True) + if rope_scaling is not None: + long_factors = rope_scaling.get("long_factor", None) + short_factors = rope_scaling.get("short_factor", None) - return ( - weights.reshape( - n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] + if long_factors is None or short_factors is None: + raise KeyError( + "Missing the required key rope_scaling.long_factor or rope_scaling_short_factor" + ) + + if ( + len(long_factors) != len(short_factors) + or len(long_factors) != rope_dims / 2 + ): + raise ValueError( + f"The length of rope long and short factors must be {rope_dims / 2}" + ) + + yield ( + self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), + torch.tensor(long_factors, dtype=torch.float32), ) - .swapaxes(1, 2) - .reshape(weights.shape) - ) + yield ( + self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), + torch.tensor(short_factors, dtype=torch.float32), + ) + + def set_vocab(self): + self._set_vocab_sentencepiece() def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") + # HF models permute some of the tensors, so we need to undo that if name.endswith(("q_proj.weight")): - data_torch = self._reverse_hf_permute(data_torch, n_head, n_head) + data_torch = LlamaModel.permute(data_torch, n_head, n_head) if name.endswith(("k_proj.weight")): - data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head) + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) return [(self.map_tensor_name(name), data_torch)] @@ -2137,8 +2521,6 @@ class MiniCPM3Model(Model): def set_gguf_parameters(self): hparams = self.hparams - rope_dims = hparams["qk_rope_head_dim"] - self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(hparams["hidden_size"]) @@ -2156,37 +2538,38 @@ def set_gguf_parameters(self): ) self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: rope_scaling = self.find_hparam(["rope_scaling"], True) - if rope_scaling is None: - return + if rope_scaling is not None: + rope_dims = self.hparams["qk_rope_head_dim"] - long_factors = rope_scaling.get("long_factor", None) - short_factors = rope_scaling.get("short_factor", None) + long_factors = rope_scaling.get("long_factor", None) + short_factors = rope_scaling.get("short_factor", None) - if long_factors is None or short_factors is None: - raise KeyError( - "Missing the required key rope_scaling.long_factor or rope_scaling_short_factor" - ) + if long_factors is None or short_factors is None: + raise KeyError( + "Missing the required key rope_scaling.long_factor or rope_scaling_short_factor" + ) - if ( - len(long_factors) != len(short_factors) - or len(long_factors) != rope_dims / 2 - ): - raise ValueError( - f"The length of rope long and short factors must be {rope_dims / 2}" - ) + if ( + len(long_factors) != len(short_factors) + or len(long_factors) != rope_dims / 2 + ): + raise ValueError( + f"The length of rope long and short factors must be {rope_dims / 2}" + ) - self.gguf_writer.add_tensor( - 
gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", - np.array(long_factors, dtype=np.float32), - ) - self.gguf_writer.add_tensor( - gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", - np.array(short_factors, dtype=np.float32), - ) + yield ( + self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), + torch.tensor(long_factors, dtype=torch.float32), + ) + yield ( + self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), + torch.tensor(short_factors, dtype=torch.float32), + ) def set_vocab(self): - self._set_vocab_llama_hf() + self._set_vocab_sentencepiece() def _reverse_hf_permute( self, weights: Tensor, n_head: int, n_kv_head: int | None = None @@ -2264,6 +2647,87 @@ def set_vocab(self): except FileNotFoundError: self._set_vocab_gpt2() + def set_gguf_parameters(self): + super().set_gguf_parameters() + if ( + self.hparams.get("rope_scaling") is not None + and "factor" in self.hparams["rope_scaling"] + ): + if self.hparams["rope_scaling"].get("type") == "yarn": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor( + self.hparams["rope_scaling"]["factor"] + ) + self.gguf_writer.add_rope_scaling_orig_ctx_len( + self.hparams["rope_scaling"]["original_max_position_embeddings"] + ) + + +@Model.register("Qwen2VLForConditionalGeneration") +class Qwen2VLModel(Model): + model_arch = gguf.MODEL_ARCH.QWEN2VL + + def set_gguf_parameters(self): + super().set_gguf_parameters() + mrope_section = self.hparams["rope_scaling"]["mrope_section"] + mrope_section += [0] * max(0, 4 - len(mrope_section)) + self.gguf_writer.add_rope_dimension_sections(mrope_section) + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: + for name, data in super().get_tensors(): + if name.startswith("visual."): + continue + yield name, data + + +@Model.register("WavTokenizerDec") +class WavTokenizerDecModel(Model): + model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + if ( + name.endswith("codebook.cluster_size") + or name.endswith("codebook.embed_avg") + or name.endswith("codebook.inited") + ): + logger.debug(f"Skipping {name!r}") + return [] + + logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}") + + return [(self.map_tensor_name(name), data_torch)] + + def set_vocab(self): + self._set_vocab_none() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + self.gguf_writer.add_features_length(self.hparams["n_embd_features"]) + self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"]) + self.gguf_writer.add_group_norm_eps(self.hparams["group_norm_epsilon"]) + self.gguf_writer.add_group_norm_groups(self.hparams["group_norm_groups"]) + + self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"]) + self.gguf_writer.add_posnet_block_count(self.hparams["posnet"]["n_layer"]) + + self.gguf_writer.add_convnext_embedding_length( + self.hparams["convnext"]["n_embd"] + ) + self.gguf_writer.add_convnext_block_count(self.hparams["convnext"]["n_layer"]) + + self.gguf_writer.add_causal_attention(False) + @Model.register("Qwen2MoeForCausalLM") class Qwen2MoeModel(Model): @@ -2295,7 +2759,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, 
bid: int | None ) -> Iterable[tuple[str, Tensor]]: - + # process the experts separately if name.find("experts") != -1: n_experts = self.hparams["num_experts"] assert bid is not None @@ -2308,6 +2772,7 @@ def modify_tensors( if len(self._experts[bid]) >= n_experts * 3: tensors: list[tuple[str, Tensor]] = [] + # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -2333,7 +2798,7 @@ def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: - + # flatten `list[dict[str, Tensor]]` into `list[str]` experts = [k for d in self._experts for k in d.keys()] if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") @@ -2355,10 +2820,11 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused tensors: list[tuple[str, Tensor]] = [] + # we don't need these if name.endswith((".attn.bias", ".attn.masked_bias")): return tensors @@ -2371,6 +2837,7 @@ def modify_tensors( tensors.append((new_name, data_torch)) + # note: GPT2 output is tied to (same as) wte in original model if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): tensors.append( (self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch) @@ -2412,6 +2879,15 @@ class Phi3MiniModel(Model): model_arch = gguf.MODEL_ARCH.PHI3 def set_vocab(self): + # Phi-4 model uses GPT2Tokenizer + tokenizer_config_file = self.dir_model / "tokenizer_config.json" + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + tokenizer_class = tokenizer_config_json["tokenizer_class"] + if tokenizer_class == "GPT2Tokenizer": + return self._set_vocab_gpt2() + from sentencepiece import SentencePieceProcessor tokenizer_path = self.dir_model / "tokenizer.model" @@ -2538,8 +3014,20 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_dimension_count(rope_dims) self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"])) + sliding_window = self.hparams.get("sliding_window") + # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models + if sliding_window is None: + sliding_window = 0 + self.gguf_writer.add_sliding_window(sliding_window) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) + orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) + rope_dims = n_embd // n_head + # write rope scaling for long context (128k) model rope_scaling = self.find_hparam(["rope_scaling"], True) if rope_scaling is None: return @@ -2581,15 +3069,14 @@ def set_gguf_parameters(self): f"The length of rope long and short factors must be {rope_dims / 2}" ) - if not self.is_lora: - self.gguf_writer.add_tensor( - gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", - np.array(long_factors, dtype=np.float32), - ) - self.gguf_writer.add_tensor( - gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", - np.array(short_factors, dtype=np.float32), - ) + yield ( + self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), + torch.tensor(long_factors, dtype=torch.float32), 
+ ) + yield ( + self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), + torch.tensor(short_factors, dtype=torch.float32), + ) @Model.register("PlamoForCausalLM") @@ -2603,12 +3090,14 @@ def set_gguf_parameters(self): hparams = self.hparams block_count = hparams["num_hidden_layers"] - self.gguf_writer.add_context_length(4096) + self.gguf_writer.add_context_length(4096) # not in config.json self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(5) + self.gguf_writer.add_head_count_kv( + 5 + ) # hparams["num_key_value_heads"]) is wrong self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) self.gguf_writer.add_file_type(self.ftype) @@ -2629,10 +3118,11 @@ def shuffle_attn_output_weight(self, data_torch): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused new_name = self.map_tensor_name(name) + # shuffle for broadcasting of gqa in ggml_mul_mat if new_name.endswith("attn_q.weight"): data_torch = self.shuffle_attn_q_weight(data_torch) elif new_name.endswith("attn_output.weight"): @@ -2663,7 +3153,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused new_name = self.map_tensor_name(name) @@ -2675,7 +3165,7 @@ def modify_tensors( if all( s not in self.tensor_names for s in ("lm_head.weight", "output.weight") ): - + # copy tok_embd.weight to output.weight tensors.append( (self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch) ) @@ -2688,7 +3178,10 @@ class InternLM2Model(Model): model_arch = gguf.MODEL_ARCH.INTERNLM2 def set_vocab(self): - + # (TODO): Is there a better way? + # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character + # \x00 specially and convert it into an emoji character to prevent it from being mistakenly + # recognized as an empty string in C++. from sentencepiece import SentencePieceProcessor from sentencepiece import sentencepiece_model_pb2 as model @@ -2702,7 +3195,9 @@ def set_vocab(self): logger.error(f"Error: Missing {tokenizer_path}") sys.exit(1) - sentencepiece_model = model.ModelProto() + sentencepiece_model = ( + model.ModelProto() + ) # pyright: ignore[reportAttributeAccessIssue] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix @@ -2716,7 +3211,8 @@ def set_vocab(self): text = piece.encode("utf-8") score = tokenizer.GetScore(token_id) if text == b"\x00": - + # (TODO): fixme + # Hack here and replace the \x00 characters. logger.warning(f"InternLM2 convert token '{text}' to '🐉'!") text = "🐉".encode("utf-8") @@ -2729,7 +3225,7 @@ def set_vocab(self): toktype = SentencePieceTokenTypes.UNUSED elif tokenizer.IsByte(token_id): toktype = SentencePieceTokenTypes.BYTE - + # take care of ununsed raw token if piece.startswith("[UNUSED"): toktype = SentencePieceTokenTypes.UNUSED @@ -2806,7 +3302,9 @@ def set_vocab(self): special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) old_eos = special_vocab.special_token_ids["eos"] if chat_eos_token_id is not None: - + # For the chat model, we replace the eos with '<|im_end|>'. 
+ # TODO: this is a hack, should be fixed + # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048 special_vocab.special_token_ids["eos"] = chat_eos_token_id logger.warning( f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}" @@ -2851,6 +3349,7 @@ def modify_tensors( qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd)) q, k, v = qkv[:, :q_per_kv], qkv[:, -2], qkv[:, -1] + # The model weights of q and k equire additional reshape. q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads) k = LlamaModel.permute( k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads @@ -2866,7 +3365,7 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] -@Model.register("BertModel", "CamembertModel") +@Model.register("BertModel", "BertForMaskedLM", "CamembertModel") class BertModel(Model): model_arch = gguf.MODEL_ARCH.BERT @@ -2878,6 +3377,7 @@ def set_gguf_parameters(self): super().set_gguf_parameters() self.gguf_writer.add_causal_attention(False) + # get pooling path pooling_path = None module_path = self.dir_model / "modules.json" if module_path.is_file(): @@ -2888,6 +3388,7 @@ def set_gguf_parameters(self): pooling_path = mod["path"] break + # get pooling type if pooling_path is not None: with open( self.dir_model / pooling_path / "config.json", encoding="utf-8" @@ -2905,8 +3406,12 @@ def set_vocab(self): tokens, toktypes, tokpre = self.get_vocab_base() self.vocab_size = len(tokens) - self.gguf_writer.add_token_type_count(2) + # we need this to validate the size of the token_type embeddings + # though currently we are passing all zeros to the token_type embeddings + # "Sequence A" or "Sequence B" + self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) + # convert to phantom space vocab def phantom(tok): if tok.startswith("[") and tok.endswith("]"): return tok @@ -2916,29 +3421,96 @@ def phantom(tok): tokens = list(map(phantom, tokens)) + # add vocab to gguf self.gguf_writer.add_tokenizer_model("bert") self.gguf_writer.add_tokenizer_pre(tokpre) self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) + # handle special tokens special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused + + if name.startswith("bert."): + name = name[5:] + + if name.endswith(".gamma"): + name = name[:-6] + ".weight" + if name.endswith(".beta"): + name = name[:-5] + ".bias" + + # we are only using BERT for embeddings so we don't need the pooling layer if name in ( "embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias", ): + return [] # we don't need these + + if name.startswith("cls.predictions"): + return [] + + if name.startswith("cls.seq_relationship"): return [] return [(self.map_tensor_name(name), data_torch)] +@Model.register("RobertaModel") +class RobertaModel(BertModel): + model_arch = gguf.MODEL_ARCH.BERT + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # we need the pad_token_id to know how to chop down position_embd matrix + if (pad_token_id := self.hparams.get("pad_token_id")) is not None: + self._position_offset = 1 + pad_token_id + if "max_position_embeddings" in self.hparams: + self.hparams["max_position_embeddings"] -= self._position_offset + else: + self._position_offset = None + + def set_vocab(self): + """Support BPE tokenizers for roberta 
models""" + bpe_tok_path = self.dir_model / "tokenizer.json" + if bpe_tok_path.exists(): + self._set_vocab_gpt2() + self.gguf_writer.add_add_bos_token(True) + self.gguf_writer.add_add_eos_token(True) + + # we need this to validate the size of the token_type embeddings + # though currently we are passing all zeros to the token_type embeddings + # "Sequence A" or "Sequence B" + self.gguf_writer.add_token_type_count( + self.hparams.get("type_vocab_size", 1) + ) + + else: + return super().set_vocab() + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + # if name starts with "roberta.", remove the prefix + # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main + if name.startswith("roberta."): + name = name[8:] + + # position embeddings start at pad_token_id + 1, so just chop down the weight tensor + if name == "embeddings.position_embeddings.weight": + if self._position_offset is not None: + data_torch = data_torch[self._position_offset :, :] + + return super().modify_tensors(data_torch, name, bid) + + @Model.register("NomicBertModel") class NomicBertModel(BertModel): model_arch = gguf.MODEL_ARCH.NOMIC_BERT @@ -2946,18 +3518,20 @@ class NomicBertModel(BertModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + # the HF config claims n_ctx=8192, but it uses RoPE scaling self.hparams["n_ctx"] = 2048 + # SwigLU activation assert self.hparams["activation_function"] == "swiglu" - + # this doesn't do anything in the HF version assert self.hparams["causal"] is False - + # no bias tensors assert self.hparams["qkv_proj_bias"] is False assert self.hparams["mlp_fc1_bias"] is False assert self.hparams["mlp_fc2_bias"] is False - + # norm at end of layer assert self.hparams["prenorm"] is False - + # standard RoPE assert self.hparams["rotary_emb_fraction"] == 1.0 assert self.hparams["rotary_emb_interleaved"] is False assert self.hparams["rotary_emb_scale_base"] is None @@ -2967,13 +3541,14 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) -@Model.register("XLMRobertaModel") +@Model.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") class XLMRobertaModel(BertModel): model_arch = gguf.MODEL_ARCH.BERT def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + # we need the pad_token_id to know how to chop down position_embd matrix if (pad_token_id := self.hparams.get("pad_token_id")) is not None: self._position_offset = 1 + pad_token_id if "max_position_embeddings" in self.hparams: @@ -2982,7 +3557,8 @@ def __init__(self, *args, **kwargs): self._position_offset = None def set_vocab(self): - + # to avoid TypeError: Descriptors cannot be created directly + # exception when importing sentencepiece_model_pb2 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" from sentencepiece import SentencePieceProcessor from sentencepiece import sentencepiece_model_pb2 as model @@ -2991,9 +3567,11 @@ def set_vocab(self): if not tokenizer_path.is_file(): raise FileNotFoundError(f"File not found: {tokenizer_path}") - sentencepiece_model = model.ModelProto() + sentencepiece_model = ( + model.ModelProto() + ) # pyright: ignore[reportAttributeAccessIssue] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - assert sentencepiece_model.trainer_spec.model_type == 1 + assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix remove_whitespaces = ( @@ 
-3039,6 +3617,7 @@ def set_vocab(self): scores.append(-1000.0) toktypes.append(SentencePieceTokenTypes.UNUSED) + # realign tokens (see HF tokenizer code) tokens = [b"", b"", b"", b""] + tokens[3:-1] scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1] toktypes = [ @@ -3054,7 +3633,7 @@ def set_vocab(self): self.gguf_writer.add_token_scores(scores) self.gguf_writer.add_token_types(toktypes) self.gguf_writer.add_add_space_prefix(add_prefix) - self.gguf_writer.add_token_type_count(1) + self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) if precompiled_charsmap: self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) @@ -3068,7 +3647,12 @@ def set_vocab(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: + # if name starts with "roberta.", remove the prefix + # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main + if name.startswith("roberta."): + name = name[8:] + # position embeddings start at pad_token_id + 1, so just chop down the weight tensor if name == "embeddings.position_embeddings.weight": if self._position_offset is not None: data_torch = data_torch[self._position_offset :, :] @@ -3083,6 +3667,7 @@ class GemmaModel(Model): def set_vocab(self): self._set_vocab_sentencepiece() + # TODO: these special tokens should be exported only for the CodeGemma family special_vocab = gguf.SpecialVocab( self.dir_model, load_merges=False, @@ -3093,7 +3678,7 @@ def set_vocab(self): special_vocab._set_special_token("middle", 68) special_vocab._set_special_token("fsep", 70) special_vocab._set_special_token("eot", 107) - special_vocab.chat_template = None + special_vocab.chat_template = None # do not add it twice special_vocab.add_to_gguf(self.gguf_writer) self.gguf_writer.add_add_space_prefix(False) @@ -3120,14 +3705,17 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused + # lm_head is not used in llama.cpp, while autoawq will include this tensor in model + # To prevent errors, skip loading lm_head.weight. if name == "lm_head.weight": logger.debug( f"Skipping get tensor {name!r} in safetensors so that convert can end normally." ) return [] + # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 if name.endswith("norm.weight"): data_torch = data_torch + 1 @@ -3172,14 +3760,17 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused + # lm_head is not used in llama.cpp, while autoawq will include this tensor in model + # To prevent errors, skip loading lm_head.weight. if name == "lm_head.weight": logger.debug( f"Skipping get tensor {name!r} in safetensors so that convert can end normally." 
) return [] + # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 if name.endswith("norm.weight"): data_torch = data_torch + 1 @@ -3215,7 +3806,7 @@ def set_vocab(self): token = token.encode("utf-8") if isinstance(token, str) else token assert isinstance(token, bytes) assert len(token) == token_len - token_text: str = repr(token)[2:-1] + token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff" tokens.append(token_text.encode("utf-8")) toktypes.append(gguf.TokenType.NORMAL) remainder = vocab_size - len(tokens) @@ -3228,6 +3819,9 @@ def set_vocab(self): self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.chat_template = "rwkv-world" + # hack: Add '\n\n' as the EOT token to make it chat normally + special_vocab._set_special_token("eot", 261) special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): @@ -3244,6 +3838,7 @@ def set_gguf_parameters(self): time_mix_extra_dim = 64 if hidden_size == 4096 else 32 time_decay_extra_dim = 128 if hidden_size == 4096 else 64 + # RWKV isn't context limited self.gguf_writer.add_context_length(1048576) self.gguf_writer.add_embedding_length(hidden_size) self.gguf_writer.add_block_count(block_count) @@ -3255,6 +3850,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_feed_forward_length(intermediate_size) self.gguf_writer.add_file_type(self.ftype) + # required by llama.cpp, unused self.gguf_writer.add_head_count(0) def modify_tensors( @@ -3275,6 +3871,9 @@ def modify_tensors( if new_name.endswith("time_mix_w2.weight"): data_torch = data_torch.permute(0, 2, 1) + if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name: + data_torch = data_torch.squeeze() + rescale_every_n_layers = self.hparams["rescale_every"] if rescale_every_n_layers > 0: if new_name.endswith("time_mix_output.weight") or new_name.endswith( @@ -3291,9 +3890,10 @@ class MambaModel(Model): def set_vocab(self): vocab_size = self.hparams["vocab_size"] - + # Round vocab size to next multiple of 8 pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8) - + # pad using ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 vocab_size = -(vocab_size // -pad_vocab) * pad_vocab self.hparams["vocab_size"] = vocab_size @@ -3302,7 +3902,7 @@ def set_vocab(self): elif (self.dir_model / "tokenizer.model").is_file(): self._set_vocab_sentencepiece() else: - + # Use the GPT-NeoX tokenizer when no tokenizer files are present self._set_vocab_builtin("gpt-neox", vocab_size) def set_gguf_parameters(self): @@ -3313,7 +3913,9 @@ def set_gguf_parameters(self): or 2 * d_model ) d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 - + # ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 + # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -( d_model // -16 ) @@ -3322,23 +3924,31 @@ def set_gguf_parameters(self): or 1e-5 ) use_dt_b_c_norm = False - + # For falconmamba we do apply RMS norm on B / DT and C layers if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",): use_dt_b_c_norm = True - + # Fail early for models which don't have a block expansion factor of 2 assert d_inner == 2 * d_model - self.gguf_writer.add_context_length(2**20) + 
self.gguf_writer.add_context_length( + 2**20 + ) # arbitrary value; for those who use the default self.gguf_writer.add_embedding_length(d_model) - self.gguf_writer.add_feed_forward_length(0) - self.gguf_writer.add_head_count(0) + self.gguf_writer.add_feed_forward_length( + 0 + ) # unused, but seemingly required when loading + self.gguf_writer.add_head_count( + 0 + ) # unused, but seemingly required when loading self.gguf_writer.add_block_count(self.block_count) self.gguf_writer.add_ssm_conv_kernel(d_conv) self.gguf_writer.add_ssm_inner_size(d_inner) self.gguf_writer.add_ssm_state_size(d_state) self.gguf_writer.add_ssm_time_step_rank(dt_rank) self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) - self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm) + self.gguf_writer.add_ssm_dt_b_c_rms( + use_dt_b_c_norm + ) # For classic Mamba we don't apply rms norm on B / DT layers self.gguf_writer.add_file_type(self.ftype) _tok_embd = None @@ -3346,7 +3956,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD) @@ -3357,6 +3967,7 @@ def modify_tensors( logger.debug("A_log --> A ==> " + new_name) data_torch = -torch.exp(data_torch) + # assuming token_embd.weight is seen before output.weight if self._tok_embd is not None and new_name == output_name: if torch.equal(self._tok_embd, data_torch): logger.debug( @@ -3376,6 +3987,9 @@ class CommandR2Model(Model): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + # max_position_embeddings = 8192 in config.json but model was actually + # trained on 128k context length + # aya-23 models don't have model_max_length specified self.hparams["max_position_embeddings"] = self.find_hparam( ["model_max_length", "max_position_embeddings"] ) @@ -3386,6 +4000,26 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) +@Model.register("Cohere2ForCausalLM") +class Cohere2Model(Model): + model_arch = gguf.MODEL_ARCH.COHERE2 + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + self.gguf_writer.add_logit_scale(self.hparams["logit_scale"]) + self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + + rotary_pct = self.hparams["rotary_pct"] + hidden_size = self.hparams["hidden_size"] + num_attention_heads = self.hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count( + int(rotary_pct * (hidden_size // num_attention_heads)) + ) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + @Model.register("OlmoForCausalLM") @Model.register("OLMoForCausalLM") class OlmoModel(Model): @@ -3398,10 +4032,12 @@ def set_gguf_parameters(self): if clip_qkv is not None: self.gguf_writer.add_clamp_kqv(clip_qkv) + # Same as super class, but permuting q_proj, k_proj + # Copied from: LlamaModel def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") @@ -3414,6 +4050,11 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] +@Model.register("Olmo2ForCausalLM") +class Olmo2Model(Model): + model_arch = gguf.MODEL_ARCH.OLMO2 + + @Model.register("OlmoeForCausalLM") class 
OlmoeModel(Model): model_arch = gguf.MODEL_ARCH.OLMOE @@ -3426,10 +4067,11 @@ def set_gguf_parameters(self): _experts: list[dict[str, Tensor]] | None = None + # Copied from: Qwen2MoeModel def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - + # process the experts separately if name.find("experts") != -1: n_experts = self.hparams["num_experts"] assert bid is not None @@ -3442,6 +4084,7 @@ def modify_tensors( if len(self._experts[bid]) >= n_experts * 3: tensors: list[tuple[str, Tensor]] = [] + # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -3463,11 +4106,12 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] + # Copied from: Qwen2MoeModel def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: - + # flatten `list[dict[str, Tensor]]` into `list[str]` experts = [k for d in self._experts for k in d.keys()] if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") @@ -3513,6 +4157,16 @@ def set_vocab(self): self.gguf_writer.add_add_bos_token(True) self.gguf_writer.add_add_eos_token(True) + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + # if name starts with "bert.", remove the prefix + # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en + if name.startswith("bert."): + name = name[5:] + + return super().modify_tensors(data_torch, name, bid) + @Model.register("OpenELMForCausalLM") class OpenELMModel(Model): @@ -3520,9 +4174,9 @@ class OpenELMModel(Model): @staticmethod def _make_divisible(v: float | int, divisor: int) -> int: - + # ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38 new_v = max(divisor, int(v + divisor / 2) // divisor * divisor) - + # Make sure that round down does not go down by more than 10%. if new_v < 0.9 * v: new_v += divisor return new_v @@ -3546,6 +4200,7 @@ def __init__(self, *args, **kwargs): self._num_query_heads[0], int ) + # Uses the tokenizer from meta-llama/Llama-2-7b-hf def set_vocab(self): try: self._set_vocab_sentencepiece() @@ -3567,7 +4222,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_head_count(self._num_query_heads) self.gguf_writer.add_head_count_kv(self._num_kv_heads) self.gguf_writer.add_rope_freq_base(self.hparams["rope_freq_constant"]) - + # https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30 self.gguf_writer.add_layer_norm_rms_eps(1e-6) self.gguf_writer.add_rope_dimension_count(int(rot_pct * head_dim)) self.gguf_writer.add_key_length(head_dim) @@ -3584,6 +4239,7 @@ def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: + # split ff if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight": ff_dim = self._ffn_dims[bid] yield ( @@ -3604,7 +4260,9 @@ class ArcticModel(Model): model_arch = gguf.MODEL_ARCH.ARCTIC def set_vocab(self): - + # The reason for using a custom implementation here is that the + # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from + # tokenizer.model and used them as BOS and EOS instead of adding new tokens. 
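+ # Rough illustration (values are made up, not copied from the Arctic repo) of the kind of
+ # "added_tokens_decoder" entry in tokenizer_config.json that this custom set_vocab consumes:
+ #   "added_tokens_decoder": {
+ #     "31998": { "content": "<|im_start|>", "special": true },
+ #     "31999": { "content": "<|im_end|>", "special": true }
+ #   }
+ # Entries flagged "special" are remapped below to CONTROL (or UNKNOWN for the unk_token) with
+ # score 0.0; other added tokens become USER_DEFINED with a score of -10000.0.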
from sentencepiece import SentencePieceProcessor tokenizer_path = self.dir_model / "tokenizer.model" @@ -3613,6 +4271,7 @@ def set_vocab(self): logger.error(f"Error: Missing {tokenizer_path}") sys.exit(1) + # Read the whole vocabulary from the tokenizer.model file tokenizer = SentencePieceProcessor() tokenizer.LoadFromFile(str(tokenizer_path)) @@ -3642,6 +4301,8 @@ def set_vocab(self): scores[token_id] = score toktypes[token_id] = toktype + # Use the added_tokens_decoder field from tokeniser_config.json as the source + # of information about added/redefined tokens and modify them accordingly. tokenizer_config_file = self.dir_model / "tokenizer_config.json" if tokenizer_config_file.is_file(): with open(tokenizer_config_file, "r", encoding="utf-8") as f: @@ -3661,6 +4322,8 @@ def set_vocab(self): token_type = SentencePieceTokenTypes.USER_DEFINED token_score = -10000.0 + # Map unk_token to UNKNOWN, other special tokens to CONTROL + # Set the score to 0.0 as in the original tokenizer.model if ("special" in token_json) and token_json["special"]: if token_content == tokenizer_config_json["unk_token"]: token_type = SentencePieceTokenTypes.UNKNOWN @@ -3705,6 +4368,7 @@ def modify_tensors( if name.endswith("k_proj.weight"): data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + # process the experts separately if name.find("block_sparse_moe.experts") != -1: n_experts = self.hparams["num_local_experts"] @@ -3718,6 +4382,7 @@ def modify_tensors( if len(self._experts[bid]) >= n_experts * 3: tensors: list[tuple[str, Tensor]] = [] + # merge the experts into a single 3d tensor for wid in ["w1", "w2", "w3"]: datas: list[Tensor] = [] @@ -3743,13 +4408,113 @@ def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@Model.register("DeepseekForCausalLM") +class DeepseekModel(Model): + model_arch = gguf.MODEL_ARCH.DEEPSEEK + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + if "head_dim" in hparams: + rope_dim = hparams["head_dim"] + else: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + + self.gguf_writer.add_rope_dimension_count(rope_dim) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_expert_feed_forward_length( + hparams["moe_intermediate_size"] + ) + self.gguf_writer.add_expert_weights_scale(1.0) + self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) + self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) + + _experts: list[dict[str, Tensor]] | None = None + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return ( + weights.reshape( + n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] + ) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + + if 
name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = DeepseekModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head) + + # process the experts separately + if name.find("mlp.experts") != -1: + n_experts = self.hparams["n_routed_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` experts = [k for d in self._experts for k in d.keys()] if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") @Model.register("DeepseekV2ForCausalLM") +@Model.register("DeepseekV3ForCausalLM") class DeepseekV2Model(Model): model_arch = gguf.MODEL_ARCH.DEEPSEEK2 @@ -3775,6 +4540,17 @@ def set_gguf_parameters(self): self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"]) + self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"]) + + if hparams["scoring_func"] == "sigmoid": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + elif hparams["scoring_func"] == "softmax": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) + else: + raise ValueError( + f"Unsupported scoring_func value: {hparams['scoring_func']}" + ) + self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) if ( @@ -3798,7 +4574,17 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: + # rename e_score_correction_bias tensors + if name.endswith("e_score_correction_bias"): + name = name.replace("e_score_correction_bias", "e_score_correction.bias") + + # skip Multi-Token Prediction (MTP) layers + block_count = self.hparams["num_hidden_layers"] + match = re.match(r"model.layers.(\d+)", name) + if match and int(match.group(1)) >= block_count: + return [] + # process the experts separately if name.find("mlp.experts") != -1: n_experts = self.hparams["n_routed_experts"] assert bid is not None @@ -3811,6 +4597,7 @@ def modify_tensors( if len(self._experts[bid]) >= n_experts * 3: tensors: list[tuple[str, Tensor]] = [] + # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -3836,7 +4623,7 @@ def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: - + # flatten `list[dict[str, Tensor]]` into `list[str]` experts = [k for d in self._experts for k in d.keys()] if len(experts) > 0: raise 
ValueError(f"Unprocessed experts: {experts}") @@ -3854,28 +4641,33 @@ def __init__(self, *args, **kwargs): self.shared_token_embeddings_found = False def set_vocab(self): - + # to avoid TypeError: Descriptors cannot be created directly + # exception when importing sentencepiece_model_pb2 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" from sentencepiece import SentencePieceProcessor from sentencepiece import sentencepiece_model_pb2 as model tokenizer_path = self.dir_model / "tokenizer.model" + # many older models use spiece.model tokenizer model filename if not tokenizer_path.is_file(): tokenizer_path = self.dir_model / "spiece.model" if not tokenizer_path.is_file(): raise FileNotFoundError(f"File not found: {tokenizer_path}") - sentencepiece_model = model.ModelProto() + sentencepiece_model = ( + model.ModelProto() + ) # pyright: ignore[reportAttributeAccessIssue] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - if sentencepiece_model.trainer_spec.model_type == 2: - + # some models like Pile-T5 family use BPE tokenizer instead of Unigram + if sentencepiece_model.trainer_spec.model_type == 2: # BPE + # assure the tokenizer model file name is correct assert tokenizer_path.name == "tokenizer.model" return self._set_vocab_sentencepiece() else: - assert sentencepiece_model.trainer_spec.model_type == 1 + assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix remove_whitespaces = ( @@ -3979,8 +4771,12 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused + # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", + # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored + # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder + # and decoder and ignore the remaining ones. 
if name in [ "decoder.embed_tokens.weight", "encoder.embed_tokens.weight", @@ -4007,28 +4803,33 @@ def __init__(self, *args, **kwargs): self.shared_token_embeddings_found = False def set_vocab(self): - + # to avoid TypeError: Descriptors cannot be created directly + # exception when importing sentencepiece_model_pb2 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" from sentencepiece import SentencePieceProcessor from sentencepiece import sentencepiece_model_pb2 as model tokenizer_path = self.dir_model / "tokenizer.model" + # many older models use spiece.model tokenizer model filename if not tokenizer_path.is_file(): tokenizer_path = self.dir_model / "spiece.model" if not tokenizer_path.is_file(): raise FileNotFoundError(f"File not found: {tokenizer_path}") - sentencepiece_model = model.ModelProto() + sentencepiece_model = ( + model.ModelProto() + ) # pyright: ignore[reportAttributeAccessIssue] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - if sentencepiece_model.trainer_spec.model_type == 2: - + # some models like Pile-T5 family use BPE tokenizer instead of Unigram + if sentencepiece_model.trainer_spec.model_type == 2: # BPE + # assure the tokenizer model file name is correct assert tokenizer_path.name == "tokenizer.model" return self._set_vocab_sentencepiece() else: - assert sentencepiece_model.trainer_spec.model_type == 1 + assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix remove_whitespaces = ( @@ -4129,8 +4930,12 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused + # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", + # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored + # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder + # and decoder and ignore the remaining ones. 
if name in [ "decoder.embed_tokens.weight", "encoder.embed_tokens.weight", @@ -4155,15 +4960,14 @@ class JaisModel(Model): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + # SwigLU activation assert self.hparams["activation_function"] == "swiglu" - + # ALiBi position embedding assert self.hparams["position_embedding_type"] == "alibi" + # Embeddings scale self.embeddings_scale = 1.0 - - self.output_is_wte = False if "mup_embeddings_scale" in self.hparams: - self.output_is_wte = True self.embeddings_scale = self.hparams["mup_embeddings_scale"] elif "embeddings_scale" in self.hparams: self.embeddings_scale = self.hparams["embeddings_scale"] @@ -4198,15 +5002,19 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused tensors: list[tuple[str, Tensor]] = [] + # we don't need these if name.endswith((".attn.bias")): return tensors if name.endswith(("relative_pe.slopes")): - + # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation) + # Some other models has max_alibi_bias spelled out explicitly in the hyperparams, + # but Jais's PyTorch model simply precalculates the slope values and places them + # in relative_pes.slopes n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"])) first_val = float(data_torch[0].item()) self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2) @@ -4222,15 +5030,7 @@ def modify_tensors( if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): tensors.append((new_name, data_torch * self.embeddings_scale)) - if self.output_is_wte: - tensors.append( - ( - self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), - data_torch * self.width_scale, - ) - ) elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT): - assert not self.output_is_wte tensors.append((new_name, data_torch * self.width_scale)) else: tensors.append((new_name, data_torch)) @@ -4282,7 +5082,8 @@ def set_vocab_chatglm3(self): text = piece.encode("utf-8") score = 0.0 - + # Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py), + # it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size() if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size(): score = tokenizer.tokenizer.sp_model.get_score(token_id) @@ -4314,7 +5115,8 @@ def set_vocab_chatglm3(self): toktypes.append(toktype) self.gguf_writer.add_tokenizer_model("llama") - + # glm3 needs prefix and suffix formatted as: + # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>" self.gguf_writer.add_tokenizer_pre("chatglm-spm") self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_scores(scores) @@ -4382,6 +5184,7 @@ def set_vocab(self): assert len(merged) >= 2 and len(merged) <= 7 merges.append(" ".join(map(ChatGLMModel.token_bytes_to_string, merged))) + # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined added_vocab = tokenizer.get_added_vocab() reverse_vocab = { id_: encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items() @@ -4408,12 +5211,12 @@ def set_vocab(self): special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) special_vocab.merges = merges - + # only add special tokens when they were not already loaded from config.json special_vocab._set_special_token( "eos", tokenizer.get_added_vocab()["<|endoftext|>"] ) special_vocab._set_special_token("eot", 
tokenizer.get_added_vocab()["<|user|>"]) - + # this one is usually not in config.json anyway special_vocab._set_special_token( "unk", tokenizer.get_added_vocab()["<|endoftext|>"] ) @@ -4443,7 +5246,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused if name.endswith(".rotary_pos_emb.inv_freq"): return [] @@ -4471,6 +5274,7 @@ def set_gguf_parameters(self): ) self.gguf_writer.add_layer_norm_eps(f_norm_eps) + # * Partial RoPE rot_pct = self.find_hparam( ["partial_rotary_factor", "rope_pct", "rope_percent"] ) @@ -4478,6 +5282,7 @@ def set_gguf_parameters(self): n_head = self.find_hparam(["num_attention_heads", "n_head"]) self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) + # * RopeScaling for Nemotron if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None: self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) else: @@ -4487,7 +5292,10 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - + # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side + # model.layers.{l}.input_layernorm.weight + # model.layers.{l}.post_attention_layernorm.weight + # model.norm.weight if name.endswith("norm.weight"): data_torch = data_torch + 1 @@ -4514,7 +5322,10 @@ def set_gguf_parameters(self): else 4 * embed_dim ) num_layers = hparams["num_layers"] - + # ignore for now as EXAONE-3.0-7.8B-Instruct attentino_dropout is 0.0 + # attention_dropout_rate = hparams["attention_dropout"] + # ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0 + # embed_dropout_rate = hparams["embed_dropout"] self.gguf_writer.add_embedding_length(embed_dim) self.gguf_writer.add_head_count(num_heads) self.gguf_writer.add_head_count_kv(num_kv_heads) @@ -4546,7 +5357,7 @@ def set_gguf_parameters(self): hparams["rope_scaling"]["factor"] ) - def prepare_tensors(self): + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): if rope_scaling.get("rope_type", "").lower() == "llama3": base = self.hparams.get("rope_theta", 10000.0) @@ -4582,13 +5393,10 @@ def prepare_tensors(self): ) rope_factors.append(1 / ((1 - smooth) / factor + smooth)) - if not self.is_lora: - self.gguf_writer.add_tensor( - self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), - np.array(rope_factors, dtype=np.float32), - ) - - super().prepare_tensors() + yield ( + self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), + torch.tensor(rope_factors, dtype=torch.float32), + ) @Model.register("GraniteForCausalLM") @@ -4610,151 +5418,128 @@ def set_gguf_parameters(self): if head_dim := self.hparams.pop("head_dim", None): logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim) super().set_gguf_parameters() - + # NOTE: Convert _multiplier params to _scale params for naming + # consistency if attention_scale := self.hparams.get("attention_multiplier"): self.gguf_writer.add_attention_scale(attention_scale) + logger.info("gguf: (granite) attention_scale = %s", attention_scale) if embedding_scale := self.hparams.get("embedding_multiplier"): self.gguf_writer.add_embedding_scale(embedding_scale) + logger.info("gguf: (granite) embedding_scale = %s", embedding_scale) if residual_scale := self.hparams.get("residual_multiplier"): 
self.gguf_writer.add_residual_scale(residual_scale) - if logits_scaling := self.hparams.get("logits_scaling"): - self.gguf_writer.add_logit_scale(logits_scaling) + logger.info("gguf: (granite) residual_scale = %s", residual_scale) + if logits_scale := self.hparams.get("logits_scaling"): + self.gguf_writer.add_logit_scale(logits_scale) + logger.info("gguf: (granite) logits_scale = %s", logits_scale) -@Model.register("JambaForCausalLM") -class JambaModel(Model): - model_arch = gguf.MODEL_ARCH.JAMBA +@Model.register("GraniteMoeForCausalLM") +class GraniteMoeModel(GraniteModel): + """Conversion for IBM's GraniteMoeForCausalLM""" - def get_vocab_base_pre(self, tokenizer) -> str: - del tokenizer - - return "gpt-2" - - def set_vocab(self): - if (self.dir_model / "tokenizer.model").is_file(): - - self._set_vocab_sentencepiece() - else: - - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - d_model = self.find_hparam(["hidden_size", "mamba_d_model"]) - d_conv = self.find_hparam(["mamba_d_conv"], optional=True) or 4 - d_inner = self.hparams["mamba_expand"] * d_model - d_state = self.find_hparam(["mamba_d_state"], optional=True) or 16 - - dt_rank = self.find_hparam(["mamba_dt_rank"], optional=True) or -( - d_model // -16 - ) - rms_norm_eps = ( - self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) - or 1e-6 - ) - n_kv_head = self.hparams["num_key_value_heads"] - attn_offset = self.hparams["attn_layer_offset"] - attn_period = self.hparams["attn_layer_period"] - n_kv_vec = [0 for _ in range(attn_offset)] + [ - n_kv_head if (i - attn_offset) % attn_period == 0 else 0 - for i in range(attn_offset, self.block_count) - ] - - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_context_length( - self.find_hparam(["max_position_embeddings", "n_ctx"]) - ) - self.gguf_writer.add_embedding_length(d_model) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(n_kv_vec) - self.gguf_writer.add_ssm_conv_kernel(d_conv) - self.gguf_writer.add_ssm_inner_size(d_inner) - self.gguf_writer.add_ssm_state_size(d_state) - self.gguf_writer.add_ssm_time_step_rank(dt_rank) - self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) - self.gguf_writer.add_expert_count(self.hparams["num_experts"]) - self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"]) - self.gguf_writer.add_file_type(self.ftype) - - _experts: list[dict[str, Tensor]] | None = None + model_arch = gguf.MODEL_ARCH.GRANITE_MOE def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: + """In modeling_granitemoe, the JetMoe implementation of parallel experts + is used. This essentially merges w1 and w3 into a single tensor with 2x + the hidden size that is then split during forward. To keep compatibility + with existing mixtral support, we pull them apart here. + """ - name = name.replace(".moe.", ".feed_forward.") - if bid is not None: - moe_offset = self.hparams["expert_layer_offset"] - moe_period = self.hparams["expert_layer_period"] - - if not (bid >= moe_offset and (bid - moe_offset) % moe_period == 0): - name = name.replace(".experts.0.", ".") - - if ".feed_forward.experts." 
in name: - n_experts = self.hparams["num_experts"] - - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: + if name.endswith("block_sparse_moe.input_linear.weight"): + ffn_dim = self.hparams["intermediate_size"] + assert ( + data_torch.shape[-2] == 2 * ffn_dim + ), "Merged FFN tensor size must be 2 * intermediate_size" + gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :] + return [ + (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate), + (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up), + ] - for wid in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] + return super().modify_tensors(data_torch, name, bid) - for xid in range(n_experts): - ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{wid}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - data_torch = torch.stack(datas, dim=0) +@Model.register("ChameleonForConditionalGeneration") +@Model.register("ChameleonForCausalLM") # obsolete +class ChameleonModel(Model): + model_arch = gguf.MODEL_ARCH.CHAMELEON - merged_name = f"model.layers.{bid}.mlp.experts.{wid}.weight" + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_swin_norm(self.hparams.get("swin_norm", False)) - new_name = self.map_tensor_name(merged_name) + def set_vocab(self): + self._set_vocab_gpt2() - yield new_name, data_torch - return + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + # ignore image tokenizer for now + # TODO: remove this once image support is implemented for Chameleon + if name.startswith("model.vqmodel"): + return [] - new_name = self.map_tensor_name(name) + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + hidden_dim = self.hparams.get("hidden_size") - if name.endswith(".A_log"): - logger.debug("A_log --> A ==> " + new_name) - data_torch = -torch.exp(data_torch) + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + if name.endswith(("q_norm.weight", "q_norm.bias")): + data_torch = ChameleonModel._reverse_hf_permute( + data_torch, n_head, hidden_dim + ) + if name.endswith(("k_norm.weight", "k_norm.bias")): + data_torch = ChameleonModel._reverse_hf_permute( + data_torch, n_kv_head, hidden_dim + ) - yield new_name, data_torch + return [(self.map_tensor_name(name), data_torch)] - def prepare_tensors(self): - super().prepare_tensors() + # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203 + @staticmethod + def _reverse_hf_permute(data_torch, n_heads, hidden_dim): + head_dim = hidden_dim // n_heads + data_torch = data_torch[0].view(2, head_dim // 2).t().reshape(1, -1) + data_torch = data_torch.repeat_interleave(n_heads, 0) + return data_torch - if self._experts is not None: - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") +###### CONVERSION LOGIC ###### +# tree of lazy tensors class LazyTorchTensor(gguf.LazyBase): _tensor_type = torch.Tensor - + # to keep the type-checker happy 
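+ # (The class-level annotations below mirror torch.Tensor's attributes so that static type
+ # checkers accept LazyTorchTensor wherever a Tensor is expected. A minimal hedged sketch of
+ # the lazy wrapper in use, assuming an eager tensor `t`:
+ #   lazy = LazyTorchTensor.from_eager(t)   # wrap; data is not copied or converted yet
+ #   lazy.dtype, lazy.shape                 # metadata is available without materializing
+ # Actual evaluation is deferred until the GGUF writer consumes the tensor.)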
dtype: torch.dtype shape: torch.Size + # only used when converting a torch.Tensor to a np.ndarray _dtype_map: dict[torch.dtype, type] = { torch.float16: np.float16, torch.float32: np.float32, } + # used for safetensors slices + # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046 + # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734 _dtype_str_map: dict[str, torch.dtype] = { "F64": torch.float64, "F32": torch.float32, "BF16": torch.bfloat16, "F16": torch.float16, + # "U64": torch.uint64, "I64": torch.int64, + # "U32": torch.uint32, "I32": torch.int32, + # "U16": torch.uint16, "I16": torch.int16, "U8": torch.uint8, "I8": torch.int8, @@ -4790,7 +5575,7 @@ def from_safetensors_slice(cls, st_slice: Any) -> Tensor: @classmethod def __torch_function__(cls, func, types, args=(), kwargs=None): - del types + del types # unused if kwargs is None: kwargs = {} @@ -4808,60 +5593,77 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--vocab-only", action="store_true", + help="extract only the vocab", ) parser.add_argument( "--outfile", type=Path, + help="path to write to; default: based on input. {ftype} will be replaced by the outtype.", ) parser.add_argument( "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16", + help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", ) parser.add_argument( "--bigendian", action="store_true", + help="model is executed on big endian machine", ) parser.add_argument( "model", type=Path, + help="directory containing model file", + ) + parser.add_argument( + "--use-temp-file", + action="store_true", + help="use the tempfile library while processing (helpful when running out of memory, process killed)", ) - parser.add_argument("--use-temp-file", action="store_true") parser.add_argument( "--no-lazy", action="store_true", + help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)", ) parser.add_argument( "--model-name", type=str, default=None, + help="name of the model", ) parser.add_argument( "--verbose", action="store_true", + help="increase output verbosity", ) parser.add_argument( "--split-max-tensors", type=int, default=0, + help="max tensors in each split", ) parser.add_argument( "--split-max-size", type=str, default="0", + help="max size per split N(M|G)", ) parser.add_argument( "--dry-run", action="store_true", + help="only print out a split plan and exit, without writing any new files", ) parser.add_argument( "--no-tensor-first-split", action="store_true", + help="do not add tensors to the first split (disabled by default)", ) parser.add_argument( "--metadata", type=Path, + help="Specify the path for an authorship metadata override file", ) return parser.parse_args() diff --git a/src/convert_lora_to_gguf.py b/src/convert_lora_to_gguf.py index d5354d2..e132412 100644 --- a/src/convert_lora_to_gguf.py +++ b/src/convert_lora_to_gguf.py @@ -18,14 +18,16 @@ SupportsIndex, cast, ) +from transformers import AutoConfig import torch if TYPE_CHECKING: from torch import Tensor -from gguf.constants import * +import gguf +# reuse model definitions from convert_hf_to_gguf.py from convert_hf_to_gguf import LazyTorchTensor, Model logger = logging.getLogger("lora-to-gguf") @@ -37,9 +39,10 @@ class 
PartialLoraTensor: B: Tensor | None = None +# magic to support tensor shape modifications and splitting class LoraTorchTensor: - _lora_A: Tensor - _lora_B: Tensor + _lora_A: Tensor # (n_rank, row_size) + _lora_B: Tensor # (col_size, n_rank) _rank: int def __init__(self, A: Tensor, B: Tensor): @@ -57,14 +60,20 @@ def get_lora_A_B(self) -> tuple[Tensor, Tensor]: def __getitem__( self, - indices: SupportsIndex | slice | tuple[SupportsIndex | slice | Tensor, ...], + indices: ( + SupportsIndex + | slice + | tuple[ + SupportsIndex | slice | Tensor, ... + ] # TODO: add ellipsis in the type signature + ), ) -> LoraTorchTensor: shape = self.shape if isinstance(indices, SupportsIndex): if len(shape) > 2: return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices]) else: - raise NotImplementedError + raise NotImplementedError # can't return a vector elif isinstance(indices, slice): if len(shape) > 2: return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices]) @@ -74,7 +83,7 @@ def __getitem__( assert len(indices) > 0 if indices[-1] is Ellipsis: return self[indices[:-1]] - + # expand ellipsis indices = tuple( u for v in ( @@ -94,6 +103,7 @@ def __getitem__( *(slice(None, None) for _ in range(len(indices), len(shape))), ) + # TODO: make sure this is correct indices_A = ( *( ( @@ -109,7 +119,7 @@ def __getitem__( indices_B = indices[:-1] return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B]) else: - raise NotImplementedError + raise NotImplementedError # unknown indice type @property def dtype(self) -> torch.dtype: @@ -132,8 +142,9 @@ def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor: new_shape = cast(tuple[int, ...], shape) orig_shape = self.shape if len(new_shape) < 2: - raise NotImplementedError + raise NotImplementedError # can't become a vector + # expand -1 in the shape if any(dim == -1 for dim in new_shape): n_elems = prod(orig_shape) n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape) @@ -143,7 +154,7 @@ def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor: ) if new_shape[-1] != orig_shape[-1]: - raise NotImplementedError + raise NotImplementedError # can't reshape the row size trivially shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1]) shape_B = (*new_shape[:-1], self._rank) @@ -162,7 +173,7 @@ def permute(self, *dims: int) -> LoraTorchTensor: shape = self.shape dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims) if dims[-1] == -1: - + # TODO: support higher dimensional A shapes bigger than 1 assert all(dim == 1 for dim in self._lora_A.shape[:-2]) return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims)) if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1: @@ -170,7 +181,7 @@ def permute(self, *dims: int) -> LoraTorchTensor: self._lora_B.permute(*dims), self._lora_A.permute(*dims) ) else: - + # TODO: compose the above two raise NotImplementedError def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor: @@ -189,7 +200,7 @@ def to(self, *args, **kwargs): @classmethod def __torch_function__(cls, func: Callable, types, args=(), kwargs=None): - del types + del types # unused if kwargs is None: kwargs = {} @@ -230,28 +241,73 @@ def get_base_tensor_name(lora_tensor_name: str) -> str: base_name = lora_tensor_name.replace("base_model.model.", "") base_name = base_name.replace(".lora_A.weight", ".weight") base_name = base_name.replace(".lora_B.weight", ".weight") + # models produced by mergekit-extract-lora have token embeddings in the adapter + base_name = 
base_name.replace(".lora_embedding_A", ".weight") + base_name = base_name.replace(".lora_embedding_B", ".weight") return base_name def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser() - parser.add_argument("--outfile", type=Path) + parser = argparse.ArgumentParser( + description="Convert a Hugging Face PEFT LoRA adapter to a GGUF file" + ) + parser.add_argument( + "--outfile", + type=Path, + help="path to write to; default: based on input. {ftype} will be replaced by the outtype.", + ) parser.add_argument( "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16", + help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", + ) + parser.add_argument( + "--bigendian", + action="store_true", + help="model is executed on big endian machine", + ) + parser.add_argument( + "--no-lazy", + action="store_true", + help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="increase output verbosity", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="only print out what will be done, without writing any new files", + ) + parser.add_argument( + "--base", + type=Path, + help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config", + ) + parser.add_argument( + "--base-model-id", + type=str, + help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')", + ) + parser.add_argument( + "lora_path", + type=Path, + help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)", ) - parser.add_argument("--bigendian", action="store_true") - parser.add_argument("--no-lazy", action="store_true") - parser.add_argument("--verbose", action="store_true") - parser.add_argument("--dry-run", action="store_true") - parser.add_argument("--base", type=Path, required=True) - parser.add_argument("lora_path", type=Path) return parser.parse_args() +def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]: + # normally, adapter does not come with base model config, we need to load it from AutoConfig + config = AutoConfig.from_pretrained(hf_model_id) + return config.to_dict() + + if __name__ == "__main__": args = parse_args() logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) @@ -266,19 +322,20 @@ def parse_args() -> argparse.Namespace: ftype = ftype_map[args.outtype] - dir_base_model: Path = args.base + dir_base_model: Path | None = args.base dir_lora: Path = args.lora_path + base_model_id: str | None = args.base_model_id lora_config = dir_lora / "adapter_config.json" input_model = dir_lora / "adapter_model.safetensors" if args.outfile is not None: fname_out = args.outfile else: - + # output in the same directory as the model by default fname_out = dir_lora if os.path.exists(input_model): - + # lazy import load_file only if lora is in safetensors format. 
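+ # For reference, an example invocation of this converter (paths and output name are
+ # hypothetical; the base-model ID is the one quoted in the --base-model-id help text):
+ #   python convert_lora_to_gguf.py ./my-adapter --outtype f16 --outfile my-adapter-f16.gguf \
+ #       --base-model-id meta-llama/Llama-3.2-1B-Instruct
+ # When neither --base nor --base-model-id is given, the base config is resolved from
+ # "base_model_name_or_path" in adapter_config.json via load_hparams_from_hf().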
from safetensors.torch import load_file lora_model = load_file(input_model, device="cpu") @@ -286,8 +343,38 @@ def parse_args() -> argparse.Namespace: input_model = os.path.join(dir_lora, "adapter_model.bin") lora_model = torch.load(input_model, map_location="cpu", weights_only=True) - logger.info(f"Loading base model: {dir_base_model.name}") - hparams = Model.load_hparams(dir_base_model) + # load LoRA config + with open(lora_config, "r") as f: + lparams: dict[str, Any] = json.load(f) + + # load base model + if base_model_id is not None: + logger.info(f"Loading base model from Hugging Face: {base_model_id}") + hparams = load_hparams_from_hf(base_model_id) + elif dir_base_model is None: + if "base_model_name_or_path" in lparams: + model_id = lparams["base_model_name_or_path"] + logger.info(f"Loading base model from Hugging Face: {model_id}") + try: + hparams = load_hparams_from_hf(model_id) + except OSError as e: + logger.error(f"Failed to load base model config: {e}") + logger.error( + "Please try downloading the base model and add its path to --base" + ) + sys.exit(1) + else: + logger.error( + "'base_model_name_or_path' is not found in adapter_config.json" + ) + logger.error( + "Base model config is required. Please download the base model and add its path to --base" + ) + sys.exit(1) + else: + logger.info(f"Loading base model: {dir_base_model.name}") + hparams = Model.load_hparams(dir_base_model) + with torch.inference_mode(): try: model_class = Model.from_model_architecture(hparams["architectures"][0]) @@ -309,6 +396,9 @@ def __init__( self.dir_model_card = dir_lora_model self.lora_alpha = float(lora_alpha) + def set_vocab(self): + pass + def set_type(self): self.gguf_writer.add_type(gguf.GGUFType.ADAPTER) self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora") @@ -317,7 +407,10 @@ def set_gguf_parameters(self): self.gguf_writer.add_float32( gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha ) - super().set_gguf_parameters() + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + # Never add extra tensors (e.g. rope_freqs) for LoRA adapters + return () def get_tensors(self) -> Iterator[tuple[str, Tensor]]: tensor_map: dict[str, PartialLoraTensor] = {} @@ -326,14 +419,26 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: if self.lazy: tensor = LazyTorchTensor.from_eager(tensor) base_name = get_base_tensor_name(name) - is_lora_a = ".lora_A.weight" in name - is_lora_b = ".lora_B.weight" in name + # note: mergekit-extract-lora also adds token embeddings to the adapter + is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name + is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name if not is_lora_a and not is_lora_b: if ".base_layer.weight" in name: continue + # mergekit-extract-lora add these layernorm to the adapter, we need to keep them + if "_layernorm" in name or ".norm" in name: + yield (base_name, tensor) + continue logger.error( f"Unexpected name '{name}': Not a lora_A or lora_B tensor" ) + if ".embed_tokens.weight" in name or ".lm_head.weight" in name: + logger.error( + "Embeddings is present in the adapter. 
This can be due to new tokens added during fine tuning" + ) + logger.error( + "Please refer to https://github.com/ggerganov/llama.cpp/pull/9948" + ) sys.exit(1) if base_name in tensor_map: @@ -358,17 +463,34 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - dest = super().modify_tensors(data_torch, name, bid) + dest = list(super().modify_tensors(data_torch, name, bid)) + # some archs may have the same tensor for lm_head and output (tie word embeddings) + # in this case, adapters targeting lm_head will fail when using llama-export-lora + # therefore, we ignore them for now + # see: https://github.com/ggerganov/llama.cpp/issues/9065 + if name == "lm_head.weight" and len(dest) == 0: + raise ValueError( + "lm_head is present in adapter, but is ignored in base model" + ) for dest_name, dest_data in dest: + # mergekit-extract-lora add these layernorm to the adapter + if "_norm" in dest_name: + assert dest_data.dim() == 1 + yield (dest_name, dest_data) + continue + + # otherwise, we must get the lora_A and lora_B tensors assert isinstance(dest_data, LoraTorchTensor) lora_a, lora_b = dest_data.get_lora_A_B() + # note: mergekit-extract-lora flip and transpose A and B + # here we only need to transpose token_embd.lora_a, see llm_build_inp_embd() + if "token_embd.weight" in dest_name: + lora_a = lora_a.T + yield (dest_name + ".lora_a", lora_a) yield (dest_name + ".lora_b", lora_b) - with open(lora_config, "r") as f: - lparams: dict[str, Any] = json.load(f) - alpha: float = lparams["lora_alpha"] model_instance = LoraModel( @@ -381,7 +503,7 @@ def modify_tensors( dry_run=args.dry_run, dir_lora_model=dir_lora, lora_alpha=alpha, - is_lora=True, + hparams=hparams, ) logger.info("Exporting model...") diff --git a/src/gguf/constants.py b/src/gguf/constants.py index fe7c905..6e9ee09 100644 --- a/src/gguf/constants.py +++ b/src/gguf/constants.py @@ -3,10 +3,18 @@ from enum import Enum, IntEnum, auto from typing import Any -GGUF_MAGIC = 0x46554747 +# +# constants +# + +GGUF_MAGIC = 0x46554747 # "GGUF" GGUF_VERSION = 3 GGUF_DEFAULT_ALIGNMENT = 32 -GGML_QUANT_VERSION = 2 +GGML_QUANT_VERSION = 2 # GGML_QNT_VERSION from ggml.h + +# +# metadata keys +# class Keys: @@ -17,6 +25,7 @@ class General: ALIGNMENT = "general.alignment" FILE_TYPE = "general.file_type" + # Authorship Metadata NAME = "general.name" AUTHOR = "general.author" VERSION = "general.version" @@ -30,38 +39,62 @@ class General: SIZE_LABEL = "general.size_label" + # Licensing details LICENSE = "general.license" LICENSE_NAME = "general.license.name" LICENSE_LINK = "general.license.link" - URL = "general.url" + # Typically represents the converted GGUF repo (Unless native) + URL = "general.url" # Model Website/Paper DOI = "general.doi" UUID = "general.uuid" - REPO_URL = "general.repo_url" + REPO_URL = "general.repo_url" # Model Source Repository (git/svn/etc...) - SOURCE_URL = "general.source.url" + # Model Source during conversion + SOURCE_URL = "general.source.url" # Model Website/Paper SOURCE_DOI = "general.source.doi" SOURCE_UUID = "general.source.uuid" - SOURCE_REPO_URL = "general.source.repo_url" + SOURCE_REPO_URL = ( + "general.source.repo_url" # Model Source Repository (git/svn/etc...) + ) + # Base Model Source. There can be more than one source if it's a merged + # model like with 'Mistral-7B-Merge-14-v0.1'. This will assist in + # tracing linage of models as it is finetuned or merged over time. 
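# Reviewer note: the {id} placeholder in the keys below is expanded once per
# base-model entry by the matching GGUFWriter.add_base_model_* helpers added
# later in this patch. A commented sketch of the resulting key names, assuming
# zero-based ids from enumerating the metadata list:
#
#   for i in range(2):  # two hypothetical base models of a merge
#       print(Keys.General.BASE_MODEL_NAME.format(id=i))
#   # -> general.base_model.0.name
#   # -> general.base_model.1.name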
BASE_MODEL_COUNT = "general.base_model.count" BASE_MODEL_NAME = "general.base_model.{id}.name" BASE_MODEL_AUTHOR = "general.base_model.{id}.author" BASE_MODEL_VERSION = "general.base_model.{id}.version" BASE_MODEL_ORGANIZATION = "general.base_model.{id}.organization" - BASE_MODEL_URL = "general.base_model.{id}.url" + BASE_MODEL_DESCRIPTION = "general.base_model.{id}.description" + BASE_MODEL_URL = "general.base_model.{id}.url" # Model Website/Paper BASE_MODEL_DOI = "general.base_model.{id}.doi" BASE_MODEL_UUID = "general.base_model.{id}.uuid" - BASE_MODEL_REPO_URL = "general.base_model.{id}.repo_url" + BASE_MODEL_REPO_URL = "general.base_model.{id}.repo_url" # Model Source Repository (git/svn/etc...) + + # Dataset Source + DATASET_COUNT = "general.dataset.count" + DATASET_NAME = "general.dataset.{id}.name" + DATASET_AUTHOR = "general.dataset.{id}.author" + DATASET_VERSION = "general.dataset.{id}.version" + DATASET_ORGANIZATION = "general.dataset.{id}.organization" + DATASET_DESCRIPTION = "general.dataset.{id}.description" + DATASET_URL = "general.dataset.{id}.url" # Model Website/Paper + DATASET_DOI = "general.dataset.{id}.doi" + DATASET_UUID = "general.dataset.{id}.uuid" + DATASET_REPO_URL = ( + "general.dataset.{id}.repo_url" # Model Source Repository (git/svn/etc...) + ) + # Array based KV stores TAGS = "general.tags" LANGUAGES = "general.languages" - DATASETS = "general.datasets" class LLM: VOCAB_SIZE = "{arch}.vocab_size" CONTEXT_LENGTH = "{arch}.context_length" EMBEDDING_LENGTH = "{arch}.embedding_length" + FEATURES_LENGTH = "{arch}.features_length" BLOCK_COUNT = "{arch}.block_count" LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count" FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" @@ -73,11 +106,14 @@ class LLM: EXPERT_USED_COUNT = "{arch}.expert_used_count" EXPERT_SHARED_COUNT = "{arch}.expert_shared_count" EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale" + EXPERT_WEIGHTS_NORM = "{arch}.expert_weights_norm" + EXPERT_GATING_FUNC = "{arch}.expert_gating_func" POOLING_TYPE = "{arch}.pooling_type" LOGIT_SCALE = "{arch}.logit_scale" DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id" ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping" FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping" + SWIN_NORM = "{arch}.swin_norm" RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers" TIME_MIX_EXTRA_DIM = "{arch}.time_mix_extra_dim" TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim" @@ -93,6 +129,8 @@ class Attention: VALUE_LENGTH = "{arch}.attention.value_length" LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon" + GROUPNORM_EPS = "{arch}.attention.group_norm_epsilon" + GROUPNORM_GROUPS = "{arch}.attention.group_norm_groups" CAUSAL = "{arch}.attention.causal" Q_LORA_RANK = "{arch}.attention.q_lora_rank" KV_LORA_RANK = "{arch}.attention.kv_lora_rank" @@ -102,6 +140,7 @@ class Attention: class Rope: DIMENSION_COUNT = "{arch}.rope.dimension_count" + DIMENSION_SECTIONS = "{arch}.rope.dimension_sections" FREQ_BASE = "{arch}.rope.freq_base" SCALING_TYPE = "{arch}.rope.scaling.type" SCALING_FACTOR = "{arch}.rope.scaling.factor" @@ -125,16 +164,28 @@ class SSM: class WKV: HEAD_SIZE = "{arch}.wkv.head_size" + class PosNet: + EMBEDDING_LENGTH = "{arch}.posnet.embedding_length" + BLOCK_COUNT = "{arch}.posnet.block_count" + + class ConvNext: + EMBEDDING_LENGTH = "{arch}.convnext.embedding_length" + BLOCK_COUNT = "{arch}.convnext.block_count" + class Tokenizer: MODEL = "tokenizer.ggml.model" PRE = 
"tokenizer.ggml.pre" LIST = "tokenizer.ggml.tokens" TOKEN_TYPE = "tokenizer.ggml.token_type" - TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count" + TOKEN_TYPE_COUNT = ( + "tokenizer.ggml.token_type_count" # for BERT-style token types + ) SCORES = "tokenizer.ggml.scores" MERGES = "tokenizer.ggml.merges" BOS_ID = "tokenizer.ggml.bos_token_id" EOS_ID = "tokenizer.ggml.eos_token_id" + EOT_ID = "tokenizer.ggml.eot_token_id" + EOM_ID = "tokenizer.ggml.eom_token_id" UNK_ID = "tokenizer.ggml.unknown_token_id" SEP_ID = "tokenizer.ggml.seperator_token_id" PAD_ID = "tokenizer.ggml.padding_token_id" @@ -150,18 +201,28 @@ class Tokenizer: CHAT_TEMPLATE = "tokenizer.chat_template" CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}" CHAT_TEMPLATES = "tokenizer.chat_templates" - + # FIM/Infill special tokens constants + FIM_PRE_ID = "tokenizer.ggml.fim_pre_token_id" + FIM_SUF_ID = "tokenizer.ggml.fim_suf_token_id" + FIM_MID_ID = "tokenizer.ggml.fim_mid_token_id" + FIM_PAD_ID = "tokenizer.ggml.fim_pad_token_id" + FIM_REP_ID = "tokenizer.ggml.fim_rep_token_id" + FIM_SEP_ID = "tokenizer.ggml.fim_sep_token_id" + # deprecated: PREFIX_ID = "tokenizer.ggml.prefix_token_id" SUFFIX_ID = "tokenizer.ggml.suffix_token_id" MIDDLE_ID = "tokenizer.ggml.middle_token_id" - EOT_ID = "tokenizer.ggml.eot_token_id" - EOM_ID = "tokenizer.ggml.eom_token_id" class Adapter: TYPE = "adapter.type" LORA_ALPHA = "adapter.lora.alpha" +# +# recommended mapping of model tensor names for storage in gguf +# + + class GGUFType: MODEL = "model" ADAPTER = "adapter" @@ -169,6 +230,7 @@ class GGUFType: class MODEL_ARCH(IntEnum): LLAMA = auto() + DECI = auto() FALCON = auto() BAICHUAN = auto() GROK = auto() @@ -186,6 +248,7 @@ class MODEL_ARCH(IntEnum): QWEN = auto() QWEN2 = auto() QWEN2MOE = auto() + QWEN2VL = auto() PHI2 = auto() PHI3 = auto() PLAMO = auto() @@ -199,14 +262,16 @@ class MODEL_ARCH(IntEnum): STARCODER2 = auto() RWKV6 = auto() MAMBA = auto() - JAMBA = auto() XVERSE = auto() COMMAND_R = auto() + COHERE2 = auto() DBRX = auto() OLMO = auto() + OLMO2 = auto() OLMOE = auto() OPENELM = auto() ARCTIC = auto() + DEEPSEEK = auto() DEEPSEEK2 = auto() CHATGLM = auto() BITNET = auto() @@ -216,6 +281,9 @@ class MODEL_ARCH(IntEnum): NEMOTRON = auto() EXAONE = auto() GRANITE = auto() + GRANITE_MOE = auto() + CHAMELEON = auto() + WAVTOKENIZER_DEC = auto() class MODEL_TENSOR(IntEnum): @@ -254,6 +322,7 @@ class MODEL_TENSOR(IntEnum): FFN_GATE_SHEXP = auto() FFN_DOWN_SHEXP = auto() FFN_UP_SHEXP = auto() + FFN_EXP_PROBS_B = auto() ATTN_Q_NORM = auto() ATTN_K_NORM = auto() LAYER_OUT_NORM = auto() @@ -261,10 +330,7 @@ class MODEL_TENSOR(IntEnum): SSM_CONV1D = auto() SSM_X = auto() SSM_DT = auto() - SSM_DT_NORM = auto() SSM_A = auto() - SSM_B_NORM = auto() - SSM_C_NORM = auto() SSM_D = auto() SSM_OUT = auto() TIME_MIX_W1 = auto() @@ -326,10 +392,29 @@ class MODEL_TENSOR(IntEnum): ENC_FFN_DOWN = auto() ENC_FFN_UP = auto() ENC_OUTPUT_NORM = auto() + CLS = auto() # classifier + CLS_OUT = auto() # classifier output projection + CONV1D = auto() + CONVNEXT_DW = auto() + CONVNEXT_NORM = auto() + CONVNEXT_PW1 = auto() + CONVNEXT_PW2 = auto() + CONVNEXT_GAMMA = auto() + POSNET_CONV1 = auto() + POSNET_CONV2 = auto() + POSNET_NORM = auto() + POSNET_NORM1 = auto() + POSNET_NORM2 = auto() + POSNET_ATTN_NORM = auto() + POSNET_ATTN_Q = auto() + POSNET_ATTN_K = auto() + POSNET_ATTN_V = auto() + POSNET_ATTN_OUT = auto() MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.LLAMA: "llama", + MODEL_ARCH.DECI: "deci", MODEL_ARCH.FALCON: "falcon", MODEL_ARCH.BAICHUAN: 
"baichuan", MODEL_ARCH.GROK: "grok", @@ -347,6 +432,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.QWEN: "qwen", MODEL_ARCH.QWEN2: "qwen2", MODEL_ARCH.QWEN2MOE: "qwen2moe", + MODEL_ARCH.QWEN2VL: "qwen2vl", MODEL_ARCH.PHI2: "phi2", MODEL_ARCH.PHI3: "phi3", MODEL_ARCH.PLAMO: "plamo", @@ -360,14 +446,16 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.RWKV6: "rwkv6", MODEL_ARCH.MAMBA: "mamba", - MODEL_ARCH.JAMBA: "jamba", MODEL_ARCH.XVERSE: "xverse", MODEL_ARCH.COMMAND_R: "command-r", + MODEL_ARCH.COHERE2: "cohere2", MODEL_ARCH.DBRX: "dbrx", MODEL_ARCH.OLMO: "olmo", + MODEL_ARCH.OLMO2: "olmo2", MODEL_ARCH.OLMOE: "olmoe", MODEL_ARCH.OPENELM: "openelm", MODEL_ARCH.ARCTIC: "arctic", + MODEL_ARCH.DEEPSEEK: "deepseek", MODEL_ARCH.DEEPSEEK2: "deepseek2", MODEL_ARCH.CHATGLM: "chatglm", MODEL_ARCH.BITNET: "bitnet", @@ -377,6 +465,9 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.NEMOTRON: "nemotron", MODEL_ARCH.EXAONE: "exaone", MODEL_ARCH.GRANITE: "granite", + MODEL_ARCH.GRANITE_MOE: "granitemoe", + MODEL_ARCH.CHAMELEON: "chameleon", + MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -417,15 +508,13 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps", MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps", MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps", + MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b", MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d", MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x", MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt", - MODEL_TENSOR.SSM_DT_NORM: "blk.{bid}.ssm_dt_norm", MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a", - MODEL_TENSOR.SSM_B_NORM: "blk.{bid}.ssm_b_norm", - MODEL_TENSOR.SSM_C_NORM: "blk.{bid}.ssm_c_norm", MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1", @@ -487,6 +576,24 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down", MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up", MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm", + MODEL_TENSOR.CLS: "cls", + MODEL_TENSOR.CLS_OUT: "cls.output", + MODEL_TENSOR.CONV1D: "conv1d", + MODEL_TENSOR.CONVNEXT_DW: "convnext.{bid}.dw", + MODEL_TENSOR.CONVNEXT_NORM: "convnext.{bid}.norm", + MODEL_TENSOR.CONVNEXT_PW1: "convnext.{bid}.pw1", + MODEL_TENSOR.CONVNEXT_PW2: "convnext.{bid}.pw2", + MODEL_TENSOR.CONVNEXT_GAMMA: "convnext.{bid}.gamma", + MODEL_TENSOR.POSNET_CONV1: "posnet.{bid}.conv1", + MODEL_TENSOR.POSNET_CONV2: "posnet.{bid}.conv2", + MODEL_TENSOR.POSNET_NORM: "posnet.{bid}.norm", + MODEL_TENSOR.POSNET_NORM1: "posnet.{bid}.norm1", + MODEL_TENSOR.POSNET_NORM2: "posnet.{bid}.norm2", + MODEL_TENSOR.POSNET_ATTN_NORM: "posnet.{bid}.attn_norm", + MODEL_TENSOR.POSNET_ATTN_Q: "posnet.{bid}.attn_q", + MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k", + MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v", + MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -510,6 +617,26 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.DECI: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_GATE_INP, + 
MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + ], MODEL_ARCH.GROK: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -596,6 +723,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, MODEL_TENSOR.LAYER_OUT_NORM, + MODEL_TENSOR.CLS, + MODEL_TENSOR.CLS_OUT, ], MODEL_ARCH.NOMIC_BERT: [ MODEL_TENSOR.TOKEN_EMBD, @@ -627,6 +756,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.LAYER_OUT_NORM, + MODEL_TENSOR.CLS, ], MODEL_ARCH.MPT: [ MODEL_TENSOR.TOKEN_EMBD, @@ -713,6 +843,21 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_UP, ], MODEL_ARCH.QWEN2: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.QWEN2VL: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT, @@ -790,6 +935,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FACTORS_LONG, + MODEL_TENSOR.ROPE_FACTORS_SHORT, MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_QKV, MODEL_TENSOR.ATTN_Q, @@ -849,6 +996,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.OUTPUT, MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ROPE_FACTORS_LONG, + MODEL_TENSOR.ROPE_FACTORS_SHORT, MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, @@ -868,6 +1017,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FACTORS_LONG, + MODEL_TENSOR.ROPE_FACTORS_SHORT, MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_Q_A, MODEL_TENSOR.ATTN_Q_B, @@ -968,51 +1119,37 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.SSM_D, MODEL_TENSOR.SSM_OUT, ], - MODEL_ARCH.JAMBA: [ + MODEL_ARCH.XVERSE: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, MODEL_TENSOR.ATTN_V, MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.SSM_IN, - MODEL_TENSOR.SSM_CONV1D, - MODEL_TENSOR.SSM_X, - MODEL_TENSOR.SSM_DT, - MODEL_TENSOR.SSM_DT_NORM, - MODEL_TENSOR.SSM_A, - MODEL_TENSOR.SSM_B_NORM, - MODEL_TENSOR.SSM_C_NORM, - MODEL_TENSOR.SSM_D, - MODEL_TENSOR.SSM_OUT, - MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.ATTN_ROT_EMBD, MODEL_TENSOR.FFN_NORM, MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, - MODEL_TENSOR.FFN_GATE_EXP, - MODEL_TENSOR.FFN_DOWN_EXP, - MODEL_TENSOR.FFN_UP_EXP, ], - MODEL_ARCH.XVERSE: [ + MODEL_ARCH.COMMAND_R: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, MODEL_TENSOR.ATTN_V, MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.ATTN_ROT_EMBD, - MODEL_TENSOR.FFN_NORM, MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_Q_NORM, ], - MODEL_ARCH.COMMAND_R: [ + MODEL_ARCH.COHERE2: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.ATTN_NORM, @@ -1023,8 +1160,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, - MODEL_TENSOR.ATTN_K_NORM, - MODEL_TENSOR.ATTN_Q_NORM, ], MODEL_ARCH.DBRX: [ 
MODEL_TENSOR.TOKEN_EMBD, @@ -1050,6 +1185,22 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.OLMO2: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.FFN_POST_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], MODEL_ARCH.OLMOE: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -1101,6 +1252,29 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.DEEPSEEK: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_UP_SHEXP, + ], MODEL_ARCH.DEEPSEEK2: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -1127,6 +1301,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_GATE_SHEXP, MODEL_TENSOR.FFN_DOWN_SHEXP, MODEL_TENSOR.FFN_UP_SHEXP, + MODEL_TENSOR.FFN_EXP_PROBS_B, ], MODEL_ARCH.CHATGLM: [ MODEL_TENSOR.TOKEN_EMBD, @@ -1248,6 +1423,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.GRANITE: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, @@ -1258,13 +1434,72 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.GRANITE_MOE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + ], + MODEL_ARCH.CHAMELEON: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.WAVTOKENIZER_DEC: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.TOKEN_EMBD_NORM, + MODEL_TENSOR.CONV1D, + MODEL_TENSOR.CONVNEXT_DW, + MODEL_TENSOR.CONVNEXT_NORM, + MODEL_TENSOR.CONVNEXT_PW1, + MODEL_TENSOR.CONVNEXT_PW2, + MODEL_TENSOR.CONVNEXT_GAMMA, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.POSNET_CONV1, + MODEL_TENSOR.POSNET_CONV2, + MODEL_TENSOR.POSNET_NORM, + MODEL_TENSOR.POSNET_NORM1, + MODEL_TENSOR.POSNET_NORM2, + MODEL_TENSOR.POSNET_ATTN_NORM, + MODEL_TENSOR.POSNET_ATTN_Q, + MODEL_TENSOR.POSNET_ATTN_K, + MODEL_TENSOR.POSNET_ATTN_V, + MODEL_TENSOR.POSNET_ATTN_OUT, + ], + # TODO } +# tensors that will not be serialized MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_ARCH.LLAMA: [ MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_ROT_EMBD, ], + MODEL_ARCH.DECI: [ + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_ROT_EMBD, + ], MODEL_ARCH.BAICHUAN: [ 
MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_ROT_EMBD, @@ -1289,6 +1524,10 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_ROT_EMBD, ], + MODEL_ARCH.DEEPSEEK: [ + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_ROT_EMBD, + ], MODEL_ARCH.DEEPSEEK2: [ MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_ROT_EMBD, @@ -1302,6 +1541,10 @@ class MODEL_TENSOR(IntEnum): ], } +# +# types +# + class TokenType(IntEnum): NORMAL = 1 @@ -1316,6 +1559,7 @@ class RopeScalingType(Enum): NONE = "none" LINEAR = "linear" YARN = "yarn" + LONGROPE = "longrope" class PoolingType(IntEnum): @@ -1354,52 +1598,61 @@ class GGMLQuantizationType(IntEnum): F64 = 28 IQ1_M = 29 BF16 = 30 - Q4_0_4_4 = 31 - Q4_0_4_8 = 32 - Q4_0_8_8 = 33 TQ1_0 = 34 TQ2_0 = 35 +class ExpertGatingFuncType(IntEnum): + SOFTMAX = 1 + SIGMOID = 2 + + +# TODO: add GGMLFileType from ggml_ftype in ggml.h + + +# from llama_ftype in llama.h +# ALL VALUES SHOULD BE THE SAME HERE AS THEY ARE OVER THERE. class LlamaFileType(IntEnum): ALL_F32 = 0 - MOSTLY_F16 = 1 - MOSTLY_Q4_0 = 2 - MOSTLY_Q4_1 = 3 - - MOSTLY_Q8_0 = 7 - MOSTLY_Q5_0 = 8 - MOSTLY_Q5_1 = 9 - MOSTLY_Q2_K = 10 - MOSTLY_Q3_K_S = 11 - MOSTLY_Q3_K_M = 12 - MOSTLY_Q3_K_L = 13 - MOSTLY_Q4_K_S = 14 - MOSTLY_Q4_K_M = 15 - MOSTLY_Q5_K_S = 16 - MOSTLY_Q5_K_M = 17 - MOSTLY_Q6_K = 18 - MOSTLY_IQ2_XXS = 19 - MOSTLY_IQ2_XS = 20 - MOSTLY_Q2_K_S = 21 - MOSTLY_IQ3_XS = 22 - MOSTLY_IQ3_XXS = 23 - MOSTLY_IQ1_S = 24 - MOSTLY_IQ4_NL = 25 - MOSTLY_IQ3_S = 26 - MOSTLY_IQ3_M = 27 - MOSTLY_IQ2_S = 28 - MOSTLY_IQ2_M = 29 - MOSTLY_IQ4_XS = 30 - MOSTLY_IQ1_M = 31 - MOSTLY_BF16 = 32 - MOSTLY_Q4_0_4_4 = 33 - MOSTLY_Q4_0_4_8 = 34 - MOSTLY_Q4_0_8_8 = 35 - MOSTLY_TQ1_0 = 36 - MOSTLY_TQ2_0 = 37 - - GUESSED = 1024 + MOSTLY_F16 = 1 # except 1d tensors + MOSTLY_Q4_0 = 2 # except 1d tensors + MOSTLY_Q4_1 = 3 # except 1d tensors + # MOSTLY_Q4_1_SOME_F16 = 4 # tok_embeddings.weight and output.weight are F16 + # MOSTLY_Q4_2 = 5 # support has been removed + # MOSTLY_Q4_3 = 6 # support has been removed + MOSTLY_Q8_0 = 7 # except 1d tensors + MOSTLY_Q5_0 = 8 # except 1d tensors + MOSTLY_Q5_1 = 9 # except 1d tensors + MOSTLY_Q2_K = 10 # except 1d tensors + MOSTLY_Q3_K_S = 11 # except 1d tensors + MOSTLY_Q3_K_M = 12 # except 1d tensors + MOSTLY_Q3_K_L = 13 # except 1d tensors + MOSTLY_Q4_K_S = 14 # except 1d tensors + MOSTLY_Q4_K_M = 15 # except 1d tensors + MOSTLY_Q5_K_S = 16 # except 1d tensors + MOSTLY_Q5_K_M = 17 # except 1d tensors + MOSTLY_Q6_K = 18 # except 1d tensors + MOSTLY_IQ2_XXS = 19 # except 1d tensors + MOSTLY_IQ2_XS = 20 # except 1d tensors + MOSTLY_Q2_K_S = 21 # except 1d tensors + MOSTLY_IQ3_XS = 22 # except 1d tensors + MOSTLY_IQ3_XXS = 23 # except 1d tensors + MOSTLY_IQ1_S = 24 # except 1d tensors + MOSTLY_IQ4_NL = 25 # except 1d tensors + MOSTLY_IQ3_S = 26 # except 1d tensors + MOSTLY_IQ3_M = 27 # except 1d tensors + MOSTLY_IQ2_S = 28 # except 1d tensors + MOSTLY_IQ2_M = 29 # except 1d tensors + MOSTLY_IQ4_XS = 30 # except 1d tensors + MOSTLY_IQ1_M = 31 # except 1d tensors + MOSTLY_BF16 = 32 # except 1d tensors + # MOSTLY_Q4_0_4_4 = 33 # removed from gguf files, use Q4_0 and runtime repack + # MOSTLY_Q4_0_4_8 = 34 # removed from gguf files, use Q4_0 and runtime repack + # MOSTLY_Q4_0_8_8 = 35 # removed from gguf files, use Q4_0 and runtime repack + MOSTLY_TQ1_0 = 36 # except 1d tensors + MOSTLY_TQ2_0 = 37 # except 1d tensors + + GUESSED = 1024 # not specified in the model file class GGUFEndian(IntEnum): @@ -1434,11 +1687,12 @@ def get_type(val: Any) -> GGUFValueType: return GGUFValueType.BOOL elif 
isinstance(val, int): return GGUFValueType.INT32 - + # TODO: need help with 64-bit types in Python else: raise ValueError(f"Unknown type: {type(val)}") +# Items here are (block size, type size) QK_K = 256 GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = { GGMLQuantizationType.F32: (1, 4), @@ -1470,13 +1724,14 @@ def get_type(val: Any) -> GGUFValueType: GGMLQuantizationType.F64: (1, 8), GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32), GGMLQuantizationType.BF16: (1, 2), - GGMLQuantizationType.Q4_0_4_4: (32, 2 + 16), - GGMLQuantizationType.Q4_0_4_8: (32, 2 + 16), - GGMLQuantizationType.Q4_0_8_8: (32, 2 + 16), GGMLQuantizationType.TQ1_0: (256, 2 + 4 * 13), GGMLQuantizationType.TQ2_0: (256, 2 + 64), } + +# Aliases for backward compatibility. + +# general KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE KEY_GENERAL_QUANTIZATION_VERSION = Keys.General.QUANTIZATION_VERSION KEY_GENERAL_ALIGNMENT = Keys.General.ALIGNMENT @@ -1488,6 +1743,7 @@ def get_type(val: Any) -> GGUFValueType: KEY_GENERAL_SOURCE_URL = Keys.General.SOURCE_URL KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE +# LLM KEY_VOCAB_SIZE = Keys.LLM.VOCAB_SIZE KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH @@ -1496,6 +1752,7 @@ def get_type(val: Any) -> GGUFValueType: KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL KEY_TENSOR_DATA_LAYOUT = Keys.LLM.TENSOR_DATA_LAYOUT +# attention KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT KEY_ATTENTION_HEAD_COUNT_KV = Keys.Attention.HEAD_COUNT_KV KEY_ATTENTION_MAX_ALIBI_BIAS = Keys.Attention.MAX_ALIBI_BIAS @@ -1503,6 +1760,7 @@ def get_type(val: Any) -> GGUFValueType: KEY_ATTENTION_LAYERNORM_EPS = Keys.Attention.LAYERNORM_EPS KEY_ATTENTION_LAYERNORM_RMS_EPS = Keys.Attention.LAYERNORM_RMS_EPS +# RoPE KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE @@ -1510,12 +1768,14 @@ def get_type(val: Any) -> GGUFValueType: KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED +# SSM KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK KEY_SSM_DT_B_C_RMS = Keys.SSM.DT_B_C_RMS +# tokenization KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL KEY_TOKENIZER_PRE = Keys.Tokenizer.PRE KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST @@ -1524,6 +1784,8 @@ def get_type(val: Any) -> GGUFValueType: KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID +KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID +KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID @@ -1531,8 +1793,15 @@ def get_type(val: Any) -> GGUFValueType: KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV -KEY_TOKENIZER_PRIFIX_ID = Keys.Tokenizer.PREFIX_ID + +KEY_TOKENIZER_FIM_PRE_ID = Keys.Tokenizer.FIM_PRE_ID +KEY_TOKENIZER_FIM_SUF_ID = Keys.Tokenizer.FIM_SUF_ID +KEY_TOKENIZER_FIM_MID_ID = Keys.Tokenizer.FIM_MID_ID +KEY_TOKENIZER_FIM_PAD_ID = Keys.Tokenizer.FIM_PAD_ID +KEY_TOKENIZER_FIM_REP_ID = Keys.Tokenizer.FIM_REP_ID +KEY_TOKENIZER_FIM_SEP_ID = Keys.Tokenizer.FIM_SEP_ID + +# deprecated +KEY_TOKENIZER_PREFIX_ID = 
Keys.Tokenizer.PREFIX_ID KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID -KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID -KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID diff --git a/src/gguf/gguf_reader.py b/src/gguf/gguf_reader.py index a5fc908..962c43e 100644 --- a/src/gguf/gguf_reader.py +++ b/src/gguf/gguf_reader.py @@ -169,11 +169,10 @@ def _get( count = int(count) itemsize = int(np.empty([], dtype=dtype).itemsize) end_offs = offset + itemsize * count - return ( - self.data[offset:end_offs] - .view(dtype=dtype)[:count] - .newbyteorder(override_order or self.byte_order) - ) + arr = self.data[offset:end_offs].view(dtype=dtype)[:count] + if override_order is None: + return arr + return arr.view(arr.dtype.newbyteorder(override_order)) def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int: if field.name in self.fields: diff --git a/src/gguf/gguf_writer.py b/src/gguf/gguf_writer.py index a8754cf..267ea6c 100644 --- a/src/gguf/gguf_writer.py +++ b/src/gguf/gguf_writer.py @@ -26,12 +26,14 @@ RopeScalingType, PoolingType, TokenType, + ExpertGatingFuncType, ) from .quants import quant_shape_from_byte_shape logger = logging.getLogger(__name__) + SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf" @@ -135,7 +137,7 @@ def get_total_parameter_count(self) -> tuple[int, int, int, int]: continue elif name.endswith(".lora_b"): if last_lora_a is None or last_lora_a[0] != name[:-1] + "a": - + # Bail when the LoRA pair can't be found trivially logger.warning( "can't measure LoRA size correctly, tensor order is unusual" ) @@ -154,11 +156,14 @@ def get_total_parameter_count(self) -> tuple[int, int, int, int]: total_params += size + # Hopefully this should work even for variable-expert-count models expert_count = (expert_sum // n_expert_tensors) if n_expert_tensors > 0 else 0 + # Negate the total to signal it's likely not exact if last_lora_a is not None: total_params = -total_params + # NOTE: keep the output in the same order as accepted by 'size_label' in gguf-py/gguf/utility.py return total_params, shared_params, expert_params, expert_count def format_shard_names(self, path: Path) -> list[Path]: @@ -177,7 +182,7 @@ def open_output_file(self, path: Path | None = None) -> None: and self.fout is not None and (path is None or path == self.path) ): - + # allow calling this multiple times as long as the path is the same return if self.state is not WriterState.NO_FILE: @@ -206,7 +211,7 @@ def print_plan(self) -> list[Path]: if self.dry_run: logger.info("Dry run, not writing files") for name in filenames: - print(name) + print(name) # noqa: NP100 exit() return filenames @@ -390,11 +395,12 @@ def add_tensor_info( if tensor_dtype == np.uint8: tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype) + # make sure there is at least one tensor before splitting if len(self.tensors[-1]) > 0: - if ( + if ( # split when over tensor limit self.split_max_tensors != 0 and len(self.tensors[-1]) >= self.split_max_tensors - ) or ( + ) or ( # split when over size limit self.split_max_size != 0 and sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes > self.split_max_size @@ -460,6 +466,8 @@ def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None: fout = self.fout[file_id] + # pop the first tensor info + # TODO: cleaner way to get the first key first_tensor_name = [ name for name, _ in zip(self.tensors[file_id].keys(), range(1)) ][0] @@ -506,8 +514,11 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None: total = 
sum(ti.nbytes for ti in tensors.values()) shard_bar.reset(total=(total if total > 0 else None)) + # relying on the fact that Python dicts preserve insertion order (since 3.7) for ti in tensors.values(): - assert ti.tensor is not None + assert ( + ti.tensor is not None + ) # can only iterate once over the tensors assert ti.tensor.nbytes == ti.nbytes ti.tensor.tofile(fout) if shard_bar is not None: @@ -631,6 +642,11 @@ def add_base_model_organization(self, source_id: int, organization: str) -> None Keys.General.BASE_MODEL_ORGANIZATION.format(id=source_id), organization ) + def add_base_model_description(self, source_id: int, description: str) -> None: + self.add_string( + Keys.General.BASE_MODEL_DESCRIPTION.format(id=source_id), description + ) + def add_base_model_url(self, source_id: int, url: str) -> None: self.add_string(Keys.General.BASE_MODEL_URL.format(id=source_id), url) @@ -643,15 +659,46 @@ def add_base_model_uuid(self, source_id: int, uuid: str) -> None: def add_base_model_repo_url(self, source_id: int, repo_url: str) -> None: self.add_string(Keys.General.BASE_MODEL_REPO_URL.format(id=source_id), repo_url) + def add_dataset_count(self, source_count: int) -> None: + self.add_uint32(Keys.General.DATASET_COUNT, source_count) + + def add_dataset_name(self, source_id: int, name: str) -> None: + self.add_string(Keys.General.DATASET_NAME.format(id=source_id), name) + + def add_dataset_author(self, source_id: int, author: str) -> None: + self.add_string(Keys.General.DATASET_AUTHOR.format(id=source_id), author) + + def add_dataset_version(self, source_id: int, version: str) -> None: + self.add_string(Keys.General.DATASET_VERSION.format(id=source_id), version) + + def add_dataset_organization(self, source_id: int, organization: str) -> None: + self.add_string( + Keys.General.DATASET_ORGANIZATION.format(id=source_id), organization + ) + + def add_dataset_description(self, source_id: int, description: str) -> None: + self.add_string( + Keys.General.DATASET_DESCRIPTION.format(id=source_id), description + ) + + def add_dataset_url(self, source_id: int, url: str) -> None: + self.add_string(Keys.General.DATASET_URL.format(id=source_id), url) + + def add_dataset_doi(self, source_id: int, doi: str) -> None: + self.add_string(Keys.General.DATASET_DOI.format(id=source_id), doi) + + def add_dataset_uuid(self, source_id: int, uuid: str) -> None: + self.add_string(Keys.General.DATASET_UUID.format(id=source_id), uuid) + + def add_dataset_repo_url(self, source_id: int, repo_url: str) -> None: + self.add_string(Keys.General.DATASET_REPO_URL.format(id=source_id), repo_url) + def add_tags(self, tags: Sequence[str]) -> None: self.add_array(Keys.General.TAGS, tags) def add_languages(self, languages: Sequence[str]) -> None: self.add_array(Keys.General.LANGUAGES, languages) - def add_datasets(self, datasets: Sequence[str]) -> None: - self.add_array(Keys.General.DATASETS, datasets) - def add_tensor_data_layout(self, layout: str) -> None: self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout) @@ -664,6 +711,21 @@ def add_context_length(self, length: int) -> None: def add_embedding_length(self, length: int) -> None: self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length) + def add_features_length(self, length: int) -> None: + self.add_uint32(Keys.LLM.FEATURES_LENGTH.format(arch=self.arch), length) + + def add_posnet_embedding_length(self, length: int) -> None: + self.add_uint32(Keys.PosNet.EMBEDDING_LENGTH.format(arch=self.arch), length) + + def add_posnet_block_count(self, 
length: int) -> None: + self.add_uint32(Keys.PosNet.BLOCK_COUNT.format(arch=self.arch), length) + + def add_convnext_embedding_length(self, length: int) -> None: + self.add_uint32(Keys.ConvNext.EMBEDDING_LENGTH.format(arch=self.arch), length) + + def add_convnext_block_count(self, length: int) -> None: + self.add_uint32(Keys.ConvNext.BLOCK_COUNT.format(arch=self.arch), length) + def add_block_count(self, length: int) -> None: self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length) @@ -739,6 +801,15 @@ def add_expert_shared_count(self, count: int) -> None: def add_expert_weights_scale(self, value: float) -> None: self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value) + def add_expert_weights_norm(self, value: bool) -> None: + self.add_bool(Keys.LLM.EXPERT_WEIGHTS_NORM.format(arch=self.arch), value) + + def add_expert_gating_func(self, value: ExpertGatingFuncType) -> None: + self.add_uint32(Keys.LLM.EXPERT_GATING_FUNC.format(arch=self.arch), value.value) + + def add_swin_norm(self, value: bool) -> None: + self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value) + def add_rescale_every_n_layers(self, count: int) -> None: self.add_uint32(Keys.LLM.RESCALE_EVERY_N_LAYERS.format(arch=self.arch), count) @@ -763,6 +834,12 @@ def add_layer_norm_eps(self, value: float) -> None: def add_layer_norm_rms_eps(self, value: float) -> None: self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value) + def add_group_norm_eps(self, value: float) -> None: + self.add_float32(Keys.Attention.GROUPNORM_EPS.format(arch=self.arch), value) + + def add_group_norm_groups(self, value: int) -> None: + self.add_uint32(Keys.Attention.GROUPNORM_GROUPS.format(arch=self.arch), value) + def add_causal_attention(self, value: bool) -> None: self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value) @@ -787,6 +864,9 @@ def add_pooling_type(self, value: PoolingType) -> None: def add_rope_dimension_count(self, count: int) -> None: self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count) + def add_rope_dimension_sections(self, dims: Sequence[int]) -> None: + self.add_array(Keys.Rope.DIMENSION_SECTIONS.format(arch=self.arch), dims) + def add_rope_freq_base(self, value: float) -> None: self.add_float32(Keys.Rope.FREQ_BASE.format(arch=self.arch), value) @@ -893,6 +973,7 @@ def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None: name = choice.get("name", "") template = choice.get("template") + # Allowing non-alphanumerical characters in template name is probably not a good idea, so filter it name = "".join( (c if c in ascii_letters + digits else "_" for c in name) ) @@ -916,15 +997,6 @@ def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None: self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value) - def add_prefix_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.PREFIX_ID, id) - - def add_suffix_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.SUFFIX_ID, id) - - def add_middle_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.MIDDLE_ID, id) - def add_eot_token_id(self, id: int) -> None: self.add_uint32(Keys.Tokenizer.EOT_ID, id) diff --git a/src/gguf/lazy.py b/src/gguf/lazy.py index 831e3dc..f3273f5 100644 --- a/src/gguf/lazy.py +++ b/src/gguf/lazy.py @@ -12,6 +12,7 @@ class LazyMeta(ABCMeta): + def __new__( cls, name: str, bases: tuple[type, ...], namespace: dict[str, Any], **kwargs ): @@ -34,7 +35,7 @@ def __getattr__(self, name: str) -> Any: # need to make a 
builder for the wrapped wrapper to copy the name, # or else it fails with very cryptic error messages, - # because somehow the same string would end up in every closure + # because somehow the same string would end up in every closures def mk_wrap(op_name: str, *, meta_noop: bool = False): # need to wrap the wrapper to get self def wrapped_special_op(self, *args, **kwargs): @@ -254,6 +255,8 @@ def from_eager(cls, t: Any) -> Any: class LazyNumpyTensor(LazyBase): _tensor_type = np.ndarray + shape: tuple[int, ...] # Makes the type checker happy in quants.py + @classmethod def meta_with_dtype_and_shape( cls, dtype: DTypeLike, shape: tuple[int, ...] diff --git a/src/gguf/metadata.py b/src/gguf/metadata.py index 6d39f5a..c9046eb 100644 --- a/src/gguf/metadata.py +++ b/src/gguf/metadata.py @@ -41,7 +41,7 @@ class Metadata: base_models: Optional[list[dict]] = None tags: Optional[list[str]] = None languages: Optional[list[str]] = None - datasets: Optional[list[str]] = None + datasets: Optional[list[dict]] = None @staticmethod def load( @@ -50,7 +50,7 @@ def load( model_name: Optional[str] = None, total_params: int = 0, ) -> Metadata: - # This grabs as much contextual authorship metadata as possible from the model repository + # This grabs as many contextual authorship metadata as possible from the model repository # making any conversion as required to match the gguf kv store metadata format # as well as giving users the ability to override any authorship metadata that may be incorrect @@ -126,13 +126,13 @@ def load( "general.base_models", metadata.base_models ) + # Datasets is received here as an array of datasets + metadata.datasets = metadata_override.get("general.datasets", metadata.datasets) + metadata.tags = metadata_override.get(Keys.General.TAGS, metadata.tags) metadata.languages = metadata_override.get( Keys.General.LANGUAGES, metadata.languages ) - metadata.datasets = metadata_override.get( - Keys.General.DATASETS, metadata.datasets - ) # Direct Metadata Override (via direct cli argument) if model_name is not None: @@ -228,7 +228,11 @@ def get_model_id_components( org_component, model_full_name_component = None, model_id # Check if we erroneously matched against './' or '../' etc... - if org_component is not None and org_component[0] == ".": + if ( + org_component is not None + and len(org_component) > 0 + and org_component[0] == "." 
+ ): org_component = None name_parts: list[str] = model_full_name_component.split("-") @@ -387,27 +391,86 @@ def apply_metadata_heuristic( ######################## if model_card is not None: - if "model_name" in model_card and metadata.name is None: - # Not part of huggingface model card standard but notice some model creator using it - # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF' - metadata.name = model_card.get("model_name") + def use_model_card_metadata(metadata_key: str, model_card_key: str): + if ( + model_card_key in model_card + and getattr(metadata, metadata_key, None) is None + ): + setattr(metadata, metadata_key, model_card.get(model_card_key)) - if "model_creator" in model_card and metadata.author is None: - # Not part of huggingface model card standard but notice some model creator using it - # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF' - metadata.author = model_card.get("model_creator") + def use_array_model_card_metadata(metadata_key: str, model_card_key: str): + # Note: Will append rather than replace if already exist + tags_value = model_card.get(model_card_key, None) + if tags_value is None: + return - if "model_type" in model_card and metadata.basename is None: - # Not part of huggingface model card standard but notice some model creator using it - # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF' - metadata.basename = model_card.get("model_type") + current_value = getattr(metadata, metadata_key, None) + if current_value is None: + current_value = [] - if "base_model" in model_card: + if isinstance(tags_value, str): + current_value.append(tags_value) + elif isinstance(tags_value, list): + current_value.extend(tags_value) + + setattr(metadata, metadata_key, current_value) + + # LLAMA.cpp's direct internal convention + # (Definitely not part of hugging face formal/informal standard) + ######################################### + use_model_card_metadata("name", "name") + use_model_card_metadata("author", "author") + use_model_card_metadata("version", "version") + use_model_card_metadata("organization", "organization") + use_model_card_metadata("description", "description") + use_model_card_metadata("finetune", "finetune") + use_model_card_metadata("basename", "basename") + use_model_card_metadata("size_label", "size_label") + use_model_card_metadata("source_url", "url") + use_model_card_metadata("source_doi", "doi") + use_model_card_metadata("source_uuid", "uuid") + use_model_card_metadata("source_repo_url", "repo_url") + + # LLAMA.cpp's huggingface style convention + # (Definitely not part of hugging face formal/informal standard... 
but with model_ appended to match their style) + ########################################### + use_model_card_metadata("name", "model_name") + use_model_card_metadata("author", "model_author") + use_model_card_metadata("version", "model_version") + use_model_card_metadata("organization", "model_organization") + use_model_card_metadata("description", "model_description") + use_model_card_metadata("finetune", "model_finetune") + use_model_card_metadata("basename", "model_basename") + use_model_card_metadata("size_label", "model_size_label") + use_model_card_metadata("source_url", "model_url") + use_model_card_metadata("source_doi", "model_doi") + use_model_card_metadata("source_uuid", "model_uuid") + use_model_card_metadata("source_repo_url", "model_repo_url") + + # Hugging Face Direct Convention + ################################# + + # Not part of huggingface model card standard but notice some model creator using it + # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF' + use_model_card_metadata("name", "model_name") + use_model_card_metadata("author", "model_creator") + use_model_card_metadata("basename", "model_type") + + if ( + "base_model" in model_card + or "base_models" in model_card + or "base_model_sources" in model_card + ): # This represents the parent models that this is based on # Example: stabilityai/stable-diffusion-xl-base-1.0. Can also be a list (for merges) # Example of merges: https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0.1/blob/main/README.md metadata_base_models = [] - base_model_value = model_card.get("base_model", None) + base_model_value = model_card.get( + "base_model", + model_card.get( + "base_models", model_card.get("base_model_sources", None) + ), + ) if base_model_value is not None: if isinstance(base_model_value, str): @@ -420,86 +483,195 @@ def apply_metadata_heuristic( for model_id in metadata_base_models: # NOTE: model size of base model is assumed to be similar to the size of the current model - ( - model_full_name_component, - org_component, - basename, - finetune, - version, - size_label, - ) = Metadata.get_model_id_components(model_id, total_params) base_model = {} - if model_full_name_component is not None: - base_model["name"] = Metadata.id_to_title( - model_full_name_component - ) - if org_component is not None: - base_model["organization"] = Metadata.id_to_title(org_component) - if version is not None: - base_model["version"] = version - if ( - org_component is not None - and model_full_name_component is not None - ): - base_model["repo_url"] = ( - f"https://huggingface.co/{org_component}/{model_full_name_component}" + if isinstance(model_id, str): + if ( + model_id.startswith("http://") + or model_id.startswith("https://") + or model_id.startswith("ssh://") + ): + base_model["repo_url"] = model_id + + # Check if Hugging Face ID is present in URL + if "huggingface.co" in model_id: + match = re.match( + r"https?://huggingface.co/([^/]+/[^/]+)$", model_id + ) + if match: + model_id_component = match.group(1) + ( + model_full_name_component, + org_component, + basename, + finetune, + version, + size_label, + ) = Metadata.get_model_id_components( + model_id_component, total_params + ) + + # Populate model dictionary with extracted components + if model_full_name_component is not None: + base_model["name"] = Metadata.id_to_title( + model_full_name_component + ) + if org_component is not None: + base_model["organization"] = ( + Metadata.id_to_title(org_component) + ) + if version is not None: + base_model["version"] = version + + 
else: + # Likely a Hugging Face ID + ( + model_full_name_component, + org_component, + basename, + finetune, + version, + size_label, + ) = Metadata.get_model_id_components(model_id, total_params) + + # Populate model dictionary with extracted components + if model_full_name_component is not None: + base_model["name"] = Metadata.id_to_title( + model_full_name_component + ) + if org_component is not None: + base_model["organization"] = Metadata.id_to_title( + org_component + ) + if version is not None: + base_model["version"] = version + if ( + org_component is not None + and model_full_name_component is not None + ): + base_model["repo_url"] = ( + f"https://huggingface.co/{org_component}/{model_full_name_component}" + ) + + elif isinstance(model_id, dict): + base_model = model_id + + else: + logger.error( + f"base model entry '{str(model_id)}' not in a known format" ) - metadata.base_models.append(base_model) - - if "license" in model_card and metadata.license is None: - metadata.license = model_card.get("license") - - if "license_name" in model_card and metadata.license_name is None: - metadata.license_name = model_card.get("license_name") - - if "license_link" in model_card and metadata.license_link is None: - metadata.license_link = model_card.get("license_link") - - tags_value = model_card.get("tags", None) - if tags_value is not None: - - if metadata.tags is None: - metadata.tags = [] - - if isinstance(tags_value, str): - metadata.tags.append(tags_value) - elif isinstance(tags_value, list): - metadata.tags.extend(tags_value) - pipeline_tags_value = model_card.get("pipeline_tag", None) - if pipeline_tags_value is not None: + metadata.base_models.append(base_model) - if metadata.tags is None: - metadata.tags = [] + if ( + "datasets" in model_card + or "dataset" in model_card + or "dataset_sources" in model_card + ): + # This represents the datasets that this was trained from + metadata_datasets = [] + dataset_value = model_card.get( + "datasets", + model_card.get("dataset", model_card.get("dataset_sources", None)), + ) - if isinstance(pipeline_tags_value, str): - metadata.tags.append(pipeline_tags_value) - elif isinstance(pipeline_tags_value, list): - metadata.tags.extend(pipeline_tags_value) + if dataset_value is not None: + if isinstance(dataset_value, str): + metadata_datasets.append(dataset_value) + elif isinstance(dataset_value, list): + metadata_datasets.extend(dataset_value) - language_value = model_card.get( - "languages", model_card.get("language", None) - ) - if language_value is not None: + if metadata.datasets is None: + metadata.datasets = [] - if metadata.languages is None: - metadata.languages = [] + for dataset_id in metadata_datasets: + # NOTE: model size of base model is assumed to be similar to the size of the current model + dataset = {} + if isinstance(dataset_id, str): + if dataset_id.startswith(("http://", "https://", "ssh://")): + dataset["repo_url"] = dataset_id + + # Check if Hugging Face ID is present in URL + if "huggingface.co" in dataset_id: + match = re.match( + r"https?://huggingface.co/([^/]+/[^/]+)$", + dataset_id, + ) + if match: + dataset_id_component = match.group(1) + ( + dataset_name_component, + org_component, + basename, + finetune, + version, + size_label, + ) = Metadata.get_model_id_components( + dataset_id_component, total_params + ) + + # Populate dataset dictionary with extracted components + if dataset_name_component is not None: + dataset["name"] = Metadata.id_to_title( + dataset_name_component + ) + if org_component is not None: + 
dataset["organization"] = Metadata.id_to_title( + org_component + ) + if version is not None: + dataset["version"] = version + + else: + # Likely a Hugging Face ID + ( + dataset_name_component, + org_component, + basename, + finetune, + version, + size_label, + ) = Metadata.get_model_id_components( + dataset_id, total_params + ) + + # Populate dataset dictionary with extracted components + if dataset_name_component is not None: + dataset["name"] = Metadata.id_to_title( + dataset_name_component + ) + if org_component is not None: + dataset["organization"] = Metadata.id_to_title( + org_component + ) + if version is not None: + dataset["version"] = version + if ( + org_component is not None + and dataset_name_component is not None + ): + dataset["repo_url"] = ( + f"https://huggingface.co/{org_component}/{dataset_name_component}" + ) + + elif isinstance(dataset_id, dict): + dataset = dataset_id + + else: + logger.error( + f"dataset entry '{str(dataset_id)}' not in a known format" + ) - if isinstance(language_value, str): - metadata.languages.append(language_value) - elif isinstance(language_value, list): - metadata.languages.extend(language_value) + metadata.datasets.append(dataset) - dataset_value = model_card.get("datasets", model_card.get("dataset", None)) - if dataset_value is not None: + use_model_card_metadata("license", "license") + use_model_card_metadata("license_name", "license_name") + use_model_card_metadata("license_link", "license_link") - if metadata.datasets is None: - metadata.datasets = [] + use_array_model_card_metadata("tags", "tags") + use_array_model_card_metadata("tags", "pipeline_tag") - if isinstance(dataset_value, str): - metadata.datasets.append(dataset_value) - elif isinstance(dataset_value, list): - metadata.datasets.extend(dataset_value) + use_array_model_card_metadata("languages", "languages") + use_array_model_card_metadata("languages", "language") # Hugging Face Parameter Heuristics #################################### @@ -508,7 +680,7 @@ def apply_metadata_heuristic( hf_name_or_path = hf_params.get("_name_or_path") if hf_name_or_path is not None and hf_name_or_path.count("/") <= 1: - # Use _name_or_path only if it's actually a model name and not some computer path + # Use _name_or_path only if its actually a model name and not some computer path # e.g. 
'meta-llama/Llama-2-7b-hf' model_id = hf_name_or_path ( @@ -584,7 +756,10 @@ def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter): gguf_writer.add_size_label(self.size_label) if self.license is not None: - gguf_writer.add_license(self.license) + if isinstance(self.license, list): + gguf_writer.add_license(",".join(self.license)) + else: + gguf_writer.add_license(self.license) if self.license_name is not None: gguf_writer.add_license_name(self.license_name) if self.license_link is not None: @@ -621,6 +796,10 @@ def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter): gguf_writer.add_base_model_organization( key, base_model_entry["organization"] ) + if "description" in base_model_entry: + gguf_writer.add_base_model_description( + key, base_model_entry["description"] + ) if "url" in base_model_entry: gguf_writer.add_base_model_url(key, base_model_entry["url"]) if "doi" in base_model_entry: @@ -632,9 +811,33 @@ def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter): key, base_model_entry["repo_url"] ) + if self.datasets is not None: + gguf_writer.add_dataset_count(len(self.datasets)) + for key, dataset_entry in enumerate(self.datasets): + if "name" in dataset_entry: + gguf_writer.add_dataset_name(key, dataset_entry["name"]) + if "author" in dataset_entry: + gguf_writer.add_dataset_author(key, dataset_entry["author"]) + if "version" in dataset_entry: + gguf_writer.add_dataset_version(key, dataset_entry["version"]) + if "organization" in dataset_entry: + gguf_writer.add_dataset_organization( + key, dataset_entry["organization"] + ) + if "description" in dataset_entry: + gguf_writer.add_dataset_description( + key, dataset_entry["description"] + ) + if "url" in dataset_entry: + gguf_writer.add_dataset_url(key, dataset_entry["url"]) + if "doi" in dataset_entry: + gguf_writer.add_dataset_doi(key, dataset_entry["doi"]) + if "uuid" in dataset_entry: + gguf_writer.add_dataset_uuid(key, dataset_entry["uuid"]) + if "repo_url" in dataset_entry: + gguf_writer.add_dataset_repo_url(key, dataset_entry["repo_url"]) + if self.tags is not None: gguf_writer.add_tags(self.tags) if self.languages is not None: gguf_writer.add_languages(self.languages) - if self.datasets is not None: - gguf_writer.add_datasets(self.datasets) diff --git a/src/gguf/quants.py b/src/gguf/quants.py index a5d8b44..445c43e 100644 --- a/src/gguf/quants.py +++ b/src/gguf/quants.py @@ -1,15 +1,19 @@ from __future__ import annotations -from typing import Callable, Sequence +from abc import ABC, abstractmethod +from typing import Any, Callable, Sequence +from math import log2, ceil from numpy.typing import DTypeLike -from .constants import GGML_QUANT_SIZES, GGMLQuantizationType +from .constants import GGML_QUANT_SIZES, GGMLQuantizationType, QK_K from .lazy import LazyNumpyTensor import numpy as np -def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType): +def quant_shape_to_byte_shape( + shape: Sequence[int], quant_type: GGMLQuantizationType +) -> tuple[int, ...]: block_size, type_size = GGML_QUANT_SIZES[quant_type] if shape[-1] % block_size != 0: raise ValueError( @@ -18,7 +22,9 @@ def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantization return (*shape[:-1], shape[-1] // block_size * type_size) -def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType): +def quant_shape_from_byte_shape( + shape: Sequence[int], quant_type: GGMLQuantizationType +) -> tuple[int, ...]: block_size, type_size = GGML_QUANT_SIZES[quant_type] if shape[-1] % type_size != 
0: raise ValueError( @@ -27,22 +33,8 @@ def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizati return (*shape[:-1], shape[-1] // type_size * block_size) -# same as ggml_compute_fp32_to_bf16 in ggml-impl.h -def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray: - n = n.astype(np.float32, copy=False).view(np.uint32) - # force nan to quiet - n = np.where( - (n & 0x7FFFFFFF) > 0x7F800000, - (n & np.uint32(0xFFFF0000)) | np.uint32(64 << 16), - n, - ) - # round to nearest even - n = (np.uint64(n) + (0x7FFF + ((n >> 16) & 1))) >> 16 - return n.astype(np.uint16) - - # This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time -def __apply_over_grouped_rows( +def _apply_over_grouped_rows( func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, @@ -63,85 +55,1398 @@ def __apply_over_grouped_rows( return out.reshape(oshape) -def __quantize_bf16_array(n: np.ndarray) -> np.ndarray: - return __apply_over_grouped_rows( - __compute_fp32_to_bf16, arr=n, otype=np.uint16, oshape=n.shape - ) +# round away from zero +# ref: https://stackoverflow.com/a/59143326/22827863 +def np_roundf(n: np.ndarray) -> np.ndarray: + a = abs(n) + floored = np.floor(a) + b = floored + np.floor(2 * (a - floored)) + return np.sign(n) * b + + +class QuantError(Exception): ... -__quantize_bf16_lazy = LazyNumpyTensor._wrap_fn( - __quantize_bf16_array, meta_noop=np.uint16 -) +_type_traits: dict[GGMLQuantizationType, type[__Quant]] = {} -def quantize_bf16(n: np.ndarray): - if type(n) is LazyNumpyTensor: - return __quantize_bf16_lazy(n) +def quantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray: + if qtype == GGMLQuantizationType.F32: + return data.astype(np.float32, copy=False) + elif qtype == GGMLQuantizationType.F16: + return data.astype(np.float16, copy=False) + elif (q := _type_traits.get(qtype)) is not None: + return q.quantize(data) else: - return __quantize_bf16_array(n) + raise NotImplementedError( + f"Quantization for {qtype.name} is not yet implemented" + ) -__q8_block_size, __q8_type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q8_0] +def dequantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray: + if qtype == GGMLQuantizationType.F32: + return data.view(np.float32) + elif qtype == GGMLQuantizationType.F16: + return data.view(np.float16).astype(np.float32) + elif (q := _type_traits.get(qtype)) is not None: + return q.dequantize(data) + else: + raise NotImplementedError( + f"Dequantization for {qtype.name} is not yet implemented" + ) -def can_quantize_to_q8_0(n: np.ndarray) -> bool: - return n.shape[-1] % __q8_block_size == 0 +class __Quant(ABC): + qtype: GGMLQuantizationType + block_size: int + type_size: int + grid: np.ndarray[Any, np.dtype[np.float32]] | None = None + grid_shape: tuple[int, int] = (0, 0) + grid_map: tuple[int | float, ...] 
= () + grid_hex: bytes | None = None -# round away from zero -# ref: https://stackoverflow.com/a/59143326/22827863 -def np_roundf(n: np.ndarray) -> np.ndarray: - a = abs(n) - floored = np.floor(a) - b = floored + np.floor(2 * (a - floored)) - return np.sign(n) * b + def __init__(self): + return TypeError("Quant conversion classes can't have instances") + + def __init_subclass__(cls, qtype: GGMLQuantizationType) -> None: + cls.qtype = qtype + cls.block_size, cls.type_size = GGML_QUANT_SIZES[qtype] + cls.__quantize_lazy = LazyNumpyTensor._wrap_fn( + cls.__quantize_array, meta_noop=(np.uint8, cls.__shape_to_bytes) + ) + cls.__dequantize_lazy = LazyNumpyTensor._wrap_fn( + cls.__dequantize_array, meta_noop=(np.float32, cls.__shape_from_bytes) + ) + assert qtype not in _type_traits + _type_traits[qtype] = cls + + @classmethod + def init_grid(cls): + if cls.grid is not None or cls.grid_hex is None: + return + + bits_per_elem = ceil(log2(len(cls.grid_map))) + assert bits_per_elem != 0, cls.qtype.name + elems_per_byte = 8 // bits_per_elem + + grid = np.frombuffer(cls.grid_hex, dtype=np.uint8) + # decode hexadecimal chars from grid + grid = grid.reshape((-1, 2)) + grid = (np.where(grid > 0x40, grid + 9, grid) & 0x0F) << np.array( + [4, 0], dtype=np.uint8 + ).reshape((1, 2)) + grid = grid[..., 0] | grid[..., 1] + # unpack the grid values + grid = grid.reshape((-1, 1)) >> np.array( + [i for i in range(0, 8, 8 // elems_per_byte)], dtype=np.uint8 + ).reshape((1, elems_per_byte)) + grid = (grid & ((1 << bits_per_elem) - 1)).reshape((-1, 1)) + grid_map = np.array(cls.grid_map, dtype=np.float32).reshape((1, -1)) + grid = np.take_along_axis(grid_map, grid, axis=-1) + cls.grid = grid.reshape((1, 1, *cls.grid_shape)) + + @classmethod + @abstractmethod + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + raise NotImplementedError + + @classmethod + @abstractmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + raise NotImplementedError + + @classmethod + def quantize_rows(cls, rows: np.ndarray) -> np.ndarray: + rows = rows.astype(np.float32, copy=False) + shape = rows.shape + n_blocks = rows.size // cls.block_size + blocks = rows.reshape((n_blocks, cls.block_size)) + blocks = cls.quantize_blocks(blocks) + assert blocks.dtype == np.uint8 + assert blocks.shape[-1] == cls.type_size + return blocks.reshape(cls.__shape_to_bytes(shape)) + + @classmethod + def dequantize_rows(cls, rows: np.ndarray) -> np.ndarray: + rows = rows.view(np.uint8) + shape = rows.shape + n_blocks = rows.size // cls.type_size + blocks = rows.reshape((n_blocks, cls.type_size)) + blocks = cls.dequantize_blocks(blocks) + assert blocks.dtype == np.float32 + assert blocks.shape[-1] == cls.block_size + return blocks.reshape(cls.__shape_from_bytes(shape)) + + @classmethod + def __shape_to_bytes(cls, shape: Sequence[int]): + return quant_shape_to_byte_shape(shape, cls.qtype) + + @classmethod + def __shape_from_bytes(cls, shape: Sequence[int]): + return quant_shape_from_byte_shape(shape, cls.qtype) + + @classmethod + def __quantize_array(cls, array: np.ndarray) -> np.ndarray: + return _apply_over_grouped_rows( + cls.quantize_rows, + arr=array, + otype=np.uint8, + oshape=cls.__shape_to_bytes(array.shape), + ) + + @classmethod + def __dequantize_array(cls, array: np.ndarray) -> np.ndarray: + cls.init_grid() + return _apply_over_grouped_rows( + cls.dequantize_rows, + arr=array, + otype=np.float32, + oshape=cls.__shape_from_bytes(array.shape), + ) + + @classmethod + def __quantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) 
-> Any: + pass + + @classmethod + def __dequantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any: + pass + + @classmethod + def can_quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> bool: + return tensor.shape[-1] % cls.block_size == 0 + + @classmethod + def quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray: + if not cls.can_quantize(tensor): + raise QuantError( + f"Can't quantize tensor with shape {tensor.shape} to {cls.qtype.name}" + ) + if isinstance(tensor, LazyNumpyTensor): + return cls.__quantize_lazy(tensor) + else: + return cls.__quantize_array(tensor) + + @classmethod + def dequantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray: + if isinstance(tensor, LazyNumpyTensor): + return cls.__dequantize_lazy(tensor) + else: + return cls.__dequantize_array(tensor) + + +class BF16(__Quant, qtype=GGMLQuantizationType.BF16): + @classmethod + # same as ggml_compute_fp32_to_bf16 in ggml-impl.h + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n = blocks.view(np.uint32) + # force nan to quiet + n = np.where( + (n & 0x7FFFFFFF) > 0x7F800000, + (n & np.uint32(0xFFFF0000)) | np.uint32(64 << 16), + n, + ) + # round to nearest even + n = (np.uint64(n) + (0x7FFF + ((n >> 16) & 1))) >> 16 + return n.astype(np.uint16).view(np.uint8) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + return (blocks.view(np.int16).astype(np.int32) << 16).view(np.float32) + + +class Q4_0(__Quant, qtype=GGMLQuantizationType.Q4_0): + @classmethod + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + imax = abs(blocks).argmax(axis=-1, keepdims=True) + max = np.take_along_axis(blocks, imax, axis=-1) + + d = max / -8 + with np.errstate(divide="ignore"): + id = np.where(d == 0, 0, 1 / d) + # FIXME: Q4_0's reference rounding is cursed and depends on FMA + qs = ( + np.trunc( + (np.float64(blocks) * np.float64(id)) + np.float64(8.5), + dtype=np.float32, + ) + .astype(np.uint8) + .clip(0, 15) + ) + + qs = qs.reshape((n_blocks, 2, cls.block_size // 2)) + qs = qs[..., 0, :] | (qs[..., 1, :] << np.uint8(4)) + + d = d.astype(np.float16).view(np.uint8) + + return np.concatenate([d, qs], axis=-1) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, qs = np.hsplit(blocks, [2]) + + d = d.view(np.float16).astype(np.float32) + + qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array( + [0, 4], dtype=np.uint8 + ).reshape((1, 1, 2, 1)) + qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1)).astype(np.int8) - np.int8(8) + + return d * qs.astype(np.float32) + + +class Q4_1(__Quant, qtype=GGMLQuantizationType.Q4_1): + @classmethod + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + max = blocks.max(axis=-1, keepdims=True) + min = blocks.min(axis=-1, keepdims=True) + + d = (max - min) / 15 + with np.errstate(divide="ignore"): + id = np.where(d == 0, 0, 1 / d) + qs = ( + np.trunc((blocks - min) * id + np.float32(0.5), dtype=np.float32) + .astype(np.uint8) + .clip(0, 15) + ) + + qs = qs.reshape((n_blocks, 2, cls.block_size // 2)) + qs = qs[..., 0, :] | (qs[..., 1, :] << np.uint8(4)) + + d = d.astype(np.float16).view(np.uint8) + m = min.astype(np.float16).view(np.uint8) + + return np.concatenate([d, m, qs], axis=-1) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + m, qs = np.hsplit(rest, [2]) + + d = 
d.view(np.float16).astype(np.float32) + m = m.view(np.float16).astype(np.float32) + + qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array( + [0, 4], dtype=np.uint8 + ).reshape((1, 1, 2, 1)) + qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1)).astype(np.float32) + + return (d * qs) + m + + +class Q5_0(__Quant, qtype=GGMLQuantizationType.Q5_0): + @classmethod + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + imax = abs(blocks).argmax(axis=-1, keepdims=True) + max = np.take_along_axis(blocks, imax, axis=-1) + + d = max / -16 + with np.errstate(divide="ignore"): + id = np.where(d == 0, 0, 1 / d) + # FIXME: Q5_0's reference rounding is cursed and depends on FMA + q = ( + np.trunc( + (np.float64(blocks) * np.float64(id)) + np.float64(16.5), + dtype=np.float32, + ) + .astype(np.uint8) + .clip(0, 31) + ) + + qs = q.reshape((n_blocks, 2, cls.block_size // 2)) + qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4)) + + qh = np.packbits( + q.reshape((n_blocks, 1, 32)) >> np.uint8(4), axis=-1, bitorder="little" + ).reshape(n_blocks, 4) + + d = d.astype(np.float16).view(np.uint8) + + return np.concatenate([d, qh, qs], axis=-1) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + qh, qs = np.hsplit(rest, [4]) + + d = d.view(np.float16).astype(np.float32) + qh = qh.view(np.uint32) + + qh = qh.reshape((n_blocks, 1)) >> np.array( + [i for i in range(32)], dtype=np.uint32 + ).reshape((1, 32)) + ql = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array( + [0, 4], dtype=np.uint8 + ).reshape((1, 1, 2, 1)) + qh = (qh & np.uint32(0x01)).astype(np.uint8) + ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1)) + + qs = (ql | (qh << np.uint8(4))).astype(np.int8) - np.int8(16) + + return d * qs.astype(np.float32) + + +class Q5_1(__Quant, qtype=GGMLQuantizationType.Q5_1): + @classmethod + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + max = blocks.max(axis=-1, keepdims=True) + min = blocks.min(axis=-1, keepdims=True) + + d = (max - min) / 31 + with np.errstate(divide="ignore"): + id = np.where(d == 0, 0, 1 / d) + q = ( + np.trunc((blocks - min) * id + np.float32(0.5), dtype=np.float32) + .astype(np.uint8) + .clip(0, 31) + ) + + qs = q.reshape((n_blocks, 2, cls.block_size // 2)) + qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4)) + + qh = np.packbits( + q.reshape((n_blocks, 1, 32)) >> np.uint8(4), axis=-1, bitorder="little" + ).reshape(n_blocks, 4) + + d = d.astype(np.float16).view(np.uint8) + m = min.astype(np.float16).view(np.uint8) + + return np.concatenate([d, m, qh, qs], axis=-1) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + m, rest = np.hsplit(rest, [2]) + qh, qs = np.hsplit(rest, [4]) + + d = d.view(np.float16).astype(np.float32) + m = m.view(np.float16).astype(np.float32) + qh = qh.view(np.uint32) + + qh = qh.reshape((n_blocks, 1)) >> np.array( + [i for i in range(32)], dtype=np.uint32 + ).reshape((1, 32)) + ql = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array( + [0, 4], dtype=np.uint8 + ).reshape((1, 1, 2, 1)) + qh = (qh & np.uint32(0x01)).astype(np.uint8) + ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1)) + + qs = (ql | (qh << np.uint8(4))).astype(np.float32) + + return (d * qs) + m + + +class Q8_0(__Quant, qtype=GGMLQuantizationType.Q8_0): + 
@classmethod + # Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + + d = abs(blocks).max(axis=1, keepdims=True) / 127 + with np.errstate(divide="ignore"): + id = np.where(d == 0, 0, 1 / d) + qs = np_roundf(blocks * id) + + # (n_blocks, 2) + d = d.astype(np.float16).view(np.uint8) + # (n_blocks, block_size) + qs = qs.astype(np.int8).view(np.uint8) + + return np.concatenate([d, qs], axis=1) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + d, x = np.split(blocks, [2], axis=1) + d = d.view(np.float16).astype(np.float32) + x = x.view(np.int8).astype(np.float32) + + return x * d + + +class Q2_K(__Quant, qtype=GGMLQuantizationType.Q2_K): + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + scales, rest = np.hsplit(blocks, [QK_K // 16]) + qs, rest = np.hsplit(rest, [QK_K // 4]) + d, dmin = np.hsplit(rest, [2]) + + d = d.view(np.float16).astype(np.float32) + dmin = dmin.view(np.float16).astype(np.float32) + + # (n_blocks, 16, 1) + dl = (d * (scales & 0xF).astype(np.float32)).reshape((n_blocks, QK_K // 16, 1)) + ml = (dmin * (scales >> 4).astype(np.float32)).reshape( + (n_blocks, QK_K // 16, 1) + ) + + shift = np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1)) + + qs = (qs.reshape((n_blocks, -1, 1, 32)) >> shift) & np.uint8(3) + + qs = qs.reshape((n_blocks, QK_K // 16, 16)).astype(np.float32) + + qs = dl * qs - ml + + return qs.reshape((n_blocks, -1)) + + +class Q3_K(__Quant, qtype=GGMLQuantizationType.Q3_K): + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + hmask, rest = np.hsplit(blocks, [QK_K // 8]) + qs, rest = np.hsplit(rest, [QK_K // 4]) + scales, d = np.hsplit(rest, [12]) + + d = d.view(np.float16).astype(np.float32) + + # The scales are packed at 6-bit each in this pattern: + # 0: IIIIAAAA + # 1: JJJJBBBB + # 2: KKKKCCCC + # 3: LLLLDDDD + # 4: MMMMEEEE + # 5: NNNNFFFF + # 6: OOOOGGGG + # 7: PPPPHHHH + # 8: MMIIEEAA + # 9: NNJJFFBB + # 10: OOKKGGCC + # 11: PPLLHHDD + lscales, hscales = np.hsplit(scales, [8]) + lscales = lscales.reshape((n_blocks, 1, 8)) >> np.array( + [0, 4], dtype=np.uint8 + ).reshape((1, 2, 1)) + lscales = lscales.reshape((n_blocks, 16)) + hscales = hscales.reshape((n_blocks, 1, 4)) >> np.array( + [0, 2, 4, 6], dtype=np.uint8 + ).reshape((1, 4, 1)) + hscales = hscales.reshape((n_blocks, 16)) + scales = (lscales & np.uint8(0x0F)) | ( + (hscales & np.uint8(0x03)) << np.uint8(4) + ) + scales = (scales.astype(np.int8) - np.int8(32)).astype(np.float32) + + dl = (d * scales).reshape((n_blocks, 16, 1)) + + ql = qs.reshape((n_blocks, -1, 1, 32)) >> np.array( + [0, 2, 4, 6], dtype=np.uint8 + ).reshape((1, 1, 4, 1)) + qh = hmask.reshape(n_blocks, -1, 1, 32) >> np.array( + [i for i in range(8)], dtype=np.uint8 + ).reshape((1, 1, 8, 1)) + ql = ql.reshape((n_blocks, 16, QK_K // 16)) & np.uint8(3) + qh = qh.reshape((n_blocks, 16, QK_K // 16)) & np.uint8(1) + qh = qh ^ np.uint8(1) # strangely, the offset is zero when the bitmask is 1 + q = (ql.astype(np.int8) - (qh << np.uint8(2)).astype(np.int8)).astype( + np.float32 + ) + + return (dl * q).reshape((n_blocks, QK_K)) + + +class Q4_K(__Quant, qtype=GGMLQuantizationType.Q4_K): + K_SCALE_SIZE = 12 + + @staticmethod + def get_scale_min(scales: np.ndarray) -> tuple[np.ndarray, np.ndarray]: + n_blocks = scales.shape[0] + scales = scales.view(np.uint8) + ### Unpacking the 
following: ### + # 0 EEAAAAAA + # 1 FFBBBBBB + # 2 GGCCCCCC + # 3 HHDDDDDD + # 4 eeaaaaaa + # 5 ffbbbbbb + # 6 ggcccccc + # 7 hhdddddd + # 8 eeeeEEEE + # 9 ffffFFFF + # 10 ggggGGGG + # 11 hhhhHHHH + scales = scales.reshape((n_blocks, 3, 4)) + d, m, m_d = np.split(scales, 3, axis=-2) + sc = np.concatenate([d & 0x3F, (m_d & 0x0F) | ((d >> 2) & 0x30)], axis=-1) + min = np.concatenate([m & 0x3F, (m_d >> 4) | ((m >> 2) & 0x30)], axis=-1) -def __quantize_q8_0_shape_change(s: tuple[int, ...]) -> tuple[int, ...]: - return (*s[:-1], s[-1] // __q8_block_size * __q8_type_size) + return (sc.reshape((n_blocks, 8)), min.reshape((n_blocks, 8))) + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] -# Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c -def __quantize_q8_0_rows(n: np.ndarray) -> np.ndarray: - shape = n.shape - assert shape[-1] % __q8_block_size == 0 + d, rest = np.hsplit(blocks, [2]) + dmin, rest = np.hsplit(rest, [2]) + scales, qs = np.hsplit(rest, [cls.K_SCALE_SIZE]) - n_blocks = n.size // __q8_block_size + d = d.view(np.float16).astype(np.float32) + dmin = dmin.view(np.float16).astype(np.float32) - blocks = n.reshape((n_blocks, __q8_block_size)).astype(np.float32, copy=False) + sc, m = Q4_K.get_scale_min(scales) - d = abs(blocks).max(axis=1, keepdims=True) / 127 - with np.errstate(divide="ignore"): - id = np.where(d == 0, 0, 1 / d) - qs = np_roundf(blocks * id) + d = (d * sc.astype(np.float32)).reshape((n_blocks, -1, 1)) + dm = (dmin * m.astype(np.float32)).reshape((n_blocks, -1, 1)) - # (n_blocks, 2) - d = d.astype(np.float16).view(np.uint8) - # (n_blocks, block_size) - qs = qs.astype(np.int8).view(np.uint8) + qs = qs.reshape((n_blocks, -1, 1, 32)) >> np.array( + [0, 4], dtype=np.uint8 + ).reshape((1, 1, 2, 1)) + qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1, 32)).astype(np.float32) - assert d.shape[1] + qs.shape[1] == __q8_type_size + return (d * qs - dm).reshape((n_blocks, QK_K)) - return np.concatenate([d, qs], axis=1).reshape(__quantize_q8_0_shape_change(shape)) +class Q5_K(__Quant, qtype=GGMLQuantizationType.Q5_K): + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] -def __quantize_q8_0_array(n: np.ndarray) -> np.ndarray: - return __apply_over_grouped_rows( - __quantize_q8_0_rows, - arr=n, - otype=np.uint8, - oshape=__quantize_q8_0_shape_change(n.shape), + d, rest = np.hsplit(blocks, [2]) + dmin, rest = np.hsplit(rest, [2]) + scales, rest = np.hsplit(rest, [Q4_K.K_SCALE_SIZE]) + qh, qs = np.hsplit(rest, [QK_K // 8]) + + d = d.view(np.float16).astype(np.float32) + dmin = dmin.view(np.float16).astype(np.float32) + + sc, m = Q4_K.get_scale_min(scales) + + d = (d * sc.astype(np.float32)).reshape((n_blocks, -1, 1)) + dm = (dmin * m.astype(np.float32)).reshape((n_blocks, -1, 1)) + + ql = qs.reshape((n_blocks, -1, 1, 32)) >> np.array( + [0, 4], dtype=np.uint8 + ).reshape((1, 1, 2, 1)) + qh = qh.reshape((n_blocks, -1, 1, 32)) >> np.array( + [i for i in range(8)], dtype=np.uint8 + ).reshape((1, 1, 8, 1)) + ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1, 32)) + qh = (qh & np.uint8(0x01)).reshape((n_blocks, -1, 32)) + q = (ql | (qh << np.uint8(4))).astype(np.float32) + + return (d * q - dm).reshape((n_blocks, QK_K)) + + +class Q6_K(__Quant, qtype=GGMLQuantizationType.Q6_K): + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + ql, rest = np.hsplit(blocks, [QK_K // 2]) + qh, 
rest = np.hsplit(rest, [QK_K // 4]) + scales, d = np.hsplit(rest, [QK_K // 16]) + + scales = scales.view(np.int8).astype(np.float32) + d = d.view(np.float16).astype(np.float32) + d = (d * scales).reshape((n_blocks, QK_K // 16, 1)) + + ql = ql.reshape((n_blocks, -1, 1, 64)) >> np.array( + [0, 4], dtype=np.uint8 + ).reshape((1, 1, 2, 1)) + ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1, 32)) + qh = qh.reshape((n_blocks, -1, 1, 32)) >> np.array( + [0, 2, 4, 6], dtype=np.uint8 + ).reshape((1, 1, 4, 1)) + qh = (qh & np.uint8(0x03)).reshape((n_blocks, -1, 32)) + q = (ql | (qh << np.uint8(4))).astype(np.int8) - np.int8(32) + q = q.reshape((n_blocks, QK_K // 16, -1)).astype(np.float32) + + return (d * q).reshape((n_blocks, QK_K)) + + +class TQ1_0(__Quant, qtype=GGMLQuantizationType.TQ1_0): + @classmethod + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d = abs(blocks).max(axis=-1, keepdims=True) + with np.errstate(divide="ignore"): + id = np.where(d == 0, 0, 1 / d) + qs = np_roundf(blocks * id) + qs = (qs.astype(np.int8) + np.int8(1)).astype(np.uint8) + + qs0, qs1, qh = ( + qs[..., : (32 * 5)], + qs[..., (32 * 5) : (48 * 5)], + qs[..., (48 * 5) :], + ) + qs0 = qs0.reshape((n_blocks, -1, 5, 32)) * np.array( + [81, 27, 9, 3, 1], dtype=np.uint8 + ).reshape((1, 1, 5, 1)) + qs0 = np.sum(qs0, axis=-2).reshape((n_blocks, -1)) + qs1 = qs1.reshape((n_blocks, -1, 5, 16)) * np.array( + [81, 27, 9, 3, 1], dtype=np.uint8 + ).reshape((1, 1, 5, 1)) + qs1 = np.sum(qs1, axis=-2).reshape((n_blocks, -1)) + qh = qh.reshape((n_blocks, -1, 4, 4)) * np.array( + [81, 27, 9, 3], dtype=np.uint8 + ).reshape((1, 1, 4, 1)) + qh = np.sum(qh, axis=-2).reshape((n_blocks, -1)) + qs = np.concatenate([qs0, qs1, qh], axis=-1) + qs = (qs.astype(np.uint16) * 256 + (243 - 1)) // 243 + + qs = qs.astype(np.uint8) + d = d.astype(np.float16).view(np.uint8) + + return np.concatenate([qs, d], axis=-1) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + qs, rest = np.hsplit(blocks, [(QK_K - 4 * QK_K // 64) // 5]) + qh, d = np.hsplit(rest, [QK_K // 64]) + + d = d.view(np.float16).astype(np.float32) + + qs0, qs1 = qs[..., :32], qs[..., 32:] + qs0 = qs0.reshape((n_blocks, -1, 1, 32)) * np.array( + [1, 3, 9, 27, 81], dtype=np.uint8 + ).reshape((1, 1, 5, 1)) + qs0 = qs0.reshape((n_blocks, -1)) + qs1 = qs1.reshape((n_blocks, -1, 1, 16)) * np.array( + [1, 3, 9, 27, 81], dtype=np.uint8 + ).reshape((1, 1, 5, 1)) + qs1 = qs1.reshape((n_blocks, -1)) + qh = qh.reshape((n_blocks, -1, 1, 4)) * np.array( + [1, 3, 9, 27], dtype=np.uint8 + ).reshape((1, 1, 4, 1)) + qh = qh.reshape((n_blocks, -1)) + qs = np.concatenate([qs0, qs1, qh], axis=-1) + qs = ((qs.astype(np.uint16) * 3) >> 8).astype(np.int8) - np.int8(1) + + return d * qs.astype(np.float32) + + +class TQ2_0(__Quant, qtype=GGMLQuantizationType.TQ2_0): + @classmethod + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d = abs(blocks).max(axis=-1, keepdims=True) + with np.errstate(divide="ignore"): + id = np.where(d == 0, 0, 1 / d) + qs = np_roundf(blocks * id) + qs = (qs.astype(np.int8) + np.int8(1)).astype(np.uint8) + + qs = qs.reshape((n_blocks, -1, 4, 32)) << np.array( + [0, 2, 4, 6], dtype=np.uint8 + ).reshape((1, 1, 4, 1)) + qs = qs[..., 0, :] | qs[..., 1, :] | qs[..., 2, :] | qs[..., 3, :] + qs = qs.reshape((n_blocks, -1)) + + d = d.astype(np.float16).view(np.uint8) + + return np.concatenate([qs, d], axis=-1) + + @classmethod + def 
dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + qs, d = np.hsplit(blocks, [QK_K // 4]) + + d = d.view(np.float16).astype(np.float32) + + qs = qs.reshape((n_blocks, -1, 1, 32)) >> np.array( + [0, 2, 4, 6], dtype=np.uint8 + ).reshape((1, 1, 4, 1)) + qs = (qs & 0x03).reshape((n_blocks, -1)).astype(np.int8) - np.int8(1) + + return d * qs.astype(np.float32) + + +class IQ2_XXS(__Quant, qtype=GGMLQuantizationType.IQ2_XXS): + ksigns: bytes = ( + b"\x00\x81\x82\x03\x84\x05\x06\x87\x88\x09\x0a\x8b\x0c\x8d\x8e\x0f" + b"\x90\x11\x12\x93\x14\x95\x96\x17\x18\x99\x9a\x1b\x9c\x1d\x1e\x9f" + b"\xa0\x21\x22\xa3\x24\xa5\xa6\x27\x28\xa9\xaa\x2b\xac\x2d\x2e\xaf" + b"\x30\xb1\xb2\x33\xb4\x35\x36\xb7\xb8\x39\x3a\xbb\x3c\xbd\xbe\x3f" + b"\xc0\x41\x42\xc3\x44\xc5\xc6\x47\x48\xc9\xca\x4b\xcc\x4d\x4e\xcf" + b"\x50\xd1\xd2\x53\xd4\x55\x56\xd7\xd8\x59\x5a\xdb\x5c\xdd\xde\x5f" + b"\x60\xe1\xe2\x63\xe4\x65\x66\xe7\xe8\x69\x6a\xeb\x6c\xed\xee\x6f" + b"\xf0\x71\x72\xf3\x74\xf5\xf6\x77\x78\xf9\xfa\x7b\xfc\x7d\x7e\xff" ) + # iq2xxs_grid, but with each byte of the original packed in 2 bits, + # by mapping 0x08 to 0, 0x19 to 1, and 0x2b to 2. + grid_shape = (256, 8) + grid_map = (0x08, 0x19, 0x2B) + grid_hex = ( + b"00000200050008000a00110014002000220028002a0041004400500058006100" + b"6400800082008a00a20001010401100115014001840198010002020222028202" + b"010404041004210424044004420448046004810484049004a404000502050805" + b"200546056905800591050906100640068406a406000805080808140828084108" + b"440850085208880804094009020a140a01100410101021104010601084109010" + b"951000110811201150115a118011241245120014081420142514491480141815" + b"6215001616160118041810184018811800190519a019511a002002200a204420" + b"6120802082202921482100220222012404241024402456240025412564259026" + b"082820289428442a014004401040184021402440404048405640604081408440" + b"9040004120416141804185410142104248425642684200440844204480449944" + b"124524450046014804481048404845480049584961498249454a904a00500850" + b"1150195020508050885004514251a4519152905492540a550156545600581158" + b"195864584059085a046010604060686000615561186260620064056410651265" + b"84654268008002800a8041808280048118814081118201840484108415844084" + b"608400854685948509864086608602880489118a0490109024904090a1901691" + b"8091459200942294449451958198209902a050a085a009a100a218a450a804a9" + ) -__quantize_q8_0_lazy = LazyNumpyTensor._wrap_fn( - __quantize_q8_0_array, - meta_noop=(np.uint8, __quantize_q8_0_shape_change), -) + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + d, qs = np.hsplit(blocks, [2]) -def quantize_q8_0(data: np.ndarray): - if type(data) is LazyNumpyTensor: - return __quantize_q8_0_lazy(data) - else: - return __quantize_q8_0_array(data) + d = d.view(np.float16).astype(np.float32) + + qs = qs.view(np.uint32).reshape(n_blocks, -1, 2) + + db = ( + d + * (np.float32(0.5) + (qs[..., 1] >> 28).astype(np.float32)) + * np.float32(0.25) + ) + db = db.reshape((n_blocks, -1, 1, 1)) + + # get the sign indices and unpack the bits + signs = qs[..., 1].reshape((n_blocks, -1, 1)) >> np.array( + [0, 7, 14, 21], dtype=np.uint32 + ).reshape((1, 1, 4)) + ksigns = np.frombuffer(cls.ksigns, dtype=np.uint8).reshape((1, 1, 1, 128)) + signs = (signs & np.uint32(0x7F)).reshape((n_blocks, -1, 4, 1)) + signs = np.take_along_axis(ksigns, signs, axis=-1) + signs = signs.reshape((n_blocks, -1, 4, 1)) >> np.array( + [i for i in range(8)], dtype=np.uint8 + ).reshape((1, 1, 1, 8)) + signs = signs & np.uint8(0x01) + 
signs = np.where(signs == 0, np.float32(1), np.float32(-1)) + signs = signs.reshape((n_blocks, -1, 4, 8)) + + assert cls.grid is not None + grid = np.take_along_axis( + cls.grid, + qs[..., 0].copy().view(np.uint8).reshape((n_blocks, -1, 1, 1)), + axis=-2, + ) + grid = grid.reshape((n_blocks, -1, 4, 8)) + + return (db * grid * signs).reshape((n_blocks, -1)) + + +class IQ2_XS(__Quant, qtype=GGMLQuantizationType.IQ2_XS): + # iq2xs_grid, but with each byte of the original packed in 2 bits, + # by mapping 0x08 to 0, 0x19 to 1, and 0x2b to 2. + grid_shape = (512, 8) + grid_map = (0x08, 0x19, 0x2B) + grid_hex = ( + b"00000200050008000a0011001400160019002000220025002800410044004600" + b"49005000520055005800610064008000820085008800910094009900a0000101" + b"04010601090110011201150118011a0121012401400142014501480151015401" + b"6001680181018401900100020202050208021102140220024102440250025502" + b"80028a0201040404060409041004120415041804210424044004420445044804" + b"5104540456046004810484049004000502050505080511051405200541054405" + b"500561058005010604061006260640064206840600080208050808080a081108" + b"14082008250841084408500858088008a008aa08010904091009400981098909" + b"000a200a280a960aa00a01100410061009101010121015101810211024104010" + b"4210451048105110541060106a10811084109010001102110511081111111411" + b"2011411144115011801194119611011204120612101240126012001402140514" + b"0814111414142014411444144914501464148014011504151015401500161416" + b"49160118041810181218401854188618001905196619511aa91a002002200520" + b"08200a201120142020204120442050208020a020012104211021402148216521" + b"002222228022a82201240424102429244024002541255225992501261a26a626" + b"002808280a28202855288828a22868299029082a202a822a882a8a2a01400440" + b"0640094010401240154018402140244040404240454048404a40514054406040" + b"6540814084409040004102410541084111411441204141414441504180418541" + b"a241014204421042124229424042004402440544084411441444194420444144" + b"4444504480449444014504451045244540459a4500460a464446504601480448" + b"1048404845485448624800491149444950496949044a00500250055008501150" + b"145020502850415044505050805001510451105115514051425100524452aa52" + b"0154045410542154405460548154a154005508558055885521566856a1560058" + b"14584158505899581a5940594259855a0160046010604060546062608660a960" + b"006124624a62926200641664106540654565a46501686a682569066a546a626a" + b"00800280058008801180148020802a8041804480508080808280a880aa800181" + b"0481068110814081518159810082208280828282a082a8820184048410841284" + b"158440846084898400854485a58518866a860088088825885a8880888288a888" + b"0689228a808a888a968aa88a0190049010904090569084900091229164915692" + b"89920094059444945094589429959095929541965198a6984999159a609a00a0" + b"02a008a00aa020a02aa0a0a051a159a1a6a100a202a208a22aa280a2a0a240a4" + b"95a465a698a60aa820a822a828a8a0a8a8a804a984a986a928aa2aaa91aaaaaa" + ) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + qs, scales = np.hsplit(rest, [2 * QK_K // 8]) + + d = d.view(np.float16).astype(np.float32) + qs = qs.view(np.uint16) + + scales = scales.reshape((n_blocks, -1, 1)) >> np.array( + [0, 4], dtype=np.uint8 + ).reshape((1, 1, 2)) + scales = (scales & 0x0F).reshape((n_blocks, -1)) + db = d * (np.float32(0.5) + scales) * np.float32(0.25) + db = db.reshape((n_blocks, -1, 1, 1)) + + # get the sign indices and unpack the bits + signs = np.frombuffer(IQ2_XXS.ksigns, dtype=np.uint8).reshape(1, 1, 128) + signs = np.take_along_axis(signs, (qs >> 
9).reshape((n_blocks, -1, 1)), axis=-1) + signs = signs.reshape((n_blocks, -1, 1)) >> np.array( + [i for i in range(8)], dtype=np.uint8 + ).reshape((1, 1, 8)) + signs = signs & np.uint8(0x01) + signs = np.where(signs == 0, np.float32(1), np.float32(-1)) + signs = signs.reshape((n_blocks, -1, 2, 8)) + + assert cls.grid is not None + grid = np.take_along_axis( + cls.grid, (qs & np.uint16(511)).reshape((n_blocks, -1, 1, 1)), axis=-2 + ) + grid = grid.reshape((n_blocks, -1, 2, 8)) + + return (db * grid * signs).reshape((n_blocks, -1)) + + +class IQ2_S(__Quant, qtype=GGMLQuantizationType.IQ2_S): + # iq2s_grid, but with each byte of the original packed in 2 bits, + # by mapping 0x08 to 0, 0x19 to 1, and 0x2b to 2. + grid_shape = (1024, 8) + grid_map = (0x08, 0x19, 0x2B) + grid_hex = ( + b"00000200050008000a0011001400160019002000220025002800410044004600" + b"490050005200550058006100640066006900800082008500880091009400a000" + b"a500aa0001010401060109011001120115011801210124014001420145014801" + b"510154015601590160016501680181018401900192019501a101a40100020202" + b"050208021102140220022a02410244024602490250025502800285028a029402" + b"a202010404040604090410041204150418042104240426042904400442044504" + b"48044a0451045404560459046004620465048104840486048904900495049804" + b"a104a40400050205050508050a05110514051605190520052505280541054405" + b"46054905500552055505580561056405800582058505880591059405a0050106" + b"0406060609061006150640064506480651065406600681068406900600080208" + b"050808081108140816081908200825082a084108440846084908500852085508" + b"580861086408800885089408aa08010904091009120915091809210940094509" + b"480951095409600981099009000a110a140a220a280a2a0a500a990a01100410" + b"0610091010101210151018102110241026104010421045104810511054105610" + b"59106010621065106810811084108610901095109810a110a410001102110511" + b"08110a1111111411161119112011221125112811411144114611491150115211" + b"5511581161116411801182118511881191119411011204120912101215122112" + b"2412401245125112541281128412901200140214051408141114141416141914" + b"2014251428144114441446144914501452145514581461146414801482148514" + b"881491149414a014011504150615091510151215151518152115241540154215" + b"4515481551155415601581158415901500160516081611161416201641164416" + b"50168016aa160118041806180918101815181818211840184218451848185118" + b"541860188118841800190219051908191119141920194119441950196919a219" + b"041a101a401a561a00200220052008201120142016201920202025202a204120" + b"4420502052205520642080208a209420aa200121042110211221152121214021" + b"4221452151215421602181218421902100220a22222228222a22442250228822" + b"8a22a82201240424062409241024152418242124242440244224452448245124" + b"5424602481248424902400250525082511251425202541254425502566258025" + b"0126042610264026592600280528112814284128442850288a28aa2801290429" + b"102995290a2a222a642a882a8a2a014004400640094010401240154018401a40" + b"21402440264040404240454048404a4051405440564059406040624065408140" + b"8440904095409840a140a4400041024105410841114114411641194120412241" + b"2541414144414641494150415241554158416141644180418241854188419141" + b"9441a04101420442104212421542184224424042454248425142544260428142" + b"844200440244054408440a441144144416441944204422442544284441444444" + b"46444944504452445544584461446444804482448544884491449444a0440145" + b"0445064509451045124515451845214524454045424545454845514554456045" + b"6a4581458445904500460246054608461146144620464146444650468046a546" + b"0148044809481048124815481848214824484048424845484848514854486048" + 
b"84489048004902490549084911491449204941494449504980499649014a044a" + b"104a404a00500250055008501150145016501950205022502550285041504450" + b"4650495050505250555058506150645080508250855088509150945001510451" + b"0651095110511251155118512151245140514251455148515151545160518151" + b"8451905100520552085211521452205241524452505269528052015404540654" + b"0954105412541554185421542454405442544554485451545454605481548454" + b"9054005502550555085511551455205541554455505580550156045610562656" + b"405600580258055808581158145820584158445850585a588058015904591059" + b"4059005a195a855aa85a01600460066010601260156018602160246040604560" + b"4860516054606060846090600061026105610861116114612061416144615061" + b"806199610462106240625662a162006405640864116414642064416444645064" + b"806401650465106540654a656865926500669466016804681068656898680069" + b"2a69426aa16a0080028005800880118014801980208025804180448050805280" + b"5580588061808080858091809480018104810981108112811581188121812481" + b"408142814581488151815481818184819081a981008205820a82118214824182" + b"4482508201840484068409841084128415841884218440844284458448845184" + b"5484608481848484908400850285058508851185148520854185448550858085" + b"8a85018604861086298640860088058811881488418844885088a28801890489" + b"40896589228a588a5a8a828aa28a019004900990109012901590189024904090" + b"4290459048905190549060908190849090900091059111911491419144915091" + b"5a910192049210924092a6920094029405940894119414942094419444945094" + b"8094969401950495109540959895a19500964696649601980498109826984098" + b"a998009949995299909a00a005a00aa014a022a02aa041a044a050a0a2a0aaa0" + b"40a165a102a20aa222a228a22aa282a288a28aa2a8a201a404a410a440a489a4" + b"a4a400a519a551a60aa828a8a2a854a986a908aa0aaa20aa22aa28aa88aaaaaa" + ) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + qs, rest = np.hsplit(rest, [QK_K // 8]) + signs, rest = np.hsplit(rest, [QK_K // 8]) + qh, scales = np.hsplit(rest, [QK_K // 32]) + + d = d.view(np.float16).astype(np.float32) + + scales = scales.reshape((n_blocks, -1, 1)) >> np.array( + [0, 4], dtype=np.uint8 + ).reshape((1, 1, 2)) + scales = (scales & 0x0F).reshape((n_blocks, -1)) + db = d * (np.float32(0.5) + scales) * np.float32(0.25) + db = db.reshape((n_blocks, -1, 1, 1)) + + # unpack the sign bits + signs = signs.reshape((n_blocks, -1, 1)) >> np.array( + [i for i in range(8)], dtype=np.uint8 + ).reshape((1, 1, 8)) + signs = signs & np.uint8(0x01) + signs = np.where(signs == 0, np.float32(1), np.float32(-1)) + signs = signs.reshape((n_blocks, -1, 2, 8)) + + qh = qh.reshape((n_blocks, -1, 1)) >> np.array( + [0, 2, 4, 6], dtype=np.uint8 + ).reshape((1, 1, 4)) + qs = qs.astype(np.uint16) | ((qh & 0x03).astype(np.uint16) << 8).reshape( + (n_blocks, -1) + ) + + assert cls.grid is not None + grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2) + grid = grid.reshape((n_blocks, -1, 2, 8)) + + return (db * grid * signs).reshape((n_blocks, -1)) + + +class IQ3_XXS(__Quant, qtype=GGMLQuantizationType.IQ3_XXS): + grid_shape = (256, 4) + grid_map = (0x04, 0x0C, 0x14, 0x1C, 0x24, 0x2C, 0x34, 0x3E) + grid_hex = ( + b"0000020004001100130017002000220031004200730075000101030110011201" + b"2101250130013201410154017001000202020402110220022202310233023702" + b"5102570275020103070310031203250370031304370444045704730475040105" + b"0705320552053506640610071407160743076107011003101010121021102310" + b"3010321034104710501000110211111120112211011203121012121221123012" + 
b"7212001302132013311346136613011405145014201524154615711505162217" + b"4017002002201120132020202220262031204220012103210521102112212121" + b"3021632167217021002202221122172220222222372240225522012310231423" + b"7023742335245324032527254125742501270327162745270130103012302130" + b"2330503065307230003102312031313144314631013203321032253252327232" + b"1133333330344734723400350635223555351436363663363337603704401740" + b"3540374053405740744120423742404260426642074345430444514464442545" + b"4345704505471047124730471250415070500051065126515551145232527252" + b"0253535310542354275472540255315550562457425724604460466064602161" + b"6161176264623063366344640565526533660367216703700570077010703270" + b"5270267140711272457252720073157333736073217441740075027524753076" + ) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + qs, scales = np.hsplit(rest, [QK_K // 4]) + + d = d.view(np.float16).astype(np.float32) + scales = scales.view(np.uint32) + + db = d * (np.float32(0.5) + (scales >> 28).astype(np.float32)) * np.float32(0.5) + db = db.reshape((n_blocks, -1, 1, 1)) + + # get the sign indices and unpack the bits + signs = scales.reshape((n_blocks, -1, 1)) >> np.array( + [0, 7, 14, 21], dtype=np.uint32 + ).reshape((1, 1, 4)) + ksigns = np.frombuffer(IQ2_XXS.ksigns, dtype=np.uint8).reshape((1, 1, 1, 128)) + signs = (signs & np.uint32(0x7F)).reshape((n_blocks, -1, 4, 1)) + signs = np.take_along_axis(ksigns, signs, axis=-1) + signs = signs.reshape((n_blocks, -1, 4, 1)) >> np.array( + [i for i in range(8)], dtype=np.uint8 + ).reshape((1, 1, 1, 8)) + signs = signs & np.uint8(0x01) + signs = np.where(signs == 0, np.float32(1), np.float32(-1)) + signs = signs.reshape((n_blocks, -1, 4, 8)) + + assert cls.grid is not None + grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2) + grid = grid.reshape((n_blocks, -1, 4, 8)) + + return (db * grid * signs).reshape((n_blocks, -1)) + + +class IQ3_S(__Quant, qtype=GGMLQuantizationType.IQ3_S): + grid_shape = (512, 4) + grid_map = (0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F) + grid_hex = ( + b"0000010002000500070010001100120014001600200021002500330040004200" + b"4500470051005300600062007100740077000001010102010401100111011501" + b"2001230127013101350144016101650172010002010205020702100213021602" + b"2102250230023402420245024702510253027002730203031103150320032203" + b"3103330336034403500352036703710375030004130417042104240432044004" + b"4304510470040205040520052205260533054105450547056605730506061106" + b"1306310652067106000702070407200722072607330750075407001001100210" + b"0410101011101310151017102010221031103410361054105610611072100011" + b"0111031106111011141121113011331141115011521170117611001212121512" + b"1712201224123212401243125512601272120113041307131013131321132713" + b"3013341341136213701303140514121414143114331442144614501454140115" + b"1015131521153015321551152016241627164416461601170317101712172117" + b"3517411762177017002001200320052007201020122014201620212023202720" + b"3020322041204320452050205220672070207320752000210221102113211721" + b"2221252131213421422151210122042207222122232230223722412253225722" + b"7122742200230223052311232223242331233323422350236623012407242024" + b"2324322435244124722475240425112522253725402553257025002602260726" + b"2126552661260527112726273027432750270230113013301530173022303130" + b"3330353042304430473051306330713001310331053114312131233140316031" + 
b"7231763100321232203232323432503201331033143321332333273330334133" + b"4333473355337333033411341634223431345234603464340135103512352535" + b"3235443556357335163641360137033720372237353700400440124020402440" + b"2740324041405040704002410741114113412241304135414341514155410142" + b"0342104215422142334240425742624270420443114313432043224331433543" + b"0044024424443744404471440545074521456245134634466046104715473047" + b"4347514702501050145022504050445047505250665074500151035105511251" + b"2151325172510052115223523052365253520253075310532753445351536553" + b"7353015404542054325446541255265551555355425602570457225711601360" + b"1560316033606060006120612761646112623462426255626262706200631463" + b"2163406325644364626400650365346560650566406611671367007004700770" + b"2070227036704070547062700271117124714371457101720472107216722172" + b"3072517202733273357353730174057413742074507422754275027631760077" + ) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + qs, rest = np.hsplit(rest, [QK_K // 4]) + qh, rest = np.hsplit(rest, [QK_K // 32]) + signs, scales = np.hsplit(rest, [QK_K // 8]) + + d = d.view(np.float16).astype(np.float32) + + scales = scales.reshape((n_blocks, -1, 1)) >> np.array( + [0, 4], dtype=np.uint8 + ).reshape((1, 1, 2)) + scales = (scales & 0x0F).reshape((n_blocks, -1)) + db = d * (1 + 2 * scales) + db = db.reshape((n_blocks, -1, 1, 1)) + + # unpack the sign bits + signs = signs.reshape((n_blocks, -1, 1)) >> np.array( + [i for i in range(8)], dtype=np.uint8 + ).reshape((1, 1, 8)) + signs = signs & np.uint8(0x01) + signs = np.where(signs == 0, np.float32(1), np.float32(-1)) + signs = signs.reshape((n_blocks, -1, 4, 8)) + + qh = qh.reshape((n_blocks, -1, 1)) >> np.array( + [i for i in range(8)], dtype=np.uint8 + ) + qh = (qh & 0x01).astype(np.uint16).reshape((n_blocks, -1)) + qs = qs.astype(np.uint16) | (qh << 8) + + assert cls.grid is not None + grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2) + grid = grid.reshape((n_blocks, -1, 4, 8)) + + return (db * grid * signs).reshape((n_blocks, -1)) + + +class IQ1_S(__Quant, qtype=GGMLQuantizationType.IQ1_S): + # iq1s_grid, with each byte packed into 2 bits + # -1, 0, 1 <=> 0, 1, 2 + grid_shape = (2048, 8) + grid_map = (-1, 0, 1) + grid_hex = ( + b"00000200050008000a00110015002000220028002a0045005100540056006500" + b"8000820088008a009500a000a200a800aa000401050111011401160119011a01" + b"2501410146014901520155015a0161016401660168018501910194019601a501" + b"0002020208020a0215022002220228022a024502510259026402690280028202" + b"88028a02910295029902a002a202a802aa021104140416042504410449045504" + b"5a046404650491049904a5040105040505050605150518051a05290540054505" + b"4a0550055105540555055605590560056205650568056a058105910595059805" + b"9a05a105a405a505a605a9051406190641064406500652065506580660066106" + b"6606690685069106940699060008020808080a0815082008220828082a084508" + b"5108560865088008820888088a089508a008a208a808aa080509110914091909" + b"2409250941095009510955096109640969099109940996099909a509000a020a" + b"080a0a0a150a200a220a280a2a0a450a510a590a610a650a800a820a850a880a" + b"8a0a950aa00aa20aa80aaa0a1010111014101910241025104110441050105510" + b"58106110641065106910911094109610a110a510011104110611091110111211" + b"1511181121112411291145114a11501151115211541155115611591160116511" + b"841192119511a111a41111121412161225124012461249125212551258125a12" + b"641266128512911294129612a512011406140914141415141814191421142614" + 
b"41144514461448144a1451145414551456145914621465146814841489149014" + b"94149514981499149a14a114a414a514a914021505150a151115141515151615" + b"191520152215251528152a154115441545154615511552155415551556155915" + b"5a1561156415651566156915801582158415851588158a159015911594159515" + b"961599159a15a015a215a51501160416051606161516161618161a1621162616" + b"401642164416451648164a165116551656165816591661166416651668166916" + b"6a1686168a1692169516a416a916111816182518411844184618491850185518" + b"58185a1860186118641866186918851891189418a5181019121915191a192119" + b"25194219441945194819511954195519561959195a19601965196a1989199119" + b"921995199819a119a619a919091a161a241a261a441a461a491a501a521a551a" + b"581a611a661a691a851a911a961a9a1a0020022008200a201520202022202520" + b"28202a20452051205920612065208020822088208a209520a020a220a520a820" + b"aa2005211121142119212521422144214921552158215a216121642165216621" + b"8521902196219921a521012208220a22112215222022222228222a2245225122" + b"562259226522812288228a2291229522a022a222a822aa220524142416241924" + b"252444244524462449245224552458245a2466248524912494249924a124a524" + b"0925152521252925402545254825512554255525592562256525682589259025" + b"9425952598259a25a125a425a625a92505261026122619262526412649265526" + b"6026612669268426862690269a260028022808280a2815282028222828282a28" + b"45285128542865288028822888288a28a028a228a828aa280929112914291929" + b"2529462949295229552961296429662969298529902996299929a429a529002a" + b"022a082a0a2a202a222a282a2a2a452a512a562a592a652a802a822a882a8a2a" + b"952aa02aa22aa82aaa2a054011401640254049405240554058405a4061406440" + b"664094409940a140a6400041014104410641094112411541164118411a412141" + b"26412941454148414a41514154415541564159415a41654168416a4181418441" + b"8641904192419541a041a141a241054211421442164225424142524255425a42" + b"6442694289429442a5420144154419442944454448444a445144544455445644" + b"61446244654468446a44814486448944904492449544a044a144a94401450245" + b"05450a4511451445154516451945204525452a45414544454545464549455045" + b"5145544555455645584559456145644565456645694582458445854588459145" + b"94459545964599459a45a545a845aa450146054609461446154618461a462146" + b"2446294640464246454648465046514652465546564659466246654668468146" + b"85468a4694469546a146a446a6460548114815481a4825484248494850485548" + b"5848614864486648694885489148944896489948a5480149054906490a491049" + b"144915491849214924492649404945494a495149524954495549564959496049" + b"6249654966496a49864989499249954996499849a149a449a649a949164a444a" + b"464a494a554a584a5a4a644a694a944aa54a0150045005500650095012501550" + b"1a50215024502950405045504850515054505550565059506550685086508950" + b"95509850a050a150a650a9500551085109510a51115114511551165118511951" + b"20512551265128512a5141514451455146514951505151515251545155515651" + b"585159515a51615164516551665169518251855191519451955196519951a051" + b"a551aa5101520652125215521a5221522452425245524a525152545255525652" + b"595262526552855290529252955299529a52a452045405541154145415541654" + b"185419542154255428542a54415444544554465449544a545054515454545554" + b"5654585459545a54615462546454655466546954805488548a54915494549554" + b"96549954a154a454a554aa540155025504550555065509551055115512551455" + b"1555165519551a55215524552555265529554055415542554455455546554855" + b"4955505551555255545555555655585559555a55605561556455655566556855" + b"69556a5581558455855589558a559055915594559555965598559955a155a455" + b"a555a655a9550056015602560456065608560956115614561556185619562056" + 
b"2156225624562556265628562956415645564656485649564a56505651565256" + b"545655565656585659565a566156645665566956825685568656885689568a56" + b"915695569a56a256a556a656a856a95604580558065809581058155818582158" + b"2a58455848584a58515854585558565858585958605862586458655882588958" + b"9058925895589858a158a9580159025905590a59115914591559165919592559" + b"41594459455946594959505951595259545955595659585959595a5961596459" + b"655966596959815985598959915994599559965998599959a559045a085a155a" + b"1a5a205a255a265a295a455a485a495a515a555a565a585a595a625a655a685a" + b"6a5a815a8a5a925a955a965a985a9a5aa15a0560146016601960256044605060" + b"5560566058605a60616064606660696081609660a56001610461066109611261" + b"15612161226126612961456149615161556156615961656166616a6184618a61" + b"92619561a161a661a96111621662196240624162466255625662586260628562" + b"91629662a56211641264156416641a6421642664296440644264456448644a64" + b"516454645564566459645a646064626465648464856489649064926494649564" + b"966498649a64a164a464a964056508650a651165156516651965446545654665" + b"496550655165546555655665596561656465656566656965866589658a659165" + b"9565966599659a65a265a565a665a86502660966156620662666286629664066" + b"456648664a66516654665566566658665a666066656668668066826685668a66" + b"9466966698669966a066a466a666aa661668196825684168526855685a686168" + b"6968856891689868a66801690469106915692169246926692969406941694569" + b"4669486951695469556956695969606965696a69826984698a699569a169a469" + b"a569a969116a166a186a416a446a496a506a556a586a5a6a646a656a696a866a" + b"946a986a9a6aa66a0080028008800a802080228028802a804580508051805480" + b"5680598065808080828088808a809580a080a280a880aa800581118114811681" + b"1981258141814481498150815281558156815881598164816681698185818981" + b"948196819981a5810082028208820a8215822082228228822a82518254825982" + b"65828082828288828a829582a082a282a882aa82148419844184448451845584" + b"5a846184648469849484998401850985128515851a8526852985408541854585" + b"4885518554855585568559855a856585668568856a8581858485868589859085" + b"928595859885a68511861686198625864186448649864a865086558659865a86" + b"618666866a86858691869a86a4860088028808880a8815882088228828882a88" + b"41884588518854885988658869888088828888888a889588a088a288a888aa88" + b"05890689118914891689258941894489468949895089528955895a8961896489" + b"858996899989a589008a028a088a0a8a158a208a228a288a2a8a458a518a548a" + b"568a808a828a888a8a8a958aa08aa28aa88aaa8a059011901690189019902590" + b"419046904990559058905a9069906a9085909190949096909990a59001910491" + b"069109911091159118911a912191249126912991409145915091519154915591" + b"569159916291659184918691929195919891a191a491a691a991059211921492" + b"19922592449246924992509252925592589266926992859294929692a9920194" + b"04940694109415941894269440944a9451945494559456945894599460946194" + b"62946594849486949294949495949894a194a9940095059508950a9510951195" + b"14951595169519952195259529952a9541954495459546954995509551955295" + b"549555955695589559955a956195649565956695699581958595889591959295" + b"94959595969599959a95a095a295a595a895aa95019604961096159619962096" + b"2696299645964896499651965296559656965996659668968296849689968a96" + b"929694969596a496a696a9960598169819982598419846985098529855985698" + b"5a98649865988598919896989998a59804990699099910991299159918991a99" + b"209921992499269940994299459948994a995199549955995699599962996599" + b"66996a99819984999099929995999a99a199a699059a159a259a449a469a499a" + b"509a559a589a619a859a919a949a959a969a00a002a008a00aa015a020a022a0" + 
b"28a02aa045a051a054a056a059a080a082a088a08aa095a0a0a0a2a0a8a0aaa0" + b"05a109a111a114a116a119a11aa146a149a151a155a158a15aa161a164a185a1" + b"90a192a196a199a102a208a20aa210a219a222a228a22aa245a251a256a259a2" + b"65a280a282a288a28aa295a2a0a2a2a2a8a2aaa219a425a441a444a450a454a4" + b"55a458a45aa461a465a466a468a469a485a406a509a510a512a515a518a526a5" + b"29a542a545a551a554a555a556a559a565a56aa581a584a585a586a589a592a5" + b"95a598a505a611a616a61aa621a625a644a646a64aa652a655a656a658a660a6" + b"62a686a690a695a696a699a6a1a6a4a6a6a600a802a808a80aa820a822a828a8" + b"2aa851a854a856a859a880a882a888a88aa895a8a0a8a2a8a8a8aaa805a914a9" + b"19a921a925a941a950a955a95aa961a966a969a990a996a900aa02aa08aa0aaa" + b"20aa22aa28aa2aaa51aa54aa56aa80aa82aa88aa8aaa95aaa0aaa2aaa8aaaaaa" + ) + + delta = np.float32(0.125) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + qs, qh = np.hsplit(rest, [QK_K // 8]) + + d = d.view(np.float16).astype(np.float32) + qh = qh.view(np.uint16) + + dl = d * (2 * ((qh >> 12) & 7) + 1) + dl = dl.reshape((n_blocks, -1, 1, 1)) + delta = np.where((qh & np.uint16(0x8000)) == 0, cls.delta, -cls.delta) + delta = delta.reshape((n_blocks, -1, 1, 1)) + + qh = qh.reshape((n_blocks, -1, 1)) >> np.array( + [0, 3, 6, 9], dtype=np.uint16 + ).reshape((1, 1, 4)) + qs = qs.astype(np.uint16) | ((qh & 7) << 8).reshape((n_blocks, -1)) + + assert cls.grid is not None + grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2) + grid = grid.reshape((n_blocks, -1, 4, 8)) + + return (dl * (grid + delta)).reshape((n_blocks, -1)) + + +class IQ1_M(__Quant, qtype=GGMLQuantizationType.IQ1_M): + grid_shape = IQ1_S.grid_shape + grid_map = IQ1_S.grid_map + grid_hex = IQ1_S.grid_hex + + delta = IQ1_S.delta + + # Okay *this* type is weird. It's the only one which stores the f16 scales in multiple parts. 
+ @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + qs, rest = np.hsplit(blocks, [QK_K // 8]) + qh, scales = np.hsplit(rest, [QK_K // 16]) + + # The f16 scale is packed across multiple bytes + scales = scales.view(np.uint16) + d = (scales.reshape((n_blocks, 4)) & np.uint16(0xF000)) >> np.array( + [12, 8, 4, 0], dtype=np.uint16 + ).reshape((1, 4)) + d = d[..., 0] | d[..., 1] | d[..., 2] | d[..., 3] + d = d.view(np.float16).astype(np.float32).reshape((n_blocks, 1)) + + scales = scales.reshape(n_blocks, -1, 1) >> np.array( + [0, 3, 6, 9], dtype=np.uint16 + ).reshape((1, 1, 4)) + scales = (scales & 0x07).reshape((n_blocks, -1)) + dl = d * (2 * scales + 1) + dl = dl.reshape((n_blocks, -1, 2, 1, 1)) + + qh = qh.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape( + (1, 1, 2) + ) + qs = qs.astype(np.uint16) | ((qh & 0x07).astype(np.uint16) << 8).reshape( + (n_blocks, -1) + ) + + delta = np.where(qh & 0x08 == 0, cls.delta, -cls.delta) + delta = delta.reshape((n_blocks, -1, 2, 2, 1)) + + assert cls.grid is not None + grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2) + grid = grid.reshape((n_blocks, -1, 2, 2, 8)) + + return (dl * (grid + delta)).reshape((n_blocks, -1)) + + +class IQ4_NL(__Quant, qtype=GGMLQuantizationType.IQ4_NL): + kvalues = (-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, qs = np.hsplit(blocks, [2]) + + d = d.view(np.float16).astype(np.float32) + + qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array( + [0, 4], dtype=np.uint8 + ).reshape((1, 1, 2, 1)) + + qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1, 1)) + + kvalues = np.array(cls.kvalues, dtype=np.int8).reshape(1, 1, 16) + qs = ( + np.take_along_axis(kvalues, qs, axis=-1) + .astype(np.float32) + .reshape((n_blocks, -1)) + ) + + return d * qs + + +class IQ4_XS(__Quant, qtype=GGMLQuantizationType.IQ4_XS): + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + scales_h, rest = np.hsplit(rest, [2]) + scales_l, qs = np.hsplit(rest, [QK_K // 64]) + + d = d.view(np.float16).astype(np.float32) + scales_h = scales_h.view(np.uint16) + + scales_l = scales_l.reshape((n_blocks, -1, 1)) >> np.array( + [0, 4], dtype=np.uint8 + ).reshape((1, 1, 2)) + scales_h = scales_h.reshape((n_blocks, 1, -1)) >> np.array( + [2 * i for i in range(QK_K // 32)], dtype=np.uint16 + ).reshape((1, -1, 1)) + scales_l = scales_l.reshape((n_blocks, -1)) & np.uint8(0x0F) + scales_h = scales_h.reshape((n_blocks, -1)).astype(np.uint8) & np.uint8(0x03) + + scales = (scales_l | (scales_h << np.uint8(4))).astype(np.int8) - np.int8(32) + dl = (d * scales.astype(np.float32)).reshape((n_blocks, -1, 1)) + + qs = qs.reshape((n_blocks, -1, 1, 16)) >> np.array( + [0, 4], dtype=np.uint8 + ).reshape((1, 1, 2, 1)) + qs = qs.reshape((n_blocks, -1, 32, 1)) & np.uint8(0x0F) + + kvalues = np.array(IQ4_NL.kvalues, dtype=np.int8).reshape((1, 1, 1, -1)) + qs = ( + np.take_along_axis(kvalues, qs, axis=-1) + .astype(np.float32) + .reshape((n_blocks, -1, 32)) + ) + + return (dl * qs).reshape((n_blocks, -1)) diff --git a/src/gguf/tensor_mapping.py b/src/gguf/tensor_mapping.py index 3161173..3de2a6f 100644 --- a/src/gguf/tensor_mapping.py +++ b/src/gguf/tensor_mapping.py @@ -7,463 +7,574 @@ class TensorNameMap: mappings_cfg: 
dict[MODEL_TENSOR, tuple[str, ...]] = { + # Token embeddings MODEL_TENSOR.TOKEN_EMBD: ( - "gpt_neox.embed_in", - "transformer.wte", - "transformer.word_embeddings", - "word_embeddings", - "model.embed_tokens", - "tok_embeddings", - "embeddings.word_embeddings", - "language_model.embedding.word_embeddings", - "wte", - "transformer.embd.wte", - "model.tok_embeddings", - "model.embedding", - "backbone.embedding", - "backbone.embeddings", - "transformer.in_out_embed", - "embedding.word_embeddings", - "transformer.token_embeddings", - "shared", - "rwkv.embeddings", - ), - MODEL_TENSOR.TOKEN_TYPES: ("embeddings.token_type_embeddings",), + "gpt_neox.embed_in", # gptneox + "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone + "transformer.word_embeddings", # falcon + "word_embeddings", # bloom + "model.embed_tokens", # llama-hf nemotron olmoe olmo2 + "tok_embeddings", # llama-pth + "embeddings.word_embeddings", # bert nomic-bert + "language_model.embedding.word_embeddings", # persimmon + "wte", # gpt2 + "transformer.embd.wte", # phi2 + "model.tok_embeddings", # internlm2 + "model.embedding", # mamba-qbert + "backbone.embedding", # mamba + "backbone.embeddings", # mamba-hf + "transformer.in_out_embed", # Grok + "embedding.word_embeddings", # chatglm + "transformer.token_embeddings", # openelm + "shared", # t5 + "rwkv.embeddings", # rwkv + ), + # Token type embeddings + MODEL_TENSOR.TOKEN_TYPES: ( + "embeddings.token_type_embeddings", # bert nomic-bert + ), + # Normalization of token embeddings MODEL_TENSOR.TOKEN_EMBD_NORM: ( - "word_embeddings_layernorm", - "embeddings.LayerNorm", - "emb_ln", - "transformer.norm", - "rwkv.blocks.0.pre_ln", - ), + "word_embeddings_layernorm", # bloom + "embeddings.LayerNorm", # bert + "emb_ln", # nomic-bert + "transformer.norm", # openelm + "rwkv.blocks.0.pre_ln", # rwkv + "backbone.norm", # wavtokenizer + ), + # Position embeddings MODEL_TENSOR.POS_EMBD: ( - "transformer.wpe", - "embeddings.position_embeddings", - "wpe", + "transformer.wpe", # gpt2 + "embeddings.position_embeddings", # bert + "wpe", # gpt2 ), + # Output MODEL_TENSOR.OUTPUT: ( - "embed_out", - "lm_head", - "output", - "word_embeddings_for_head", - "lm_head.linear", - "output_layer", - "head", - ), + "embed_out", # gptneox + "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2 + "output", # llama-pth bloom internlm2 + "word_embeddings_for_head", # persimmon + "lm_head.linear", # phi2 + "output_layer", # chatglm + "head", # rwkv + "head.out", # wavtokenizer + ), + # Output norm MODEL_TENSOR.OUTPUT_NORM: ( - "gpt_neox.final_layer_norm", - "transformer.ln_f", - "model.norm", - "norm", - "transformer.norm_f", - "ln_f", - "language_model.encoder.final_layernorm", - "model.final_layernorm", - "lm_head.ln", - "model.norm_f", - "backbone.norm_f", - "transformer.rms_norm", - "encoder.final_layernorm", - "transformer.norm", - "model.norm", - "rwkv.ln_out", - ), + "gpt_neox.final_layer_norm", # gptneox + "transformer.ln_f", # gpt2 gpt-j falcon jais exaone + "model.norm", # llama-hf baichuan internlm2 olmoe olmo2 + "norm", # llama-pth + "transformer.norm_f", # mpt dbrx + "ln_f", # refact bloom qwen gpt2 + "language_model.encoder.final_layernorm", # persimmon + "model.final_layernorm", # persimmon + "lm_head.ln", # phi2 + "model.norm_f", # mamba-qbert + "backbone.norm_f", # mamba + "transformer.rms_norm", # Grok + "encoder.final_layernorm", # chatglm + "transformer.norm", # openelm + "model.norm", # nemotron + "rwkv.ln_out", # rwkv + 
"backbone.final_layer_norm", # wavtokenizer + ), + # Rope frequencies MODEL_TENSOR.ROPE_FREQS: ( - "rope.freqs", - "rotary_pos_emb.inv_freq", + "rope.freqs", # llama-pth + "rotary_pos_emb.inv_freq", # chatglm ), + MODEL_TENSOR.ROPE_FACTORS_LONG: (), + MODEL_TENSOR.ROPE_FACTORS_SHORT: (), + MODEL_TENSOR.CONV1D: ("backbone.embed",), # roberta } block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { + # Attention norm MODEL_TENSOR.ATTN_NORM: ( - "gpt_neox.layers.{bid}.input_layernorm", - "transformer.h.{bid}.ln_1", - "transformer.blocks.{bid}.norm_1", - "transformer.h.{bid}.input_layernorm", - "h.{bid}.input_layernorm", - "transformer.h.{bid}.ln_mlp", - "model.layers.{bid}.input_layernorm", - "layers.{bid}.attention_norm", - "language_model.encoder.layers.{bid}.input_layernorm", - "model.layers.{bid}.ln1", - "h.{bid}.ln_1", - "transformer.h.{bid}.ln", - "model.layers.layers.{bid}.norm", - "model.layers.{bid}.attention_norm", - "model.layers.{bid}.norm", - "backbone.layers.{bid}.norm", - "transformer.decoder_layer.{bid}.rms_norm", - "transformer.blocks.{bid}.norm_attn_norm.norm_1", - "encoder.layers.{bid}.input_layernorm", - "transformer.layers.{bid}.attn_norm", - "rwkv.blocks.{bid}.ln1", - ), + "gpt_neox.layers.{bid}.input_layernorm", # gptneox + "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen jais exaone + "transformer.blocks.{bid}.norm_1", # mpt + "transformer.h.{bid}.input_layernorm", # falcon7b + "h.{bid}.input_layernorm", # bloom + "transformer.h.{bid}.ln_mlp", # falcon40b + "model.layers.{bid}.input_layernorm", # llama-hf nemotron olmoe + "layers.{bid}.attention_norm", # llama-pth + "language_model.encoder.layers.{bid}.input_layernorm", # persimmon + "model.layers.{bid}.ln1", # yi + "h.{bid}.ln_1", # gpt2 + "transformer.h.{bid}.ln", # phi2 + "model.layers.layers.{bid}.norm", # plamo + "model.layers.{bid}.attention_norm", # internlm2 + "model.layers.{bid}.norm", # mamba-qbert + "backbone.layers.{bid}.norm", # mamba + "transformer.decoder_layer.{bid}.rms_norm", # Grok + "transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx + "encoder.layers.{bid}.input_layernorm", # chatglm + "transformer.layers.{bid}.attn_norm", # openelm + "rwkv.blocks.{bid}.ln1", # rwkv + ), + # Attention norm 2 MODEL_TENSOR.ATTN_NORM_2: ( - "transformer.h.{bid}.ln_attn", - "encoder.layer.{bid}.layer_norm_1", - "rwkv.blocks.{bid}.ln2", + "transformer.h.{bid}.ln_attn", # falcon40b + "encoder.layer.{bid}.layer_norm_1", # jina-v2-code + "rwkv.blocks.{bid}.ln2", # rwkv ), + # Attention query-key-value MODEL_TENSOR.ATTN_QKV: ( - "gpt_neox.layers.{bid}.attention.query_key_value", - "transformer.h.{bid}.attn.c_attn", - "transformer.blocks.{bid}.attn.Wqkv", - "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv", - "transformer.h.{bid}.self_attention.query_key_value", - "h.{bid}.self_attention.query_key_value", - "language_model.encoder.layers.{bid}.self_attention.query_key_value", - "model.layers.{bid}.self_attn.query_key_value", - "h.{bid}.attn.c_attn", - "transformer.h.{bid}.mixer.Wqkv", - "encoder.layers.{bid}.attn.Wqkv", - "model.layers.{bid}.self_attn.qkv_proj", - "encoder.layers.{bid}.self_attention.query_key_value", - "transformer.layers.{bid}.attn.qkv_proj", - ), + "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox + "transformer.h.{bid}.attn.c_attn", # gpt2 qwen jais + "transformer.blocks.{bid}.attn.Wqkv", # mpt + "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv", # dbrx + "transformer.h.{bid}.self_attention.query_key_value", # falcon + "h.{bid}.self_attention.query_key_value", # bloom + 
"language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon + "model.layers.{bid}.self_attn.query_key_value", # persimmon + "h.{bid}.attn.c_attn", # gpt2 + "transformer.h.{bid}.mixer.Wqkv", # phi2 + "encoder.layers.{bid}.attn.Wqkv", # nomic-bert + "model.layers.{bid}.self_attn.qkv_proj", # phi3 + "encoder.layers.{bid}.self_attention.query_key_value", # chatglm + "transformer.layers.{bid}.attn.qkv_proj", # openelm + ), + # Attention query MODEL_TENSOR.ATTN_Q: ( - "model.layers.{bid}.self_attn.q_proj", - "layers.{bid}.attention.wq", - "encoder.layer.{bid}.attention.self.query", - "transformer.h.{bid}.attn.q_proj", - "model.layers.layers.{bid}.self_attn.q_proj", - "model.layers.{bid}.attention.wq", - "transformer.decoder_layer.{bid}.multi_head_attention.query", - "transformer.h.{bid}.attn.attention.q_proj", - ), + "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo2 + "model.layers.{bid}.self_attn.q_proj_no_perm", # llama-custom + "layers.{bid}.attention.wq", # llama-pth + "encoder.layer.{bid}.attention.self.query", # bert + "transformer.h.{bid}.attn.q_proj", # gpt-j + "model.layers.layers.{bid}.self_attn.q_proj", # plamo + "model.layers.{bid}.attention.wq", # internlm2 + "transformer.decoder_layer.{bid}.multi_head_attention.query", # Grok + "transformer.h.{bid}.attn.attention.q_proj", # exaone + ), + # Attention key MODEL_TENSOR.ATTN_K: ( - "model.layers.{bid}.self_attn.k_proj", - "layers.{bid}.attention.wk", - "encoder.layer.{bid}.attention.self.key", - "transformer.h.{bid}.attn.k_proj", - "transformer.h.{bid}.attn.k", - "model.layers.layers.{bid}.self_attn.k_proj", - "model.layers.{bid}.attention.wk", - "transformer.decoder_layer.{bid}.multi_head_attention.key", - "transformer.h.{bid}.attn.attention.k_proj", - ), + "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo2 + "model.layers.{bid}.self_attn.k_proj_no_perm", # llama-custom + "layers.{bid}.attention.wk", # llama-pth + "encoder.layer.{bid}.attention.self.key", # bert + "transformer.h.{bid}.attn.k_proj", # gpt-j + "transformer.h.{bid}.attn.k", # refact + "model.layers.layers.{bid}.self_attn.k_proj", # plamo + "model.layers.{bid}.attention.wk", # internlm2 + "transformer.decoder_layer.{bid}.multi_head_attention.key", # Grok + "transformer.h.{bid}.attn.attention.k_proj", # exaone + ), + # Attention value MODEL_TENSOR.ATTN_V: ( - "model.layers.{bid}.self_attn.v_proj", - "layers.{bid}.attention.wv", - "encoder.layer.{bid}.attention.self.value", - "transformer.h.{bid}.attn.v_proj", - "transformer.h.{bid}.attn.v", - "model.layers.layers.{bid}.self_attn.v_proj", - "model.layers.{bid}.attention.wv", - "transformer.decoder_layer.{bid}.multi_head_attention.value", - "transformer.h.{bid}.attn.attention.v_proj", - ), + "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo2 + "layers.{bid}.attention.wv", # llama-pth + "encoder.layer.{bid}.attention.self.value", # bert + "transformer.h.{bid}.attn.v_proj", # gpt-j + "transformer.h.{bid}.attn.v", # refact + "model.layers.layers.{bid}.self_attn.v_proj", # plamo + "model.layers.{bid}.attention.wv", # internlm2 + "transformer.decoder_layer.{bid}.multi_head_attention.value", # Grok + "transformer.h.{bid}.attn.attention.v_proj", # exaone + ), + # Attention output MODEL_TENSOR.ATTN_OUT: ( - "gpt_neox.layers.{bid}.attention.dense", - "transformer.h.{bid}.attn.c_proj", - "transformer.blocks.{bid}.attn.out_proj", - "transformer.h.{bid}.self_attention.dense", - "h.{bid}.self_attention.dense", - "model.layers.{bid}.self_attn.o_proj", - 
"layers.{bid}.attention.wo", - "encoder.layer.{bid}.attention.output.dense", - "transformer.h.{bid}.attn.out_proj", - "language_model.encoder.layers.{bid}.self_attention.dense", - "model.layers.{bid}.self_attn.dense", - "h.{bid}.attn.c_proj", - "transformer.h.{bid}.mixer.out_proj", - "model.layers.layers.{bid}.self_attn.o_proj", - "model.layers.{bid}.attention.wo", - "encoder.layers.{bid}.attn.out_proj", - "transformer.decoder_layer.{bid}.multi_head_attention.linear", - "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", - "encoder.layers.{bid}.self_attention.dense", - "transformer.layers.{bid}.attn.out_proj", - "transformer.h.{bid}.attn.attention.out_proj", - ), + "gpt_neox.layers.{bid}.attention.dense", # gptneox + "transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen jais + "transformer.blocks.{bid}.attn.out_proj", # mpt + "transformer.h.{bid}.self_attention.dense", # falcon + "h.{bid}.self_attention.dense", # bloom + "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2 + "model.layers.{bid}.self_attn.linear_attn", # deci + "layers.{bid}.attention.wo", # llama-pth + "encoder.layer.{bid}.attention.output.dense", # bert + "transformer.h.{bid}.attn.out_proj", # gpt-j + "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon + "model.layers.{bid}.self_attn.dense", # persimmon + "h.{bid}.attn.c_proj", # gpt2 + "transformer.h.{bid}.mixer.out_proj", # phi2 + "model.layers.layers.{bid}.self_attn.o_proj", # plamo + "model.layers.{bid}.attention.wo", # internlm2 + "encoder.layers.{bid}.attn.out_proj", # nomic-bert + "transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok + "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx + "encoder.layers.{bid}.self_attention.dense", # chatglm + "transformer.layers.{bid}.attn.out_proj", # openelm + "transformer.h.{bid}.attn.attention.out_proj", # exaone + ), + # Attention output norm MODEL_TENSOR.ATTN_OUT_NORM: ( - "encoder.layer.{bid}.attention.output.LayerNorm", - "encoder.layers.{bid}.norm1", - "transformer.decoder_layer.{bid}.rms_norm_1", - "transformer.blocks.{bid}.norm_attn_norm.norm_2", + "encoder.layer.{bid}.attention.output.LayerNorm", # bert + "encoder.layers.{bid}.norm1", # nomic-bert + "transformer.decoder_layer.{bid}.rms_norm_1", # Grok + "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx ), - MODEL_TENSOR.ATTN_POST_NORM: ("model.layers.{bid}.post_attention_layernorm",), + MODEL_TENSOR.ATTN_POST_NORM: ( + "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2 + ), + # Rotary embeddings MODEL_TENSOR.ATTN_ROT_EMBD: ( - "model.layers.{bid}.self_attn.rotary_emb.inv_freq", - "layers.{bid}.attention.inner_attention.rope.freqs", - "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", - "transformer.h.{bid}.attn.rotary_emb.inv_freq", + "model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf + "layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth + "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo + "transformer.h.{bid}.attn.rotary_emb.inv_freq", # codeshell ), + # Feed-forward norm MODEL_TENSOR.FFN_NORM: ( - "gpt_neox.layers.{bid}.post_attention_layernorm", - "transformer.h.{bid}.ln_2", - "h.{bid}.post_attention_layernorm", - "transformer.blocks.{bid}.norm_2", - "model.layers.{bid}.post_attention_layernorm", - "layers.{bid}.ffn_norm", - "language_model.encoder.layers.{bid}.post_attention_layernorm", - "model.layers.{bid}.ln2", - "h.{bid}.ln_2", - "model.layers.{bid}.ffn_norm", - "transformer.decoder_layer.{bid}.rms_norm_2", - 
"encoder.layers.{bid}.post_attention_layernorm", - "transformer.layers.{bid}.ffn_norm", - ), - MODEL_TENSOR.FFN_PRE_NORM: ("model.layers.{bid}.pre_feedforward_layernorm",), - MODEL_TENSOR.FFN_POST_NORM: ("model.layers.{bid}.post_feedforward_layernorm",), + "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox + "transformer.h.{bid}.ln_2", # gpt2 refact qwen jais exaone + "h.{bid}.post_attention_layernorm", # bloom + "transformer.blocks.{bid}.norm_2", # mpt + "model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron olmoe + "layers.{bid}.ffn_norm", # llama-pth + "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon + "model.layers.{bid}.ln2", # yi + "h.{bid}.ln_2", # gpt2 + "model.layers.{bid}.ffn_norm", # internlm2 + "transformer.decoder_layer.{bid}.rms_norm_2", # Grok + "encoder.layers.{bid}.post_attention_layernorm", # chatglm + "transformer.layers.{bid}.ffn_norm", # openelm + ), + # Post feed-forward norm + MODEL_TENSOR.FFN_PRE_NORM: ( + "model.layers.{bid}.pre_feedforward_layernorm", # gemma2 + ), + # Post feed-forward norm + MODEL_TENSOR.FFN_POST_NORM: ( + "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2 + ), MODEL_TENSOR.FFN_GATE_INP: ( - "layers.{bid}.feed_forward.gate", - "model.layers.{bid}.block_sparse_moe.gate", - "model.layers.{bid}.mlp.gate", - "transformer.decoder_layer.{bid}.router", - "transformer.blocks.{bid}.ffn.router.layer", + "layers.{bid}.feed_forward.gate", # mixtral + "model.layers.{bid}.block_sparse_moe.gate", # mixtral + "model.layers.{bid}.mlp.gate", # qwen2moe olmoe + "transformer.decoder_layer.{bid}.router", # Grok + "transformer.blocks.{bid}.ffn.router.layer", # dbrx + "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe ), - MODEL_TENSOR.FFN_GATE_INP_SHEXP: ("model.layers.{bid}.mlp.shared_expert_gate",), + MODEL_TENSOR.FFN_GATE_INP_SHEXP: ( + "model.layers.{bid}.mlp.shared_expert_gate", # qwen2moe + ), + MODEL_TENSOR.FFN_EXP_PROBS_B: ( + "model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3 + ), + # Feed-forward up MODEL_TENSOR.FFN_UP: ( - "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", - "transformer.h.{bid}.mlp.c_fc", - "transformer.blocks.{bid}.ffn.up_proj", - "transformer.h.{bid}.mlp.dense_h_to_4h", - "h.{bid}.mlp.dense_h_to_4h", - "model.layers.{bid}.mlp.up_proj", - "layers.{bid}.feed_forward.w3", - "encoder.layer.{bid}.intermediate.dense", - "transformer.h.{bid}.mlp.fc_in", - "transformer.h.{bid}.mlp.linear_3", - "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", - "model.layers.{bid}.mlp.dense_h_to_4h", - "transformer.h.{bid}.mlp.w1", - "h.{bid}.mlp.c_fc", - "transformer.h.{bid}.mlp.fc1", - "model.layers.{bid}.mlp.fc1", - "model.layers.{bid}.mlp.gate_up_proj", - "model.layers.layers.{bid}.mlp.up_proj", - "model.layers.{bid}.feed_forward.w3", - "encoder.layers.{bid}.mlp.fc11", - "model.layers.{bid}.mlp.c_fc", - "encoder.layer.{bid}.mlp.gated_layers_v", - "model.layers.{bid}.residual_mlp.w3", - "encoder.layers.{bid}.mlp.dense_h_to_4h", - "transformer.h.{bid}.mlp.c_fc_1", + "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox + "transformer.h.{bid}.mlp.c_fc", # gpt2 jais + "transformer.blocks.{bid}.ffn.up_proj", # mpt + "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon + "h.{bid}.mlp.dense_h_to_4h", # bloom + "model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo2 + "layers.{bid}.feed_forward.w3", # llama-pth + "encoder.layer.{bid}.intermediate.dense", # bert + "transformer.h.{bid}.mlp.fc_in", # gpt-j + "transformer.h.{bid}.mlp.linear_3", # refact + 
"language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon + "model.layers.{bid}.mlp.dense_h_to_4h", # persimmon + "transformer.h.{bid}.mlp.w1", # qwen + "h.{bid}.mlp.c_fc", # gpt2 + "transformer.h.{bid}.mlp.fc1", # phi2 + "model.layers.{bid}.mlp.fc1", # phi2 + "model.layers.{bid}.mlp.gate_up_proj", # phi3 + "model.layers.layers.{bid}.mlp.up_proj", # plamo + "model.layers.{bid}.feed_forward.w3", # internlm2 + "encoder.layers.{bid}.mlp.fc11", # nomic-bert + "model.layers.{bid}.mlp.c_fc", # starcoder2 + "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2 + "model.layers.{bid}.residual_mlp.w3", # arctic + "encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm + "transformer.h.{bid}.mlp.c_fc_1", # exaone ), MODEL_TENSOR.FFN_UP_EXP: ( - "layers.{bid}.feed_forward.experts.w3", - "transformer.decoder_layer.{bid}.moe.linear_v", - "transformer.blocks.{bid}.ffn.experts.mlp.v1", - "model.layers.{bid}.mlp.experts.up_proj", + "layers.{bid}.feed_forward.experts.w3", # mixtral (merged) + "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged) + "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx + "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) ), MODEL_TENSOR.FFN_UP_SHEXP: ( - "model.layers.{bid}.mlp.shared_expert.up_proj", - "model.layers.{bid}.mlp.shared_experts.up_proj", + "model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe + "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2 ), - MODEL_TENSOR.FFN_ACT: ("transformer.blocks.{bid}.ffn.act",), + # AWQ-activation gate + MODEL_TENSOR.FFN_ACT: ("transformer.blocks.{bid}.ffn.act",), # mpt + # Feed-forward gate MODEL_TENSOR.FFN_GATE: ( - "model.layers.{bid}.mlp.gate_proj", - "layers.{bid}.feed_forward.w1", - "transformer.h.{bid}.mlp.w2", - "transformer.h.{bid}.mlp.c_fc2", - "model.layers.layers.{bid}.mlp.gate_proj", - "model.layers.{bid}.feed_forward.w1", - "encoder.layers.{bid}.mlp.fc12", - "encoder.layer.{bid}.mlp.gated_layers_w", - "transformer.h.{bid}.mlp.linear_1", - "model.layers.{bid}.residual_mlp.w1", - "transformer.h.{bid}.mlp.c_fc_0", + "model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo2 + "layers.{bid}.feed_forward.w1", # llama-pth + "transformer.h.{bid}.mlp.w2", # qwen + "transformer.h.{bid}.mlp.c_fc2", # jais + "model.layers.layers.{bid}.mlp.gate_proj", # plamo + "model.layers.{bid}.feed_forward.w1", # internlm2 + "encoder.layers.{bid}.mlp.fc12", # nomic-bert + "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2 + "transformer.h.{bid}.mlp.linear_1", # refact + "model.layers.{bid}.residual_mlp.w1", # arctic + "transformer.h.{bid}.mlp.c_fc_0", # exaone ), MODEL_TENSOR.FFN_GATE_EXP: ( - "layers.{bid}.feed_forward.experts.w1", - "transformer.decoder_layer.{bid}.moe.linear", - "transformer.blocks.{bid}.ffn.experts.mlp.w1", - "model.layers.{bid}.mlp.experts.gate_proj", + "layers.{bid}.feed_forward.experts.w1", # mixtral (merged) + "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) + "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx + "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged) ), MODEL_TENSOR.FFN_GATE_SHEXP: ( - "model.layers.{bid}.mlp.shared_expert.gate_proj", - "model.layers.{bid}.mlp.shared_experts.gate_proj", + "model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe + "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2 ), + # Feed-forward down MODEL_TENSOR.FFN_DOWN: ( - "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", - "transformer.h.{bid}.mlp.c_proj", - "transformer.blocks.{bid}.ffn.down_proj", - 
"transformer.h.{bid}.mlp.dense_4h_to_h", - "h.{bid}.mlp.dense_4h_to_h", - "model.layers.{bid}.mlp.down_proj", - "layers.{bid}.feed_forward.w2", - "encoder.layer.{bid}.output.dense", - "transformer.h.{bid}.mlp.fc_out", - "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", - "model.layers.{bid}.mlp.dense_4h_to_h", - "h.{bid}.mlp.c_proj", - "transformer.h.{bid}.mlp.fc2", - "model.layers.{bid}.mlp.fc2", - "model.layers.layers.{bid}.mlp.down_proj", - "model.layers.{bid}.feed_forward.w2", - "encoder.layers.{bid}.mlp.fc2", - "model.layers.{bid}.mlp.c_proj", - "encoder.layer.{bid}.mlp.wo", - "transformer.layers.{bid}.ffn.proj_2", - "model.layers.{bid}.residual_mlp.w2", - "encoder.layer.{bid}.mlp.down_layer", - "encoder.layers.{bid}.mlp.dense_4h_to_h", - "model.layers.h.{bid}.mlp.c_proj", + "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox + "transformer.h.{bid}.mlp.c_proj", # gpt2 refact qwen jais + "transformer.blocks.{bid}.ffn.down_proj", # mpt + "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon + "h.{bid}.mlp.dense_4h_to_h", # bloom + "model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo2 + "layers.{bid}.feed_forward.w2", # llama-pth + "encoder.layer.{bid}.output.dense", # bert + "transformer.h.{bid}.mlp.fc_out", # gpt-j + "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon + "model.layers.{bid}.mlp.dense_4h_to_h", # persimmon + "h.{bid}.mlp.c_proj", # gpt2 + "transformer.h.{bid}.mlp.fc2", # phi2 + "model.layers.{bid}.mlp.fc2", # phi2 + "model.layers.layers.{bid}.mlp.down_proj", # plamo + "model.layers.{bid}.feed_forward.w2", # internlm2 + "encoder.layers.{bid}.mlp.fc2", # nomic-bert + "model.layers.{bid}.mlp.c_proj", # starcoder2 + "encoder.layer.{bid}.mlp.wo", # jina-bert-v2 + "transformer.layers.{bid}.ffn.proj_2", # openelm + "model.layers.{bid}.residual_mlp.w2", # arctic + "encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2 + "encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm + "model.layers.h.{bid}.mlp.c_proj", # exaone ), MODEL_TENSOR.FFN_DOWN_EXP: ( - "layers.{bid}.feed_forward.experts.w2", - "transformer.decoder_layer.{bid}.moe.linear_1", - "transformer.blocks.{bid}.ffn.experts.mlp.w2", - "model.layers.{bid}.mlp.experts.down_proj", + "layers.{bid}.feed_forward.experts.w2", # mixtral (merged) + "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged) + "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx + "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged) + "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe ), MODEL_TENSOR.FFN_DOWN_SHEXP: ( - "model.layers.{bid}.mlp.shared_expert.down_proj", - "model.layers.{bid}.mlp.shared_experts.down_proj", + "model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe + "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2 ), MODEL_TENSOR.ATTN_Q_NORM: ( "language_model.encoder.layers.{bid}.self_attention.q_layernorm", - "model.layers.{bid}.self_attn.q_layernorm", - "model.layers.{bid}.self_attn.q_norm", - "transformer.blocks.{bid}.attn.q_ln", - "encoder.layer.{bid}.attention.self.layer_norm_q", - "transformer.layers.{bid}.attn.q_norm", + "model.layers.{bid}.self_attn.q_layernorm", # persimmon + "model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon olmo2 + "transformer.blocks.{bid}.attn.q_ln", # sea-lion + "encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2 + "transformer.layers.{bid}.attn.q_norm", # openelm ), MODEL_TENSOR.ATTN_K_NORM: ( "language_model.encoder.layers.{bid}.self_attention.k_layernorm", - 
"model.layers.{bid}.self_attn.k_layernorm", - "model.layers.{bid}.self_attn.k_norm", - "transformer.blocks.{bid}.attn.k_ln", - "encoder.layer.{bid}.attention.self.layer_norm_k", - "transformer.layers.{bid}.attn.k_norm", + "model.layers.{bid}.self_attn.k_layernorm", # persimmon + "model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon olmo2 + "transformer.blocks.{bid}.attn.k_ln", # sea-lion + "encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2 + "transformer.layers.{bid}.attn.k_norm", # openelm ), MODEL_TENSOR.ROPE_FREQS: ( - "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", + "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon ), MODEL_TENSOR.LAYER_OUT_NORM: ( - "encoder.layer.{bid}.output.LayerNorm", - "encoder.layers.{bid}.norm2", - "transformer.decoder_layer.{bid}.rms_norm_3", - "encoder.layer.{bid}.mlp.layernorm", - "encoder.layer.{bid}.layer_norm_2", + "encoder.layer.{bid}.output.LayerNorm", # bert + "encoder.layers.{bid}.norm2", # nomic-bert + "transformer.decoder_layer.{bid}.rms_norm_3", # Grok + "encoder.layer.{bid}.mlp.layernorm", # jina-bert-v2 + "encoder.layer.{bid}.layer_norm_2", # jina-v2-code ), MODEL_TENSOR.SSM_IN: ( "model.layers.{bid}.in_proj", "backbone.layers.{bid}.mixer.in_proj", - "model.layers.{bid}.mamba.in_proj", ), MODEL_TENSOR.SSM_CONV1D: ( "model.layers.{bid}.conv1d", "backbone.layers.{bid}.mixer.conv1d", - "model.layers.{bid}.mamba.conv1d", ), MODEL_TENSOR.SSM_X: ( "model.layers.{bid}.x_proj", "backbone.layers.{bid}.mixer.x_proj", - "model.layers.{bid}.mamba.x_proj", ), MODEL_TENSOR.SSM_DT: ( "model.layers.{bid}.dt_proj", "backbone.layers.{bid}.mixer.dt_proj", - "model.layers.{bid}.mamba.dt_proj", ), - MODEL_TENSOR.SSM_DT_NORM: ("model.layers.{bid}.mamba.dt_layernorm",), MODEL_TENSOR.SSM_A: ( "model.layers.{bid}.A_log", "backbone.layers.{bid}.mixer.A_log", - "model.layers.{bid}.mamba.A_log", - ), - MODEL_TENSOR.SSM_B_NORM: ( - "model.layers.{bid}.mamba.b_layernorm", - "model.layers.{bid}.mamba.B_layernorm", - ), - MODEL_TENSOR.SSM_C_NORM: ( - "model.layers.{bid}.mamba.c_layernorm", - "model.layers.{bid}.mamba.C_layernorm", ), MODEL_TENSOR.SSM_D: ( "model.layers.{bid}.D", "backbone.layers.{bid}.mixer.D", - "model.layers.{bid}.mamba.D", ), MODEL_TENSOR.SSM_OUT: ( "model.layers.{bid}.out_proj", "backbone.layers.{bid}.mixer.out_proj", - "model.layers.{bid}.mamba.out_proj", - ), - MODEL_TENSOR.TIME_MIX_W1: ("rwkv.blocks.{bid}.attention.time_maa_w1",), - MODEL_TENSOR.TIME_MIX_W2: ("rwkv.blocks.{bid}.attention.time_maa_w2",), - MODEL_TENSOR.TIME_MIX_LERP_X: ("rwkv.blocks.{bid}.attention.time_maa_x",), - MODEL_TENSOR.TIME_MIX_LERP_K: ("rwkv.blocks.{bid}.attention.time_maa_k",), - MODEL_TENSOR.TIME_MIX_LERP_V: ("rwkv.blocks.{bid}.attention.time_maa_v",), - MODEL_TENSOR.TIME_MIX_LERP_R: ("rwkv.blocks.{bid}.attention.time_maa_r",), - MODEL_TENSOR.TIME_MIX_LERP_G: ("rwkv.blocks.{bid}.attention.time_maa_g",), - MODEL_TENSOR.TIME_MIX_LERP_W: ("rwkv.blocks.{bid}.attention.time_maa_w",), - MODEL_TENSOR.TIME_MIX_FIRST: ("rwkv.blocks.{bid}.attention.time_faaaa",), - MODEL_TENSOR.TIME_MIX_DECAY: ("rwkv.blocks.{bid}.attention.time_decay",), - MODEL_TENSOR.TIME_MIX_DECAY_W1: ("rwkv.blocks.{bid}.attention.time_decay_w1",), - MODEL_TENSOR.TIME_MIX_DECAY_W2: ("rwkv.blocks.{bid}.attention.time_decay_w2",), - MODEL_TENSOR.TIME_MIX_KEY: ("rwkv.blocks.{bid}.attention.key",), - MODEL_TENSOR.TIME_MIX_VALUE: ("rwkv.blocks.{bid}.attention.value",), - MODEL_TENSOR.TIME_MIX_RECEPTANCE: 
("rwkv.blocks.{bid}.attention.receptance",), - MODEL_TENSOR.TIME_MIX_GATE: ("rwkv.blocks.{bid}.attention.gate",), - MODEL_TENSOR.TIME_MIX_LN: ("rwkv.blocks.{bid}.attention.ln_x",), - MODEL_TENSOR.TIME_MIX_OUTPUT: ("rwkv.blocks.{bid}.attention.output",), - MODEL_TENSOR.CHANNEL_MIX_LERP_K: ("rwkv.blocks.{bid}.feed_forward.time_maa_k",), - MODEL_TENSOR.CHANNEL_MIX_LERP_R: ("rwkv.blocks.{bid}.feed_forward.time_maa_r",), - MODEL_TENSOR.CHANNEL_MIX_KEY: ("rwkv.blocks.{bid}.feed_forward.key",), + ), + MODEL_TENSOR.TIME_MIX_W1: ( + "rwkv.blocks.{bid}.attention.time_maa_w1", # rwkv v6 + ), + MODEL_TENSOR.TIME_MIX_W2: ( + "rwkv.blocks.{bid}.attention.time_maa_w2", # rwkv v6 + ), + MODEL_TENSOR.TIME_MIX_LERP_X: ( + "rwkv.blocks.{bid}.attention.time_maa_x", # rwkv v6 + ), + MODEL_TENSOR.TIME_MIX_LERP_K: ( + "rwkv.blocks.{bid}.attention.time_maa_k", # rwkv v6 + ), + MODEL_TENSOR.TIME_MIX_LERP_V: ( + "rwkv.blocks.{bid}.attention.time_maa_v", # rwkv v6 + ), + MODEL_TENSOR.TIME_MIX_LERP_R: ( + "rwkv.blocks.{bid}.attention.time_maa_r", # rwkv v6 + ), + MODEL_TENSOR.TIME_MIX_LERP_G: ( + "rwkv.blocks.{bid}.attention.time_maa_g", # rwkv v6 + ), + MODEL_TENSOR.TIME_MIX_LERP_W: ( + "rwkv.blocks.{bid}.attention.time_maa_w", # rwkv v6 + ), + MODEL_TENSOR.TIME_MIX_FIRST: ( + "rwkv.blocks.{bid}.attention.time_faaaa", # rwkv v6 + ), + MODEL_TENSOR.TIME_MIX_DECAY: ( + "rwkv.blocks.{bid}.attention.time_decay", # rwkv v6 + ), + MODEL_TENSOR.TIME_MIX_DECAY_W1: ( + "rwkv.blocks.{bid}.attention.time_decay_w1", # rwkv v6 + ), + MODEL_TENSOR.TIME_MIX_DECAY_W2: ( + "rwkv.blocks.{bid}.attention.time_decay_w2", # rwkv v6 + ), + MODEL_TENSOR.TIME_MIX_KEY: ("rwkv.blocks.{bid}.attention.key",), # rwkv + MODEL_TENSOR.TIME_MIX_VALUE: ("rwkv.blocks.{bid}.attention.value",), # rwkv + MODEL_TENSOR.TIME_MIX_RECEPTANCE: ( + "rwkv.blocks.{bid}.attention.receptance", # rwkv + ), + MODEL_TENSOR.TIME_MIX_GATE: ("rwkv.blocks.{bid}.attention.gate",), # rwkv + MODEL_TENSOR.TIME_MIX_LN: ("rwkv.blocks.{bid}.attention.ln_x",), # rwkv + MODEL_TENSOR.TIME_MIX_OUTPUT: ("rwkv.blocks.{bid}.attention.output",), # rwkv + MODEL_TENSOR.CHANNEL_MIX_LERP_K: ( + "rwkv.blocks.{bid}.feed_forward.time_maa_k", # rwkv v6 + ), + MODEL_TENSOR.CHANNEL_MIX_LERP_R: ( + "rwkv.blocks.{bid}.feed_forward.time_maa_r", # rwkv v6 + ), + MODEL_TENSOR.CHANNEL_MIX_KEY: ("rwkv.blocks.{bid}.feed_forward.key",), # rwkv MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: ( - "rwkv.blocks.{bid}.feed_forward.receptance", + "rwkv.blocks.{bid}.feed_forward.receptance", # rwkv ), - MODEL_TENSOR.CHANNEL_MIX_VALUE: ("rwkv.blocks.{bid}.feed_forward.value",), - MODEL_TENSOR.ATTN_Q_A: ("model.layers.{bid}.self_attn.q_a_proj",), - MODEL_TENSOR.ATTN_Q_B: ("model.layers.{bid}.self_attn.q_b_proj",), + MODEL_TENSOR.CHANNEL_MIX_VALUE: ( + "rwkv.blocks.{bid}.feed_forward.value", # rwkv + ), + MODEL_TENSOR.ATTN_Q_A: ("model.layers.{bid}.self_attn.q_a_proj",), # deepseek2 + MODEL_TENSOR.ATTN_Q_B: ("model.layers.{bid}.self_attn.q_b_proj",), # deepseek2 MODEL_TENSOR.ATTN_KV_A_MQA: ( - "model.layers.{bid}.self_attn.kv_a_proj_with_mqa", - ), - MODEL_TENSOR.ATTN_KV_B: ("model.layers.{bid}.self_attn.kv_b_proj",), - MODEL_TENSOR.ATTN_Q_A_NORM: ("model.layers.{bid}.self_attn.q_a_layernorm",), - MODEL_TENSOR.ATTN_KV_A_NORM: ("model.layers.{bid}.self_attn.kv_a_layernorm",), - MODEL_TENSOR.ATTN_SUB_NORM: ("model.layers.{bid}.self_attn.inner_attn_ln",), - MODEL_TENSOR.FFN_SUB_NORM: ("model.layers.{bid}.mlp.ffn_layernorm",), - MODEL_TENSOR.DEC_ATTN_NORM: ("decoder.block.{bid}.layer.0.layer_norm",), - MODEL_TENSOR.DEC_ATTN_Q: 
("decoder.block.{bid}.layer.0.SelfAttention.q",), - MODEL_TENSOR.DEC_ATTN_K: ("decoder.block.{bid}.layer.0.SelfAttention.k",), - MODEL_TENSOR.DEC_ATTN_V: ("decoder.block.{bid}.layer.0.SelfAttention.v",), - MODEL_TENSOR.DEC_ATTN_OUT: ("decoder.block.{bid}.layer.0.SelfAttention.o",), + "model.layers.{bid}.self_attn.kv_a_proj_with_mqa", # deepseek2 + ), + MODEL_TENSOR.ATTN_KV_B: ( + "model.layers.{bid}.self_attn.kv_b_proj", # deepseek2 + ), + MODEL_TENSOR.ATTN_Q_A_NORM: ( + "model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2 + ), + MODEL_TENSOR.ATTN_KV_A_NORM: ( + "model.layers.{bid}.self_attn.kv_a_layernorm", # deepseek2 + ), + MODEL_TENSOR.ATTN_SUB_NORM: ( + "model.layers.{bid}.self_attn.inner_attn_ln", # bitnet + ), + MODEL_TENSOR.FFN_SUB_NORM: ("model.layers.{bid}.mlp.ffn_layernorm",), # bitnet + MODEL_TENSOR.DEC_ATTN_NORM: ("decoder.block.{bid}.layer.0.layer_norm",), # t5 + MODEL_TENSOR.DEC_ATTN_Q: ("decoder.block.{bid}.layer.0.SelfAttention.q",), # t5 + MODEL_TENSOR.DEC_ATTN_K: ("decoder.block.{bid}.layer.0.SelfAttention.k",), # t5 + MODEL_TENSOR.DEC_ATTN_V: ("decoder.block.{bid}.layer.0.SelfAttention.v",), # t5 + MODEL_TENSOR.DEC_ATTN_OUT: ( + "decoder.block.{bid}.layer.0.SelfAttention.o", # t5 + ), MODEL_TENSOR.DEC_ATTN_REL_B: ( - "decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", + "decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5 + ), + MODEL_TENSOR.DEC_CROSS_ATTN_NORM: ( + "decoder.block.{bid}.layer.1.layer_norm", # t5 ), - MODEL_TENSOR.DEC_CROSS_ATTN_NORM: ("decoder.block.{bid}.layer.1.layer_norm",), MODEL_TENSOR.DEC_CROSS_ATTN_Q: ( - "decoder.block.{bid}.layer.1.EncDecAttention.q", + "decoder.block.{bid}.layer.1.EncDecAttention.q", # t5 ), MODEL_TENSOR.DEC_CROSS_ATTN_K: ( - "decoder.block.{bid}.layer.1.EncDecAttention.k", + "decoder.block.{bid}.layer.1.EncDecAttention.k", # t5 ), MODEL_TENSOR.DEC_CROSS_ATTN_V: ( - "decoder.block.{bid}.layer.1.EncDecAttention.v", + "decoder.block.{bid}.layer.1.EncDecAttention.v", # t5 ), MODEL_TENSOR.DEC_CROSS_ATTN_OUT: ( - "decoder.block.{bid}.layer.1.EncDecAttention.o", + "decoder.block.{bid}.layer.1.EncDecAttention.o", # t5 ), MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: ( - "decoder.block.{bid}.layer.1.EncDecAttention.relative_attention_bias", + "decoder.block.{bid}.layer.1.EncDecAttention.relative_attention_bias", # t5 + ), + MODEL_TENSOR.DEC_FFN_NORM: ("decoder.block.{bid}.layer.2.layer_norm",), # t5 + MODEL_TENSOR.DEC_FFN_GATE: ( + "decoder.block.{bid}.layer.2.DenseReluDense.wi_0", # flan-t5 ), - MODEL_TENSOR.DEC_FFN_NORM: ("decoder.block.{bid}.layer.2.layer_norm",), - MODEL_TENSOR.DEC_FFN_GATE: ("decoder.block.{bid}.layer.2.DenseReluDense.wi_0",), MODEL_TENSOR.DEC_FFN_UP: ( - "decoder.block.{bid}.layer.2.DenseReluDense.wi", - "decoder.block.{bid}.layer.2.DenseReluDense.wi_1", - ), - MODEL_TENSOR.DEC_FFN_DOWN: ("decoder.block.{bid}.layer.2.DenseReluDense.wo",), - MODEL_TENSOR.DEC_OUTPUT_NORM: ("decoder.final_layer_norm",), - MODEL_TENSOR.ENC_ATTN_NORM: ("encoder.block.{bid}.layer.0.layer_norm",), - MODEL_TENSOR.ENC_ATTN_Q: ("encoder.block.{bid}.layer.0.SelfAttention.q",), - MODEL_TENSOR.ENC_ATTN_K: ("encoder.block.{bid}.layer.0.SelfAttention.k",), - MODEL_TENSOR.ENC_ATTN_V: ("encoder.block.{bid}.layer.0.SelfAttention.v",), - MODEL_TENSOR.ENC_ATTN_OUT: ("encoder.block.{bid}.layer.0.SelfAttention.o",), + "decoder.block.{bid}.layer.2.DenseReluDense.wi", # t5 + "decoder.block.{bid}.layer.2.DenseReluDense.wi_1", # flan-t5 + ), + MODEL_TENSOR.DEC_FFN_DOWN: ( + "decoder.block.{bid}.layer.2.DenseReluDense.wo", 
# t5 + ), + MODEL_TENSOR.DEC_OUTPUT_NORM: ("decoder.final_layer_norm",), # t5 + MODEL_TENSOR.ENC_ATTN_NORM: ("encoder.block.{bid}.layer.0.layer_norm",), # t5 + MODEL_TENSOR.ENC_ATTN_Q: ("encoder.block.{bid}.layer.0.SelfAttention.q",), # t5 + MODEL_TENSOR.ENC_ATTN_K: ("encoder.block.{bid}.layer.0.SelfAttention.k",), # t5 + MODEL_TENSOR.ENC_ATTN_V: ("encoder.block.{bid}.layer.0.SelfAttention.v",), # t5 + MODEL_TENSOR.ENC_ATTN_OUT: ( + "encoder.block.{bid}.layer.0.SelfAttention.o", # t5 + ), MODEL_TENSOR.ENC_ATTN_REL_B: ( - "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", + "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5 + ), + MODEL_TENSOR.ENC_FFN_NORM: ("encoder.block.{bid}.layer.1.layer_norm",), # t5 + MODEL_TENSOR.ENC_FFN_GATE: ( + "encoder.block.{bid}.layer.1.DenseReluDense.wi_0", # flan-t5 ), - MODEL_TENSOR.ENC_FFN_NORM: ("encoder.block.{bid}.layer.1.layer_norm",), - MODEL_TENSOR.ENC_FFN_GATE: ("encoder.block.{bid}.layer.1.DenseReluDense.wi_0",), MODEL_TENSOR.ENC_FFN_UP: ( - "encoder.block.{bid}.layer.1.DenseReluDense.wi", - "encoder.block.{bid}.layer.1.DenseReluDense.wi_1", + "encoder.block.{bid}.layer.1.DenseReluDense.wi", # t5 + "encoder.block.{bid}.layer.1.DenseReluDense.wi_1", # flan-t5 + ), + MODEL_TENSOR.ENC_FFN_DOWN: ( + "encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5 + ), + ############################################################################ + # TODO: these do not belong to block_mappings_cfg - move them to mappings_cfg + MODEL_TENSOR.ENC_OUTPUT_NORM: ("encoder.final_layer_norm",), # t5 + MODEL_TENSOR.CLS: ( + "classifier", # jina + "classifier.dense", # roberta + ), + MODEL_TENSOR.CLS_OUT: ("classifier.out_proj",), # roberta + ############################################################################# + MODEL_TENSOR.CONVNEXT_DW: ("backbone.convnext.{bid}.dwconv",), # wavtokenizer + MODEL_TENSOR.CONVNEXT_NORM: ("backbone.convnext.{bid}.norm",), # wavtokenizer + MODEL_TENSOR.CONVNEXT_PW1: ("backbone.convnext.{bid}.pwconv1",), # wavtokenizer + MODEL_TENSOR.CONVNEXT_PW2: ("backbone.convnext.{bid}.pwconv2",), # wavtokenizer + MODEL_TENSOR.CONVNEXT_GAMMA: ("backbone.convnext.{bid}.gamma",), # wavtokenizer + MODEL_TENSOR.POSNET_CONV1: ("backbone.posnet.{bid}.conv1",), # wavtokenizer + MODEL_TENSOR.POSNET_CONV2: ("backbone.posnet.{bid}.conv2",), # wavtokenizer + MODEL_TENSOR.POSNET_NORM: ("backbone.posnet.{bid}.norm",), # wavtokenizer + MODEL_TENSOR.POSNET_NORM1: ("backbone.posnet.{bid}.norm1",), # wavtokenizer + MODEL_TENSOR.POSNET_NORM2: ("backbone.posnet.{bid}.norm2",), # wavtokenizer + MODEL_TENSOR.POSNET_ATTN_NORM: ("backbone.posnet.{bid}.norm",), # wavtokenizer + MODEL_TENSOR.POSNET_ATTN_Q: ("backbone.posnet.{bid}.q",), # wavtokenizer + MODEL_TENSOR.POSNET_ATTN_K: ("backbone.posnet.{bid}.k",), # wavtokenizer + MODEL_TENSOR.POSNET_ATTN_V: ("backbone.posnet.{bid}.v",), # wavtokenizer + MODEL_TENSOR.POSNET_ATTN_OUT: ( + "backbone.posnet.{bid}.proj_out", # wavtokenizer ), - MODEL_TENSOR.ENC_FFN_DOWN: ("encoder.block.{bid}.layer.1.DenseReluDense.wo",), - MODEL_TENSOR.ENC_OUTPUT_NORM: ("encoder.final_layer_norm",), } + # architecture-specific block mappings arch_block_mappings_cfg: dict[MODEL_ARCH, dict[MODEL_TENSOR, tuple[str, ...]]] = { MODEL_ARCH.ARCTIC: { MODEL_TENSOR.FFN_NORM: ("model.layers.{bid}.residual_layernorm",), diff --git a/src/gguf/vocab.py b/src/gguf/vocab.py index ea92c4e..3aa13ea 100644 --- a/src/gguf/vocab.py +++ b/src/gguf/vocab.py @@ -157,8 +157,36 @@ def _try_load_from_tokenizer_json(self, path: Path) 
-> bool: tokenizer = json.load(f) if self.load_merges: merges = tokenizer.get("model", {}).get("merges") - if isinstance(merges, list) and merges and isinstance(merges[0], str): - self.merges = merges + if isinstance(merges, list) and merges: + if isinstance(merges[0], str): + self.merges = merges + elif ( + isinstance(merges[0], list) + and len(merges[0]) == 2 + and isinstance(merges[0][0], str) + ): + # New format since transformers 4.45 to support spaces in merges + # ref: https://github.com/ggerganov/llama.cpp/issues/9692 + # TODO: internally store as the new format instead of converting to old + if any(" " in s for pair in merges for s in pair): + logger.warning( + f'Spaces in merges detected, encoding as {chr(ord(" ") + 256)!r}' + ) + self.merges = [ + " ".join( + [ + # ensure the spaces are properly encoded + "".join( + chr(ord(c) + 256) if c == " " else c + for c in part + ) + for part in pair + ] + ) + for pair in merges + ] + else: + raise ValueError("Unknown tokenizer merges format") added_tokens = tokenizer.get("added_tokens", {}) else: added_tokens = {} @@ -225,7 +253,6 @@ class Vocab(BaseVocab, Protocol): fname_tokenizer: Path def __init__(self, base_path: Path): ... - def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ... diff --git a/src/globals.py b/src/globals.py index 68f0d6e..97af270 100644 --- a/src/globals.py +++ b/src/globals.py @@ -80,11 +80,15 @@ def load_dotenv(self=Any) -> None: def show_about(self) -> None: - about_text = ( - "AutoGGUF\n\n" - f"Version: {AUTOGGUF_VERSION}\n\n" - "A tool for managing and converting GGUF models." - ) + about_text = f"""AutoGGUF + +Version: {AUTOGGUF_VERSION} + +A tool for managing and converting GGUF models. +This application is licensed under the Apache License 2.0. +Copyright (c) 2025 leafspark. +It also utilizes llama.cpp, licensed under the MIT License. +Copyright (c) 2023-2024 The ggml authors.""" QMessageBox.about(self, "About AutoGGUF", about_text)
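(Editor's aside, not part of the patch.) The vocab.py hunk above adds support for the merges format introduced in transformers 4.45, where each merge in tokenizer.json is a two-element list instead of a single space-separated string, so a literal space inside a merge part would otherwise be ambiguous. The patch keeps the old single-string representation internally and escapes embedded spaces by shifting them up 256 code points (' ' becomes 'Ġ'). Below is a small self-contained sketch of the same normalization; the function name and sample merges are invented for illustration:

```python
def normalize_merges(merges: list) -> list[str]:
    """Convert tokenizer.json merges to the old single-string format."""
    if not merges:
        return []
    if isinstance(merges[0], str):
        return merges  # old format: already "left right" strings
    if isinstance(merges[0], list) and len(merges[0]) == 2:
        def encode(part: str) -> str:
            # escape literal spaces so the joining space stays unambiguous
            return "".join(chr(ord(c) + 256) if c == " " else c for c in part)
        return [" ".join(encode(part) for part in pair) for pair in merges]
    raise ValueError("Unknown tokenizer merges format")

print(normalize_merges(["h e", "he llo"]))             # ['h e', 'he llo']
print(normalize_merges([["h", "e"], ["he", " llo"]]))  # ['h e', 'he Ġllo']
```

As the TODO in the patch notes, this conversion is a stop-gap until merges are stored internally in the new format.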