diff --git a/LICENSE b/LICENSE index 0b5e765..7757b14 100644 --- a/LICENSE +++ b/LICENSE @@ -186,7 +186,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright 2024 leafspark + Copyright (c) 2024-2025 leafspark Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/requirements.txt b/requirements.txt index fdafd9e..aa6ad08 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,13 @@ PyYAML~=6.0.2 -psutil~=6.1.0 +psutil~=6.1.1 pynvml~=12.0.0 PySide6~=6.8.1 -safetensors~=0.4.5 +safetensors~=0.5.0 numpy<2.0.0 torch~=2.5.1 sentencepiece~=0.2.0 -setuptools~=75.5.0 -huggingface-hub~=0.26.5 -transformers~=4.47.0 +setuptools~=75.6.0 +huggingface-hub~=0.27.0 +transformers~=4.47.1 fastapi~=0.115.6 uvicorn~=0.34.0 diff --git a/src/AutoGGUF.py b/src/AutoGGUF.py index c9d1255..d3593c0 100644 --- a/src/AutoGGUF.py +++ b/src/AutoGGUF.py @@ -500,7 +500,7 @@ def __init__(self, args: List[str]) -> None: # Timer for updating system info self.timer = QTimer() self.timer.timeout.connect(self.update_system_info) - self.timer.start(200) + self.timer.start(500) # Backend selection backend_layout = QHBoxLayout() @@ -1023,7 +1023,9 @@ def __init__(self, args: List[str]) -> None: hf_to_gguf_layout.addRow(OUTPUT_FILE, hf_outfile_layout) self.hf_outtype = QComboBox() - self.hf_outtype.addItems(["f32", "f16", "bf16", "q8_0", "auto"]) + self.hf_outtype.addItems( + ["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"] + ) hf_to_gguf_layout.addRow(OUTPUT_TYPE, self.hf_outtype) self.hf_vocab_only = QCheckBox(VOCAB_ONLY) diff --git a/src/GPUMonitor.py b/src/GPUMonitor.py index bec2827..edddcd8 100644 --- a/src/GPUMonitor.py +++ b/src/GPUMonitor.py @@ -95,7 +95,7 @@ def __init__(self, parent=None) -> None: self.timer = QTimer(self) self.timer.timeout.connect(self.update_gpu_info) - self.timer.start(200) # Update every 0.2 seconds + self.timer.start(500) # Update every 0.5 seconds self.gpu_data = [] self.vram_data = [] @@ -192,7 +192,7 @@ def update_graph_data() -> None: timer = QTimer(dialog) timer.timeout.connect(update_graph_data) - timer.start(200) # Update every 0.2 seconds + timer.start(500) # Update every 0.5 seconds dialog.exec() @@ -227,7 +227,7 @@ def update_graph_data() -> None: timer = QTimer(dialog) timer.timeout.connect(update_graph_data) - timer.start(200) # Update every 0.2 seconds + timer.start(500) # Update every 0.5 seconds tab_widget.addTab(gpu_graph, GPU_USAGE_OVER_TIME) tab_widget.addTab(vram_graph, VRAM_USAGE_OVER_TIME) diff --git a/src/convert_hf_to_gguf.py b/src/convert_hf_to_gguf.py index c1d2f97..b5b3ab6 100644 --- a/src/convert_hf_to_gguf.py +++ b/src/convert_hf_to_gguf.py @@ -23,6 +23,7 @@ TypeVar, cast, ) +from itertools import chain import math import numpy as np @@ -36,6 +37,9 @@ logger = logging.getLogger("hf-to-gguf") +###### MODEL DEFINITIONS ###### + + class SentencePieceTokenTypes(IntEnum): NORMAL = 1 UNKNOWN = 2 @@ -68,8 +72,8 @@ class Model: model_name: str | None metadata_override: Path | None dir_model_card: Path - is_lora: bool + # subclasses should define this! 
model_arch: gguf.MODEL_ARCH def __init__( @@ -86,7 +90,7 @@ def __init__( split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False, - is_lora: bool = False, + hparams: dict[str, Any] | None = None, ): if type(self) is Model: raise TypeError( @@ -110,7 +114,9 @@ def __init__( self.part_names = Model.get_model_part_names( self.dir_model, "pytorch_model", ".bin" ) - self.hparams = Model.load_hparams(self.dir_model) + self.hparams = ( + Model.load_hparams(self.dir_model) if hparams is None else hparams + ) self.block_count = self.find_hparam( ["n_layers", "num_hidden_layers", "n_layer", "num_layers"] ) @@ -118,11 +124,11 @@ def __init__( self.tensor_names = None self.metadata_override = metadata_override self.model_name = model_name - self.dir_model_card = dir_model - self.is_lora = is_lora + self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py + # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type if self.ftype == gguf.LlamaFileType.GUESSED: - + # NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie. _, first_tensor = next(self.get_tensors()) if first_tensor.dtype == torch.float16: logger.info( @@ -135,6 +141,7 @@ def __init__( ) self.ftype = gguf.LlamaFileType.MOSTLY_BF16 + # Configure GGUF Writer self.gguf_writer = gguf.GGUFWriter( path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], @@ -148,7 +155,8 @@ def __init__( @classmethod def __init_subclass__(cls): - + # can't use an abstract property, because overriding it without type errors + # would require using decorated functions instead of simply defining the property if "model_arch" not in cls.__dict__: raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}") @@ -219,6 +227,7 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: data = LazyTorchTensor.from_eager(data) yield name, data + # verify tensor name presence and identify potentially missing files if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0: missing = sorted(self.tensor_names.difference(tensor_names_from_parts)) extra = sorted(tensor_names_from_parts.difference(self.tensor_names)) @@ -285,9 +294,11 @@ def set_gguf_parameters(self): self.gguf_writer.add_context_length(n_ctx) logger.info(f"gguf: context length = {n_ctx}") - n_embd = self.find_hparam(["hidden_size", "n_embd"]) - self.gguf_writer.add_embedding_length(n_embd) - logger.info(f"gguf: embedding length = {n_embd}") + if ( + n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True) + ) is not None: + self.gguf_writer.add_embedding_length(n_embd) + logger.info(f"gguf: embedding length = {n_embd}") if ( n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True) @@ -295,9 +306,11 @@ def set_gguf_parameters(self): self.gguf_writer.add_feed_forward_length(n_ff) logger.info(f"gguf: feed forward length = {n_ff}") - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - self.gguf_writer.add_head_count(n_head) - logger.info(f"gguf: head count = {n_head}") + if ( + n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True) + ) is not None: + self.gguf_writer.add_head_count(n_head) + logger.info(f"gguf: head count = {n_head}") if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: self.gguf_writer.add_head_count_kv(n_head_kv) @@ -333,24 +346,30 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # 
unused return [(self.map_tensor_name(name), data_torch)] def tensor_force_quant( self, name: str, new_name: str, bid: int | None, n_dims: int ) -> gguf.GGMLQuantizationType | bool: - del name, new_name, bid, n_dims + del name, new_name, bid, n_dims # unused return False + # some models need extra generated tensors (like rope_freqs) + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + return () + def prepare_tensors(self): max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len( ".weight," ) - for name, data_torch in self.get_tensors(): - + for name, data_torch in chain( + self.generate_extra_tensors(), self.get_tensors() + ): + # we don't need these if name.endswith( (".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq") ): @@ -358,28 +377,37 @@ def prepare_tensors(self): old_dtype = data_torch.dtype + # convert any unsupported data types to float32 if data_torch.dtype not in (torch.float16, torch.float32): data_torch = data_torch.to(torch.float32) + # use the first number-like part of the tensor name as the block id bid = None for part in name.split("."): if part.isdecimal(): bid = int(part) break - for new_name, data in ( - (n, d.squeeze().numpy()) - for n, d in self.modify_tensors(data_torch, name, bid) - ): - data: np.ndarray + for new_name, data_torch in self.modify_tensors(data_torch, name, bid): + # TODO: why do we squeeze here? + # data = data_torch.squeeze().numpy() + data = data_torch.numpy() + + # if data ends up empty, it means data_torch was a scalar tensor -> restore + if len(data.shape) == 0: + data = data_torch.numpy() + n_dims = len(data.shape) data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant( name, new_name, bid, n_dims ) + # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors if n_dims <= 1 or new_name.endswith("_norm.weight"): data_qtype = gguf.GGMLQuantizationType.F32 + # Conditions should closely match those in llama_model_quantize_internal in llama.cpp + # Some tensor types are always in float32 if data_qtype is False and ( any( self.match_model_tensor_name(new_name, key, bid) @@ -393,6 +421,8 @@ def prepare_tensors(self): gguf.MODEL_TENSOR.TIME_MIX_W2, gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1, gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2, + gguf.MODEL_TENSOR.POSNET_NORM1, + gguf.MODEL_TENSOR.POSNET_NORM2, ) ) or not new_name.endswith(".weight") @@ -410,9 +440,10 @@ def prepare_tensors(self): gguf.LlamaFileType.MOSTLY_TQ1_0, gguf.LlamaFileType.MOSTLY_TQ2_0, ): - + # TODO: use Q4_K and Q6_K data_qtype = gguf.GGMLQuantizationType.F16 + # No override (data_qtype is False), or wants to be quantized (data_qtype is True) if isinstance(data_qtype, bool): if self.ftype == gguf.LlamaFileType.ALL_F32: data_qtype = gguf.GGMLQuantizationType.F32 @@ -442,8 +473,10 @@ def prepare_tensors(self): else data.shape ) + # reverse shape to make it similar to the internal ggml dimension order shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}" + # n_dims is implicit in the shape logger.info( f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}" ) @@ -463,18 +496,22 @@ def prepare_metadata(self, vocab_only: bool): self.metadata_override, self.dir_model_card, self.model_name, total_params ) + # Fallback to model directory name if metadata name is still missing if self.metadata.name is None: self.metadata.name = self.dir_model.name + # Generate parameter weight class (useful for leader boards) if not yet determined if self.metadata.size_label is 
None and total_params > 0: self.metadata.size_label = gguf.size_label( total_params, shared_params, expert_params, expert_count ) + # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0' output_type: str = self.ftype.name.partition("_")[2] + # Filename Output if self.fname_out.is_dir(): - + # Generate default filename based on model specification and available metadata if not vocab_only: fname_default: str = gguf.naming_convention( self.metadata.name, @@ -496,9 +533,14 @@ def prepare_metadata(self, vocab_only: bool): model_type="vocab", ) + # Use the default filename self.fname_out = self.fname_out / f"{fname_default}.gguf" else: + # Output path is a custom defined templated filename + # Note: `not is_dir()` is used because `.is_file()` will not detect + # file template strings as it doesn't actually exist as a file + # Process templated file name with the output ftype, useful with the "auto" ftype self.fname_out = self.fname_out.parent / gguf.fill_templated_filename( self.fname_out.name, output_type ) @@ -576,11 +618,13 @@ def does_token_look_special(self, token: str | bytes) -> bool: else: token_text = token + # Some models mark some added tokens which ought to be control tokens as not special. + # (e.g. command-r, command-r-plus, deepseek-coder, gemma{,-2}) seems_special = token_text in ( - "", + "", # deepseek-coder "", "<2mass>", - "[@BOS@]", + "[@BOS@]", # gemma{,-2} ) seems_special = seems_special or ( @@ -588,14 +632,16 @@ def does_token_look_special(self, token: str | bytes) -> bool: ) seems_special = seems_special or ( token_text.startswith("<|") and token_text.endswith("|>") - ) + ) # deepseek-coder + # TODO: should these be marked as UNUSED instead? (maybe not) seems_special = seems_special or ( token_text.startswith("") - ) + ) # gemma{,-2} return seems_special + # used for GPT-2 BPE and WordPiece vocabs def get_vocab_base(self) -> tuple[list[str], list[int], str]: tokens: list[str] = [] toktypes: list[int] = [] @@ -620,12 +666,28 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: else: token: str = reverse_vocab[i] if token in added_vocab: + # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. + # To avoid unexpected issues - we make sure to normalize non-normalized tokens + if not tokenizer.added_tokens_decoder[i].normalized: + previous_token = token + token = tokenizer.decode( + tokenizer.encode(token, add_special_tokens=False) + ) + if previous_token != token: + logger.info( + f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer" + ) + if tokenizer.added_tokens_decoder[ i ].special or self.does_token_look_special(token): toktypes.append(gguf.TokenType.CONTROL) else: - token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") + # NOTE: this was added for Gemma. + # Encoding and decoding the tokens above isn't sufficient for this case. + token = token.replace( + b"\xe2\x96\x81".decode("utf-8"), " " + ) # pre-normalize user-defined spaces toktypes.append(gguf.TokenType.USER_DEFINED) else: toktypes.append(gguf.TokenType.NORMAL) @@ -633,7 +695,15 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: return tokens, toktypes, tokpre + # NOTE: this function is generated by convert_hf_to_gguf_update.py + # do not modify it manually! 
+ # ref: https://github.com/ggerganov/llama.cpp/pull/6920 + # Marker: Start get_vocab_base_pre def get_vocab_base_pre(self, tokenizer) -> str: + # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that + # is specific for the BPE pre-tokenizer used by the model + # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can + # use in llama.cpp to implement the same pre-tokenizer chktxt = "\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````\"\"\"\"......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL" @@ -645,96 +715,126 @@ def get_vocab_base_pre(self, tokenizer) -> str: res = None + # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script + # or pull the latest version of the model from Huggingface + # don't edit the hashes manually! if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": - + # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B res = "llama-bpe" if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754": - + # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base res = "deepseek-llm" if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821": - + # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base res = "deepseek-coder" if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed": - + # ref: https://huggingface.co/tiiuae/falcon-7b res = "falcon" + if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e": + # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base + res = "falcon3" if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": - + # ref: https://huggingface.co/BAAI/bge-small-en-v1.5 res = "bert-bge" + if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7": + # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5 + res = "bert-bge-large" if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": - + # ref: https://huggingface.co/mosaicml/mpt-7b res = "mpt" if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34": - + # ref: https://huggingface.co/bigcode/starcoder2-3b res = "starcoder" if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454": - + # ref: https://huggingface.co/openai-community/gpt2 res = "gpt-2" if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3": - + # ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b res = "stablelm2" if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff": - + # ref: https://huggingface.co/smallcloudai/Refact-1_6-base res = "refact" if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8": - + # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01 res = "command-r" if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea": - + # ref: https://huggingface.co/Qwen/Qwen1.5-7B res = "qwen2" if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": - + # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf res = "olmo" if chkhsh == 
"a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e": - + # ref: https://huggingface.co/databricks/dbrx-base res = "dbrx" + if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448": + # ref: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en + res = "jina-v1-en" if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": - + # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en res = "jina-v2-en" if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643": - + # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es res = "jina-v2-es" if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6": - + # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de res = "jina-v2-de" if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d": - + # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct res = "smaug-bpe" if chkhsh == "c7ea5862a53e4272c035c8238367063e2b270d51faa48c0f09e9d5b54746c360": - + # ref: https://huggingface.co/LumiOpen/Poro-34B-chat res = "poro-chat" if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a": - + # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code res = "jina-v2-code" if chkhsh == "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b": - + # ref: https://huggingface.co/THUDM/glm-4-9b-chat res = "chatglm-bpe" if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee": - + # ref: https://huggingface.co/LumiOpen/Viking-7B res = "viking" if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901": - + # ref: https://huggingface.co/core42/jais-13b res = "jais" if chkhsh == "7b3e7548e4308f52a76e8229e4e6cc831195d0d1df43aed21ac6c93da05fec5f": - + # ref: https://huggingface.co/WisdomShell/CodeShell-7B res = "codeshell" if chkhsh == "63b97e4253352e6f357cc59ea5b583e3a680eaeaf2632188c2b952de2588485e": - + # ref: https://huggingface.co/mistralai/Mistral-Nemo-Base-2407 res = "tekken" if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249": - + # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M res = "smollm" if chkhsh == "3c30d3ad1d6b64202cd222813e7736c2db6e1bd6d67197090fc1211fbc612ae7": - + # ref: https://huggingface.co/bigscience/bloom res = "bloom" if chkhsh == "bc01ce58980e1db43859146dc51b1758b3b88729b217a74792e9f8d43e479d21": - + # ref: https://huggingface.co/TurkuNLP/gpt3-finnish-small res = "gpt3-finnish" if chkhsh == "4e2b24cc4770243d65a2c9ec19770a72f08cffc161adbb73fcbb6b7dd45a0aae": - + # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct res = "exaone" if chkhsh == "fcace8b9cac38ce847670c970cd5892031a753a1ef381abd1d9af00f713da085": - + # ref: https://huggingface.co/microsoft/phi-2 res = "phi-2" + if chkhsh == "60824e3c0d9401f89943cbb2fff727f0e2d4c545ba4df2d6e4f09a6db0f5b450": + # ref: https://huggingface.co/facebook/chameleon-7b + res = "chameleon" + if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35": + # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0 + res = "minerva-7b" + if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65": + # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base + res = "roberta-bpe" + if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb": + # ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct + res = "gigachat" + if chkhsh == 
"d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1": + # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct + res = "megrez" + if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5": + # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3 + res = "deepseek-v3" if res is None: logger.warning("\n") @@ -769,6 +869,10 @@ def get_vocab_base_pre(self, tokenizer) -> str: logger.debug(f"chkhsh: {chkhsh}") return res + # Marker: End get_vocab_base_pre + + def _set_vocab_none(self) -> None: + self.gguf_writer.add_tokenizer_model("none") def _set_vocab_gpt2(self) -> None: tokens, toktypes, tokpre = self.get_vocab_base() @@ -805,6 +909,7 @@ def _set_vocab_qwen(self): assert len(merged) == 2 merges.append(" ".join(map(QwenModel.token_bytes_to_string, merged))) + # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined added_vocab = tokenizer.special_tokens reverse_vocab = { id_: encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items() @@ -828,7 +933,7 @@ def _set_vocab_qwen(self): special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) special_vocab.merges = merges - + # only add special tokens when they were not already loaded from config.json if len(special_vocab.special_token_ids) == 0: special_vocab._set_special_token( "bos", tokenizer.special_tokens["<|endoftext|>"] @@ -836,7 +941,7 @@ def _set_vocab_qwen(self): special_vocab._set_special_token( "eos", tokenizer.special_tokens["<|endoftext|>"] ) - + # this one is usually not in config.json anyway special_vocab._set_special_token( "unk", tokenizer.special_tokens["<|endoftext|>"] ) @@ -924,7 +1029,9 @@ def _create_vocab_sentencepiece(self): if token_data.get("special") or self.does_token_look_special(token): toktypes[token_id] = SentencePieceTokenTypes.CONTROL else: - token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") + token = token.replace( + b"\xe2\x96\x81".decode("utf-8"), " " + ) # pre-normalize user-defined spaces toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED scores[token_id] = -1000.0 @@ -976,7 +1083,7 @@ def _set_vocab_builtin( default_pre = "mpt" if model_name == "gpt-neox" else "default" field = vocab_reader.get_field(gguf.Keys.Tokenizer.MODEL) - assert field + assert field # tokenizer model self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8")) field = vocab_reader.get_field(gguf.Keys.Tokenizer.PRE) @@ -985,27 +1092,27 @@ def _set_vocab_builtin( ) field = vocab_reader.get_field(gguf.Keys.Tokenizer.LIST) - assert field + assert field # token list self.gguf_writer.add_token_list( [bytes(field.parts[i]) for i in field.data][:vocab_size] ) if model_name == "llama-spm": field = vocab_reader.get_field(gguf.Keys.Tokenizer.SCORES) - assert field + assert field # token scores self.gguf_writer.add_token_scores( [field.parts[i].tolist()[0] for i in field.data][:vocab_size] ) field = vocab_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) - assert field + assert field # token types self.gguf_writer.add_token_types( [field.parts[i].tolist()[0] for i in field.data][:vocab_size] ) if model_name != "llama-spm": field = vocab_reader.get_field(gguf.Keys.Tokenizer.MERGES) - assert field + assert field # token merges self.gguf_writer.add_token_merges( [bytes(field.parts[i]) for i in field.data] ) @@ -1050,7 +1157,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused n_head = 
self.hparams.get("n_head", self.hparams.get("num_attention_heads")) n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) @@ -1058,7 +1165,9 @@ def modify_tensors( tensors: list[tuple[str, Tensor]] = [] if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name): - + # Map bloom-style qkv_linear to gpt-style qkv_linear + # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa + # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) data_torch = torch.cat( ( @@ -1105,7 +1214,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) @@ -1115,7 +1224,9 @@ def modify_tensors( tensors: list[tuple[str, Tensor]] = [] if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): - + # Map bloom-style qkv_linear to gpt-style qkv_linear + # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa + # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) data_torch = torch.cat( ( @@ -1143,6 +1254,7 @@ def modify_tensors( if name == "word_embeddings.weight": assert self.tensor_names is not None + # TODO: tie them at runtime, don't duplicate in the model file if all( s not in self.tensor_names for s in ("lm_head.weight", "output.weight") ): @@ -1161,7 +1273,7 @@ def set_vocab(self): try: self._set_vocab_gpt2() except Exception: - + # Fallback for SEA-LION model self._set_vocab_sentencepiece() self.gguf_writer.add_add_bos_token(False) self.gguf_writer.add_pad_token_id(3) @@ -1190,7 +1302,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused if "scales" in name: new_name = self.map_tensor_name( @@ -1233,7 +1345,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) self.gguf_writer.add_head_count(head_count) self.gguf_writer.add_head_count_kv(head_count_kv) - + # note: config provides rms norm but it is actually layer norm + # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571 self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) @@ -1362,7 +1475,8 @@ def set_vocab(self): tokenizer = AutoTokenizer.from_pretrained(dir_model) vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) - + # Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size, + # because vocab_size is the count of items, and indexes start at 0. 
max_vocab_index = max(tokenizer.get_vocab().values()) if max_vocab_index >= vocab_size: raise ValueError("Vocabulary size exceeds expected maximum size.") @@ -1374,12 +1488,12 @@ def set_vocab(self): for token_id in range(vocab_size): token_text = reverse_vocab[token_id].encode("utf-8") - + # replace "\x00" to string with length > 0 if token_text == b"\x00": - toktype = gguf.TokenType.BYTE + toktype = gguf.TokenType.BYTE # special token_text = f"<{token_text}>".encode("utf-8") elif re.fullmatch(rb"<0x[0-9A-Fa-f]{2}>", token_text): - toktype = gguf.TokenType.BYTE + toktype = gguf.TokenType.BYTE # special elif reverse_vocab[token_id] in added_vocab: if tokenizer.added_tokens_decoder[token_id].special: toktype = gguf.TokenType.CONTROL @@ -1440,11 +1554,12 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused head_count = self.hparams["num_attention_heads"] head_count_kv = self.hparams.get("num_key_value_heads", head_count) + # HF models permute some of the tensors, so we need to undo that if name.endswith("q_proj.weight"): data_torch = self._reverse_hf_permute(data_torch, head_count, head_count) if name.endswith("k_proj.weight"): @@ -1474,18 +1589,18 @@ class FalconModel(Model): def set_gguf_parameters(self): block_count = self.hparams.get("num_hidden_layers") if block_count is None: - block_count = self.hparams["n_layer"] + block_count = self.hparams["n_layer"] # old name n_head = self.hparams.get("num_attention_heads") if n_head is None: - n_head = self.hparams["n_head"] + n_head = self.hparams["n_head"] # old name n_head_kv = self.hparams.get("num_kv_heads") if n_head_kv is None: - n_head_kv = self.hparams.get("n_head_kv", 1) + n_head_kv = self.hparams.get("n_head_kv", 1) # old name - self.gguf_writer.add_context_length(2048) - self.gguf_writer.add_tensor_data_layout("jploski") + self.gguf_writer.add_context_length(2048) # not in config.json + self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -1497,7 +1612,17 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused + + # QKV tensor transform + # The original query_key_value tensor contains n_head_kv "kv groups", + # each consisting of n_head/n_head_kv query weights followed by one key + # and one value weight (shared by all query heads in the kv group). + # This layout makes it a big pain to work with in GGML. + # So we rearrange them here,, so that we have n_head query weights + # followed by n_head_kv key weights followed by n_head_kv value weights, + # in contiguous fashion. + # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py if "query_key_value" in name: n_head = self.find_hparam(["num_attention_heads", "n_head"]) @@ -1541,6 +1666,7 @@ class RefactModel(Model): def set_vocab(self): super().set_vocab() + # TODO: how to determine special FIM tokens automatically? 
special_vocab = gguf.SpecialVocab( self.dir_model, load_merges=False, @@ -1549,7 +1675,7 @@ def set_vocab(self): special_vocab._set_special_token("prefix", 1) special_vocab._set_special_token("suffix", 3) special_vocab._set_special_token("middle", 2) - special_vocab.chat_template = None + special_vocab.chat_template = None # do not add it twice special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): @@ -1561,6 +1687,7 @@ def set_gguf_parameters(self): block_count = self.hparams["n_layer"] + # refact uses Alibi. So this is from config.json which might be used by training. self.gguf_writer.add_context_length(self.hparams["n_positions"]) self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) @@ -1633,7 +1760,7 @@ def set_vocab(self): if (self.dir_model / "tokenizer.json").is_file(): self._set_vocab_gpt2() else: - + # StableLM 2 1.6B used to have a vocab in a similar format to Qwen's vocab self._set_vocab_qwen() def set_gguf_parameters(self): @@ -1712,7 +1839,7 @@ def _stack_qk_norm( layer_name: str = "q_layernorm", ): datas: list[Tensor] = [] - + # extract the norms in order for xid in range(n_head): ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight" datas.append(norms[ename]) @@ -1728,7 +1855,7 @@ def prepare_tensors(self): super().prepare_tensors() if self._q_norms is not None or self._k_norms is not None: - + # flatten two `list[dict[str, Tensor]]` into a single `list[str]` norms = ( [k for d in self._q_norms for k in d.keys()] if self._q_norms is not None @@ -1755,9 +1882,10 @@ def set_vocab(self): try: self._set_vocab_llama_hf() except (FileNotFoundError, TypeError): - + # Llama 3 self._set_vocab_gpt2() + # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256) if self.hparams.get("vocab_size", 32000) == 32016: special_vocab = gguf.SpecialVocab( self.dir_model, @@ -1770,6 +1898,19 @@ def set_vocab(self): special_vocab._set_special_token("eot", 32010) special_vocab.add_to_gguf(self.gguf_writer) + tokenizer_config_file = self.dir_model / "tokenizer_config.json" + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + if "add_prefix_space" in tokenizer_config_json: + self.gguf_writer.add_add_space_prefix( + tokenizer_config_json["add_prefix_space"] + ) + + # Apply to granite small models only + if self.hparams.get("vocab_size", 32000) == 49152: + self.gguf_writer.add_add_bos_token(False) + def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams @@ -1791,18 +1932,6 @@ def set_gguf_parameters(self): self.hparams["rope_scaling"]["factor"] ) - tokenizer_config_file = self.dir_model / "tokenizer_config.json" - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - if "add_prefix_space" in tokenizer_config_json: - self.gguf_writer.add_add_space_prefix( - tokenizer_config_json["add_prefix_space"] - ) - - if self.hparams.get("vocab_size", 32000) == 49152: - self.gguf_writer.add_add_bos_token(False) - @staticmethod def permute(weights: Tensor, n_head: int, n_head_kv: int | None): if n_head_kv is not None and n_head != n_head_kv: @@ -1828,6 +1957,7 @@ def modify_tensors( if name.endswith(("k_proj.weight", "k_proj.bias")): data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + # process the experts separately if name.find("block_sparse_moe.experts") != -1: n_experts = self.hparams["num_local_experts"] @@ -1841,6 +1971,7 @@ def 
modify_tensors( if len(self._experts[bid]) >= n_experts * 3: tensors: list[tuple[str, Tensor]] = [] + # merge the experts into a single 3d tensor for wid in ["w1", "w2", "w3"]: datas: list[Tensor] = [] @@ -1862,7 +1993,7 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] - def prepare_tensors(self): + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): if rope_scaling.get("rope_type", "").lower() == "llama3": base = self.hparams.get("rope_theta", 10000.0) @@ -1898,21 +2029,233 @@ def prepare_tensors(self): ) rope_factors.append(1 / ((1 - smooth) / factor + smooth)) - if not self.is_lora: - self.gguf_writer.add_tensor( - self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), - np.array(rope_factors, dtype=np.float32), - ) + yield ( + self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), + torch.tensor(rope_factors, dtype=torch.float32), + ) + def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: - + # flatten `list[dict[str, Tensor]]` into `list[str]` experts = [k for d in self._experts for k in d.keys()] if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") +@Model.register("DeciLMForCausalLM") +class DeciModel(Model): + model_arch = gguf.MODEL_ARCH.DECI + + @staticmethod + def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int: + # DeciLM-specific code + intermediate_size = int(2 * ffn_mult * n_embd / 3) + return DeciModel._find_multiple(intermediate_size, 256) + + @staticmethod + def _find_multiple(n: int, k: int) -> int: + # DeciLM-specific code + if n % k == 0: + return n + return n + k - (n % k) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B + _block_configs: list[dict[str, Any]] = self.hparams["block_configs"] + assert self.block_count == len(_block_configs) + self._num_kv_heads = list() + self._num_heads = list() + _ffn_multipliers = list() + # ***linear attention layer*** + # if n_heads_in_group is None and replace_with_linear is True + # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads + # ***attention-free layer*** + # if n_heads_in_group is None and replace_with_linear is False + # then _num_kv_heads[il] is 0 and _num_heads[il] is 0 + # ***normal attention-layer*** + # if n_heads_in_group is not None, then + # _num_kv_heads[il] is num_attention_head // n_heads_in_group and + # _num_heads[il] is num_attention_head + for il in range(len(_block_configs)): + if _block_configs[il]["attention"]["n_heads_in_group"] is None: + if _block_configs[il]["attention"]["replace_with_linear"] is True: + self._num_kv_heads.append(0) + self._num_heads.append(self.hparams["num_attention_heads"]) + else: + self._num_kv_heads.append(0) + self._num_heads.append(0) + else: + self._num_kv_heads.append( + self.hparams["num_attention_heads"] + // _block_configs[il]["attention"]["n_heads_in_group"] + ) + self._num_heads.append(self.hparams["num_attention_heads"]) + _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"]) + assert self.block_count == len(self._num_kv_heads) + assert self.block_count == len(self._num_heads) + assert self.block_count == len(_ffn_multipliers) + assert isinstance(self._num_kv_heads, list) and isinstance( + self._num_kv_heads[0], int + ) + assert isinstance(self._num_heads, list) and isinstance( + self._num_heads[0], int + ) + assert isinstance(_ffn_multipliers, list) and isinstance( + 
_ffn_multipliers[0], float + ) + self._ffn_dims: list[int] = [ + DeciModel._ffn_mult_to_intermediate_size( + multiplier, self.hparams["hidden_size"] + ) + for multiplier in _ffn_multipliers + ] + + def set_vocab(self): + # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's + # eos_token from '|eot_id|' to '|end_of_text|' + if self.hparams.get("vocab_size", 128256) == 128256: + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + else: + # DeciLM-7B + self._set_vocab_llama_hf() + + def set_gguf_parameters(self): + if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B + assert self.block_count == len(self._num_kv_heads) + assert self.block_count == len(self._num_heads) + assert self.block_count == len(self._ffn_dims) + if (rope_theta := self.hparams.get("rope_theta")) is not None: + self.gguf_writer.add_rope_freq_base(rope_theta) + self.gguf_writer.add_head_count_kv(self._num_kv_heads) + self.gguf_writer.add_head_count(self._num_heads) + self.gguf_writer.add_feed_forward_length(self._ffn_dims) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_key_length( + self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + ) + self.gguf_writer.add_value_length( + self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + ) + self.gguf_writer.add_file_type(self.ftype) + else: # DeciLM-7B + super().set_gguf_parameters() + if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B + self._num_kv_heads: list[int] = self.hparams[ + "num_key_value_heads_per_layer" + ] + assert self.block_count == len(self._num_kv_heads) + self.gguf_writer.add_head_count_kv(self._num_kv_heads) + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + if "head_dim" in hparams: + rope_dim = hparams["head_dim"] + else: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(rope_dim) + + if ( + self.hparams.get("rope_scaling") is not None + and "factor" in self.hparams["rope_scaling"] + ): + if self.hparams["rope_scaling"].get("type") == "linear": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor( + self.hparams["rope_scaling"]["factor"] + ) + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return ( + weights.reshape( + n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] + ) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + if bid is not None: + if "num_key_value_heads_per_layer" in self.hparams: + n_kv_head = self.hparams["num_key_value_heads_per_layer"][bid] + elif "block_configs" in self.hparams: + n_kv_head = self._num_kv_heads[bid] + n_head = self._num_heads[bid] + else: + n_kv_head = 
self.hparams.get("num_key_value_heads") + else: + n_kv_head = self.hparams.get("num_key_value_heads") + + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = DeciModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = DeciModel.permute(data_torch, n_head, n_kv_head) + return [(self.map_tensor_name(name), data_torch)] + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): + if rope_scaling.get("rope_type", "").lower() == "llama3": + base = self.hparams.get("rope_theta", 10000.0) + dim = self.hparams.get( + "head_dim", + self.hparams["hidden_size"] // self.hparams["num_attention_heads"], + ) + freqs = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim) + ) + + factor = rope_scaling.get("factor", 8.0) + low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) + high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) + old_context_len = self.hparams.get( + "original_max_position_embeddings", 8192 + ) + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + assert low_freq_wavelen != high_freq_wavelen + + rope_factors = [] + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + rope_factors.append(1) + elif wavelen > low_freq_wavelen: + rope_factors.append(factor) + else: + smooth = (old_context_len / wavelen - low_freq_factor) / ( + high_freq_factor - low_freq_factor + ) + rope_factors.append(1 / ((1 - smooth) / factor + smooth)) + + yield ( + self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), + torch.tensor(rope_factors, dtype=torch.float32), + ) + + def prepare_tensors(self): + super().prepare_tensors() + + @Model.register("BitnetForCausalLM") class BitnetModel(Model): model_arch = gguf.MODEL_ARCH.BITNET @@ -1930,7 +2273,9 @@ def weight_quant(self, weight: Tensor) -> Tensor: weight = weight.float() scale = weight.abs().mean().clamp(min=1e-5) iscale = 1 / scale - + # TODO: multiply by the scale directly instead of inverting it twice + # (this is also unnecessarily doubly inverted upstream) + # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10 result = (weight * iscale).round().clamp(-1, 1) / iscale return result.type(dtype) @@ -1951,7 +2296,7 @@ def modify_tensors( gguf.MODEL_TENSOR.FFN_GATE, ] ): - + # transform weight into 1/0/-1 (in fp32) data_torch = self.weight_quant(data_torch) yield (new_name, data_torch) @@ -1975,7 +2320,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - + # process the experts separately if name.find(".moe.") != -1: n_experts = self.hparams["num_local_experts"] @@ -1989,6 +2334,7 @@ def modify_tensors( if len(self._experts[bid]) >= n_experts * 3: tensors: list[tuple[str, Tensor]] = [] + # merge the experts into a single 3d tensor for wid in ["linear", "linear_1", "linear_v"]: datas: list[Tensor] = [] @@ -2044,17 +2390,26 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused n_expert = self.hparams["ffn_config"]["moe_num_experts"] n_ff = self.hparams["ffn_config"]["ffn_hidden_size"] n_embd = self.hparams["d_model"] + # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose + # original 
implementation expects (n_expert, n_ff, n_embd) for all experts weights + # But llama.cpp moe graph works differently + # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions + # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor exp_tensor_names = { - "ffn.experts.mlp.w1": None, - "ffn.experts.mlp.w2": (0, 2, 1), + "ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} + "ffn.experts.mlp.w2": ( + 0, + 2, + 1, + ), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert} "ffn.experts.mlp.v1": None, - } + } # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} experts = False for exp_tensor_name in exp_tensor_names.keys(): @@ -2065,6 +2420,12 @@ def modify_tensors( data_torch = data_torch.permute(*permute_tensor) break + # map tensor names + # In MoE models the ffn tensors are typically most of the model weights, + # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight. + # Every other model has the weight names ending in .weight, + # let's assume that is the convention which is not the case for dbrx: + # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15 new_name = self.map_tensor_name( name if not experts else name + ".weight", try_suffixes=(".weight",) ) @@ -2074,7 +2435,7 @@ def modify_tensors( def tensor_force_quant( self, name: str, new_name: str, bid: int | None, n_dims: int ) -> gguf.GGMLQuantizationType | bool: - del name, new_name, bid + del name, new_name, bid # unused return n_dims > 1 @@ -2084,48 +2445,71 @@ class MiniCPMModel(Model): model_arch = gguf.MODEL_ARCH.MINICPM def set_gguf_parameters(self): - block_count = self.hparams["num_hidden_layers"] - self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) - self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count( - self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + super().set_gguf_parameters() + embedding_scale = float(self.hparams["scale_emb"]) + self.gguf_writer.add_embedding_scale(embedding_scale) + logger.info(f"gguf: (minicpm) embedding_scale = {embedding_scale}") + residual_scale = ( + self.hparams["scale_depth"] / self.hparams["num_hidden_layers"] ** 0.5 ) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) - self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_residual_scale(residual_scale) + logger.info(f"gguf: (minicpm) residual_scale = {residual_scale}") + logit_scale = self.hparams["hidden_size"] / self.hparams["dim_model_base"] + self.gguf_writer.add_logit_scale(logit_scale) + logger.info(f"gguf: (minicpm) logit_scale = {logit_scale}") + if self.hparams.get("rope_scaling") is not None: + if self.hparams["rope_scaling"].get("type") == "longrope": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LONGROPE) + logger.info( + f"gguf: (minicpm) rope_scaling_type = {gguf.RopeScalingType.LONGROPE}" + ) - def set_vocab(self): - self._set_vocab_llama_hf() + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - def 
_reverse_hf_permute( - self, weights: Tensor, n_head: int, n_kv_head: int | None = None - ) -> Tensor: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head + rope_scaling = self.find_hparam(["rope_scaling"], True) + if rope_scaling is not None: + long_factors = rope_scaling.get("long_factor", None) + short_factors = rope_scaling.get("short_factor", None) - return ( - weights.reshape( - n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] + if long_factors is None or short_factors is None: + raise KeyError( + "Missing the required key rope_scaling.long_factor or rope_scaling_short_factor" + ) + + if ( + len(long_factors) != len(short_factors) + or len(long_factors) != rope_dims / 2 + ): + raise ValueError( + f"The length of rope long and short factors must be {rope_dims / 2}" + ) + + yield ( + self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), + torch.tensor(long_factors, dtype=torch.float32), ) - .swapaxes(1, 2) - .reshape(weights.shape) - ) + yield ( + self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), + torch.tensor(short_factors, dtype=torch.float32), + ) + + def set_vocab(self): + self._set_vocab_sentencepiece() def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") + # HF models permute some of the tensors, so we need to undo that if name.endswith(("q_proj.weight")): - data_torch = self._reverse_hf_permute(data_torch, n_head, n_head) + data_torch = LlamaModel.permute(data_torch, n_head, n_head) if name.endswith(("k_proj.weight")): - data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head) + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) return [(self.map_tensor_name(name), data_torch)] @@ -2137,8 +2521,6 @@ class MiniCPM3Model(Model): def set_gguf_parameters(self): hparams = self.hparams - rope_dims = hparams["qk_rope_head_dim"] - self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(hparams["hidden_size"]) @@ -2156,37 +2538,38 @@ def set_gguf_parameters(self): ) self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: rope_scaling = self.find_hparam(["rope_scaling"], True) - if rope_scaling is None: - return + if rope_scaling is not None: + rope_dims = self.hparams["qk_rope_head_dim"] - long_factors = rope_scaling.get("long_factor", None) - short_factors = rope_scaling.get("short_factor", None) + long_factors = rope_scaling.get("long_factor", None) + short_factors = rope_scaling.get("short_factor", None) - if long_factors is None or short_factors is None: - raise KeyError( - "Missing the required key rope_scaling.long_factor or rope_scaling_short_factor" - ) + if long_factors is None or short_factors is None: + raise KeyError( + "Missing the required key rope_scaling.long_factor or rope_scaling_short_factor" + ) - if ( - len(long_factors) != len(short_factors) - or len(long_factors) != rope_dims / 2 - ): - raise ValueError( - f"The length of rope long and short factors must be {rope_dims / 2}" - ) + if ( + len(long_factors) != len(short_factors) + or len(long_factors) != rope_dims / 2 + ): + raise ValueError( + f"The length of rope long and short factors must be {rope_dims / 2}" + ) - self.gguf_writer.add_tensor( - 
gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", - np.array(long_factors, dtype=np.float32), - ) - self.gguf_writer.add_tensor( - gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", - np.array(short_factors, dtype=np.float32), - ) + yield ( + self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), + torch.tensor(long_factors, dtype=torch.float32), + ) + yield ( + self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), + torch.tensor(short_factors, dtype=torch.float32), + ) def set_vocab(self): - self._set_vocab_llama_hf() + self._set_vocab_sentencepiece() def _reverse_hf_permute( self, weights: Tensor, n_head: int, n_kv_head: int | None = None @@ -2264,6 +2647,87 @@ def set_vocab(self): except FileNotFoundError: self._set_vocab_gpt2() + def set_gguf_parameters(self): + super().set_gguf_parameters() + if ( + self.hparams.get("rope_scaling") is not None + and "factor" in self.hparams["rope_scaling"] + ): + if self.hparams["rope_scaling"].get("type") == "yarn": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor( + self.hparams["rope_scaling"]["factor"] + ) + self.gguf_writer.add_rope_scaling_orig_ctx_len( + self.hparams["rope_scaling"]["original_max_position_embeddings"] + ) + + +@Model.register("Qwen2VLForConditionalGeneration") +class Qwen2VLModel(Model): + model_arch = gguf.MODEL_ARCH.QWEN2VL + + def set_gguf_parameters(self): + super().set_gguf_parameters() + mrope_section = self.hparams["rope_scaling"]["mrope_section"] + mrope_section += [0] * max(0, 4 - len(mrope_section)) + self.gguf_writer.add_rope_dimension_sections(mrope_section) + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: + for name, data in super().get_tensors(): + if name.startswith("visual."): + continue + yield name, data + + +@Model.register("WavTokenizerDec") +class WavTokenizerDecModel(Model): + model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + if ( + name.endswith("codebook.cluster_size") + or name.endswith("codebook.embed_avg") + or name.endswith("codebook.inited") + ): + logger.debug(f"Skipping {name!r}") + return [] + + logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}") + + return [(self.map_tensor_name(name), data_torch)] + + def set_vocab(self): + self._set_vocab_none() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + self.gguf_writer.add_features_length(self.hparams["n_embd_features"]) + self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"]) + self.gguf_writer.add_group_norm_eps(self.hparams["group_norm_epsilon"]) + self.gguf_writer.add_group_norm_groups(self.hparams["group_norm_groups"]) + + self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"]) + self.gguf_writer.add_posnet_block_count(self.hparams["posnet"]["n_layer"]) + + self.gguf_writer.add_convnext_embedding_length( + self.hparams["convnext"]["n_embd"] + ) + self.gguf_writer.add_convnext_block_count(self.hparams["convnext"]["n_layer"]) + + self.gguf_writer.add_causal_attention(False) + @Model.register("Qwen2MoeForCausalLM") class Qwen2MoeModel(Model): @@ -2295,7 +2759,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, 
bid: int | None ) -> Iterable[tuple[str, Tensor]]: - + # process the experts separately if name.find("experts") != -1: n_experts = self.hparams["num_experts"] assert bid is not None @@ -2308,6 +2772,7 @@ def modify_tensors( if len(self._experts[bid]) >= n_experts * 3: tensors: list[tuple[str, Tensor]] = [] + # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -2333,7 +2798,7 @@ def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: - + # flatten `list[dict[str, Tensor]]` into `list[str]` experts = [k for d in self._experts for k in d.keys()] if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") @@ -2355,10 +2820,11 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused tensors: list[tuple[str, Tensor]] = [] + # we don't need these if name.endswith((".attn.bias", ".attn.masked_bias")): return tensors @@ -2371,6 +2837,7 @@ def modify_tensors( tensors.append((new_name, data_torch)) + # note: GPT2 output is tied to (same as) wte in original model if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): tensors.append( (self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch) @@ -2412,6 +2879,15 @@ class Phi3MiniModel(Model): model_arch = gguf.MODEL_ARCH.PHI3 def set_vocab(self): + # Phi-4 model uses GPT2Tokenizer + tokenizer_config_file = self.dir_model / "tokenizer_config.json" + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + tokenizer_class = tokenizer_config_json["tokenizer_class"] + if tokenizer_class == "GPT2Tokenizer": + return self._set_vocab_gpt2() + from sentencepiece import SentencePieceProcessor tokenizer_path = self.dir_model / "tokenizer.model" @@ -2538,8 +3014,20 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_dimension_count(rope_dims) self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"])) + sliding_window = self.hparams.get("sliding_window") + # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models + if sliding_window is None: + sliding_window = 0 + self.gguf_writer.add_sliding_window(sliding_window) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) + orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) + rope_dims = n_embd // n_head + # write rope scaling for long context (128k) model rope_scaling = self.find_hparam(["rope_scaling"], True) if rope_scaling is None: return @@ -2581,15 +3069,14 @@ def set_gguf_parameters(self): f"The length of rope long and short factors must be {rope_dims / 2}" ) - if not self.is_lora: - self.gguf_writer.add_tensor( - gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", - np.array(long_factors, dtype=np.float32), - ) - self.gguf_writer.add_tensor( - gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", - np.array(short_factors, dtype=np.float32), - ) + yield ( + self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_LONG), + torch.tensor(long_factors, dtype=torch.float32), 
+ ) + yield ( + self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), + torch.tensor(short_factors, dtype=torch.float32), + ) @Model.register("PlamoForCausalLM") @@ -2603,12 +3090,14 @@ def set_gguf_parameters(self): hparams = self.hparams block_count = hparams["num_hidden_layers"] - self.gguf_writer.add_context_length(4096) + self.gguf_writer.add_context_length(4096) # not in config.json self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_head_count(hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(5) + self.gguf_writer.add_head_count_kv( + 5 + ) # hparams["num_key_value_heads"]) is wrong self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) self.gguf_writer.add_file_type(self.ftype) @@ -2629,10 +3118,11 @@ def shuffle_attn_output_weight(self, data_torch): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused new_name = self.map_tensor_name(name) + # shuffle for broadcasting of gqa in ggml_mul_mat if new_name.endswith("attn_q.weight"): data_torch = self.shuffle_attn_q_weight(data_torch) elif new_name.endswith("attn_output.weight"): @@ -2663,7 +3153,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused new_name = self.map_tensor_name(name) @@ -2675,7 +3165,7 @@ def modify_tensors( if all( s not in self.tensor_names for s in ("lm_head.weight", "output.weight") ): - + # copy tok_embd.weight to output.weight tensors.append( (self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch) ) @@ -2688,7 +3178,10 @@ class InternLM2Model(Model): model_arch = gguf.MODEL_ARCH.INTERNLM2 def set_vocab(self): - + # (TODO): Is there a better way? + # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character + # \x00 specially and convert it into an emoji character to prevent it from being mistakenly + # recognized as an empty string in C++. from sentencepiece import SentencePieceProcessor from sentencepiece import sentencepiece_model_pb2 as model @@ -2702,7 +3195,9 @@ def set_vocab(self): logger.error(f"Error: Missing {tokenizer_path}") sys.exit(1) - sentencepiece_model = model.ModelProto() + sentencepiece_model = ( + model.ModelProto() + ) # pyright: ignore[reportAttributeAccessIssue] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix @@ -2716,7 +3211,8 @@ def set_vocab(self): text = piece.encode("utf-8") score = tokenizer.GetScore(token_id) if text == b"\x00": - + # (TODO): fixme + # Hack here and replace the \x00 characters. logger.warning(f"InternLM2 convert token '{text}' to '🐉'!") text = "🐉".encode("utf-8") @@ -2729,7 +3225,7 @@ def set_vocab(self): toktype = SentencePieceTokenTypes.UNUSED elif tokenizer.IsByte(token_id): toktype = SentencePieceTokenTypes.BYTE - + # take care of ununsed raw token if piece.startswith("[UNUSED"): toktype = SentencePieceTokenTypes.UNUSED @@ -2806,7 +3302,9 @@ def set_vocab(self): special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) old_eos = special_vocab.special_token_ids["eos"] if chat_eos_token_id is not None: - + # For the chat model, we replace the eos with '<|im_end|>'. 
+ # TODO: this is a hack, should be fixed + # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048 special_vocab.special_token_ids["eos"] = chat_eos_token_id logger.warning( f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}" @@ -2851,6 +3349,7 @@ def modify_tensors( qkv = qkv.reshape((num_groups, q_per_kv + 2, head_dim, n_embd)) q, k, v = qkv[:, :q_per_kv], qkv[:, -2], qkv[:, -1] + # The model weights of q and k equire additional reshape. q = LlamaModel.permute(q.reshape((-1, q.shape[-1])), num_heads, num_heads) k = LlamaModel.permute( k.reshape((-1, k.shape[-1])), num_heads, num_kv_heads @@ -2866,7 +3365,7 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] -@Model.register("BertModel", "CamembertModel") +@Model.register("BertModel", "BertForMaskedLM", "CamembertModel") class BertModel(Model): model_arch = gguf.MODEL_ARCH.BERT @@ -2878,6 +3377,7 @@ def set_gguf_parameters(self): super().set_gguf_parameters() self.gguf_writer.add_causal_attention(False) + # get pooling path pooling_path = None module_path = self.dir_model / "modules.json" if module_path.is_file(): @@ -2888,6 +3388,7 @@ def set_gguf_parameters(self): pooling_path = mod["path"] break + # get pooling type if pooling_path is not None: with open( self.dir_model / pooling_path / "config.json", encoding="utf-8" @@ -2905,8 +3406,12 @@ def set_vocab(self): tokens, toktypes, tokpre = self.get_vocab_base() self.vocab_size = len(tokens) - self.gguf_writer.add_token_type_count(2) + # we need this to validate the size of the token_type embeddings + # though currently we are passing all zeros to the token_type embeddings + # "Sequence A" or "Sequence B" + self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) + # convert to phantom space vocab def phantom(tok): if tok.startswith("[") and tok.endswith("]"): return tok @@ -2916,29 +3421,96 @@ def phantom(tok): tokens = list(map(phantom, tokens)) + # add vocab to gguf self.gguf_writer.add_tokenizer_model("bert") self.gguf_writer.add_tokenizer_pre(tokpre) self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) + # handle special tokens special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused + + if name.startswith("bert."): + name = name[5:] + + if name.endswith(".gamma"): + name = name[:-6] + ".weight" + if name.endswith(".beta"): + name = name[:-5] + ".bias" + + # we are only using BERT for embeddings so we don't need the pooling layer if name in ( "embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias", ): + return [] # we don't need these + + if name.startswith("cls.predictions"): + return [] + + if name.startswith("cls.seq_relationship"): return [] return [(self.map_tensor_name(name), data_torch)] +@Model.register("RobertaModel") +class RobertaModel(BertModel): + model_arch = gguf.MODEL_ARCH.BERT + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # we need the pad_token_id to know how to chop down position_embd matrix + if (pad_token_id := self.hparams.get("pad_token_id")) is not None: + self._position_offset = 1 + pad_token_id + if "max_position_embeddings" in self.hparams: + self.hparams["max_position_embeddings"] -= self._position_offset + else: + self._position_offset = None + + def set_vocab(self): + """Support BPE tokenizers for roberta 
models""" + bpe_tok_path = self.dir_model / "tokenizer.json" + if bpe_tok_path.exists(): + self._set_vocab_gpt2() + self.gguf_writer.add_add_bos_token(True) + self.gguf_writer.add_add_eos_token(True) + + # we need this to validate the size of the token_type embeddings + # though currently we are passing all zeros to the token_type embeddings + # "Sequence A" or "Sequence B" + self.gguf_writer.add_token_type_count( + self.hparams.get("type_vocab_size", 1) + ) + + else: + return super().set_vocab() + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + # if name starts with "roberta.", remove the prefix + # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main + if name.startswith("roberta."): + name = name[8:] + + # position embeddings start at pad_token_id + 1, so just chop down the weight tensor + if name == "embeddings.position_embeddings.weight": + if self._position_offset is not None: + data_torch = data_torch[self._position_offset :, :] + + return super().modify_tensors(data_torch, name, bid) + + @Model.register("NomicBertModel") class NomicBertModel(BertModel): model_arch = gguf.MODEL_ARCH.NOMIC_BERT @@ -2946,18 +3518,20 @@ class NomicBertModel(BertModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + # the HF config claims n_ctx=8192, but it uses RoPE scaling self.hparams["n_ctx"] = 2048 + # SwigLU activation assert self.hparams["activation_function"] == "swiglu" - + # this doesn't do anything in the HF version assert self.hparams["causal"] is False - + # no bias tensors assert self.hparams["qkv_proj_bias"] is False assert self.hparams["mlp_fc1_bias"] is False assert self.hparams["mlp_fc2_bias"] is False - + # norm at end of layer assert self.hparams["prenorm"] is False - + # standard RoPE assert self.hparams["rotary_emb_fraction"] == 1.0 assert self.hparams["rotary_emb_interleaved"] is False assert self.hparams["rotary_emb_scale_base"] is None @@ -2967,13 +3541,14 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) -@Model.register("XLMRobertaModel") +@Model.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") class XLMRobertaModel(BertModel): model_arch = gguf.MODEL_ARCH.BERT def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + # we need the pad_token_id to know how to chop down position_embd matrix if (pad_token_id := self.hparams.get("pad_token_id")) is not None: self._position_offset = 1 + pad_token_id if "max_position_embeddings" in self.hparams: @@ -2982,7 +3557,8 @@ def __init__(self, *args, **kwargs): self._position_offset = None def set_vocab(self): - + # to avoid TypeError: Descriptors cannot be created directly + # exception when importing sentencepiece_model_pb2 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" from sentencepiece import SentencePieceProcessor from sentencepiece import sentencepiece_model_pb2 as model @@ -2991,9 +3567,11 @@ def set_vocab(self): if not tokenizer_path.is_file(): raise FileNotFoundError(f"File not found: {tokenizer_path}") - sentencepiece_model = model.ModelProto() + sentencepiece_model = ( + model.ModelProto() + ) # pyright: ignore[reportAttributeAccessIssue] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - assert sentencepiece_model.trainer_spec.model_type == 1 + assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix remove_whitespaces = ( @@ 
-3039,6 +3617,7 @@ def set_vocab(self): scores.append(-1000.0) toktypes.append(SentencePieceTokenTypes.UNUSED) + # realign tokens (see HF tokenizer code) tokens = [b"", b"", b"", b""] + tokens[3:-1] scores = [0.0, 0.0, 0.0, 0.0] + scores[3:-1] toktypes = [ @@ -3054,7 +3633,7 @@ def set_vocab(self): self.gguf_writer.add_token_scores(scores) self.gguf_writer.add_token_types(toktypes) self.gguf_writer.add_add_space_prefix(add_prefix) - self.gguf_writer.add_token_type_count(1) + self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces) if precompiled_charsmap: self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap) @@ -3068,7 +3647,12 @@ def set_vocab(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: + # if name starts with "roberta.", remove the prefix + # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main + if name.startswith("roberta."): + name = name[8:] + # position embeddings start at pad_token_id + 1, so just chop down the weight tensor if name == "embeddings.position_embeddings.weight": if self._position_offset is not None: data_torch = data_torch[self._position_offset :, :] @@ -3083,6 +3667,7 @@ class GemmaModel(Model): def set_vocab(self): self._set_vocab_sentencepiece() + # TODO: these special tokens should be exported only for the CodeGemma family special_vocab = gguf.SpecialVocab( self.dir_model, load_merges=False, @@ -3093,7 +3678,7 @@ def set_vocab(self): special_vocab._set_special_token("middle", 68) special_vocab._set_special_token("fsep", 70) special_vocab._set_special_token("eot", 107) - special_vocab.chat_template = None + special_vocab.chat_template = None # do not add it twice special_vocab.add_to_gguf(self.gguf_writer) self.gguf_writer.add_add_space_prefix(False) @@ -3120,14 +3705,17 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused + # lm_head is not used in llama.cpp, while autoawq will include this tensor in model + # To prevent errors, skip loading lm_head.weight. if name == "lm_head.weight": logger.debug( f"Skipping get tensor {name!r} in safetensors so that convert can end normally." ) return [] + # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 if name.endswith("norm.weight"): data_torch = data_torch + 1 @@ -3172,14 +3760,17 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused + # lm_head is not used in llama.cpp, while autoawq will include this tensor in model + # To prevent errors, skip loading lm_head.weight. if name == "lm_head.weight": logger.debug( f"Skipping get tensor {name!r} in safetensors so that convert can end normally." 
) return [] + # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 if name.endswith("norm.weight"): data_torch = data_torch + 1 @@ -3215,7 +3806,7 @@ def set_vocab(self): token = token.encode("utf-8") if isinstance(token, str) else token assert isinstance(token, bytes) assert len(token) == token_len - token_text: str = repr(token)[2:-1] + token_text: str = repr(token)[2:-1] # "b'\xff'" -> "\xff" tokens.append(token_text.encode("utf-8")) toktypes.append(gguf.TokenType.NORMAL) remainder = vocab_size - len(tokens) @@ -3228,6 +3819,9 @@ def set_vocab(self): self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.chat_template = "rwkv-world" + # hack: Add '\n\n' as the EOT token to make it chat normally + special_vocab._set_special_token("eot", 261) special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): @@ -3244,6 +3838,7 @@ def set_gguf_parameters(self): time_mix_extra_dim = 64 if hidden_size == 4096 else 32 time_decay_extra_dim = 128 if hidden_size == 4096 else 64 + # RWKV isn't context limited self.gguf_writer.add_context_length(1048576) self.gguf_writer.add_embedding_length(hidden_size) self.gguf_writer.add_block_count(block_count) @@ -3255,6 +3850,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_feed_forward_length(intermediate_size) self.gguf_writer.add_file_type(self.ftype) + # required by llama.cpp, unused self.gguf_writer.add_head_count(0) def modify_tensors( @@ -3275,6 +3871,9 @@ def modify_tensors( if new_name.endswith("time_mix_w2.weight"): data_torch = data_torch.permute(0, 2, 1) + if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name: + data_torch = data_torch.squeeze() + rescale_every_n_layers = self.hparams["rescale_every"] if rescale_every_n_layers > 0: if new_name.endswith("time_mix_output.weight") or new_name.endswith( @@ -3291,9 +3890,10 @@ class MambaModel(Model): def set_vocab(self): vocab_size = self.hparams["vocab_size"] - + # Round vocab size to next multiple of 8 pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8) - + # pad using ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 vocab_size = -(vocab_size // -pad_vocab) * pad_vocab self.hparams["vocab_size"] = vocab_size @@ -3302,7 +3902,7 @@ def set_vocab(self): elif (self.dir_model / "tokenizer.model").is_file(): self._set_vocab_sentencepiece() else: - + # Use the GPT-NeoX tokenizer when no tokenizer files are present self._set_vocab_builtin("gpt-neox", vocab_size) def set_gguf_parameters(self): @@ -3313,7 +3913,9 @@ def set_gguf_parameters(self): or 2 * d_model ) d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 - + # ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 + # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -( d_model // -16 ) @@ -3322,23 +3924,31 @@ def set_gguf_parameters(self): or 1e-5 ) use_dt_b_c_norm = False - + # For falconmamba we do apply RMS norm on B / DT and C layers if self.find_hparam(["model_type"], optional=True) in ("falcon_mamba",): use_dt_b_c_norm = True - + # Fail early for models which don't have a block expansion factor of 2 assert d_inner == 2 * d_model - self.gguf_writer.add_context_length(2**20) + 
self.gguf_writer.add_context_length( + 2**20 + ) # arbitrary value; for those who use the default self.gguf_writer.add_embedding_length(d_model) - self.gguf_writer.add_feed_forward_length(0) - self.gguf_writer.add_head_count(0) + self.gguf_writer.add_feed_forward_length( + 0 + ) # unused, but seemingly required when loading + self.gguf_writer.add_head_count( + 0 + ) # unused, but seemingly required when loading self.gguf_writer.add_block_count(self.block_count) self.gguf_writer.add_ssm_conv_kernel(d_conv) self.gguf_writer.add_ssm_inner_size(d_inner) self.gguf_writer.add_ssm_state_size(d_state) self.gguf_writer.add_ssm_time_step_rank(dt_rank) self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) - self.gguf_writer.add_ssm_dt_b_c_rms(use_dt_b_c_norm) + self.gguf_writer.add_ssm_dt_b_c_rms( + use_dt_b_c_norm + ) # For classic Mamba we don't apply rms norm on B / DT layers self.gguf_writer.add_file_type(self.ftype) _tok_embd = None @@ -3346,7 +3956,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD) @@ -3357,6 +3967,7 @@ def modify_tensors( logger.debug("A_log --> A ==> " + new_name) data_torch = -torch.exp(data_torch) + # assuming token_embd.weight is seen before output.weight if self._tok_embd is not None and new_name == output_name: if torch.equal(self._tok_embd, data_torch): logger.debug( @@ -3376,6 +3987,9 @@ class CommandR2Model(Model): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + # max_position_embeddings = 8192 in config.json but model was actually + # trained on 128k context length + # aya-23 models don't have model_max_length specified self.hparams["max_position_embeddings"] = self.find_hparam( ["model_max_length", "max_position_embeddings"] ) @@ -3386,6 +4000,26 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) +@Model.register("Cohere2ForCausalLM") +class Cohere2Model(Model): + model_arch = gguf.MODEL_ARCH.COHERE2 + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + self.gguf_writer.add_logit_scale(self.hparams["logit_scale"]) + self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + + rotary_pct = self.hparams["rotary_pct"] + hidden_size = self.hparams["hidden_size"] + num_attention_heads = self.hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count( + int(rotary_pct * (hidden_size // num_attention_heads)) + ) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + @Model.register("OlmoForCausalLM") @Model.register("OLMoForCausalLM") class OlmoModel(Model): @@ -3398,10 +4032,12 @@ def set_gguf_parameters(self): if clip_qkv is not None: self.gguf_writer.add_clamp_kqv(clip_qkv) + # Same as super class, but permuting q_proj, k_proj + # Copied from: LlamaModel def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") @@ -3414,6 +4050,11 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] +@Model.register("Olmo2ForCausalLM") +class Olmo2Model(Model): + model_arch = gguf.MODEL_ARCH.OLMO2 + + @Model.register("OlmoeForCausalLM") class 
OlmoeModel(Model): model_arch = gguf.MODEL_ARCH.OLMOE @@ -3426,10 +4067,11 @@ def set_gguf_parameters(self): _experts: list[dict[str, Tensor]] | None = None + # Copied from: Qwen2MoeModel def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - + # process the experts separately if name.find("experts") != -1: n_experts = self.hparams["num_experts"] assert bid is not None @@ -3442,6 +4084,7 @@ def modify_tensors( if len(self._experts[bid]) >= n_experts * 3: tensors: list[tuple[str, Tensor]] = [] + # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -3463,11 +4106,12 @@ def modify_tensors( return [(self.map_tensor_name(name), data_torch)] + # Copied from: Qwen2MoeModel def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: - + # flatten `list[dict[str, Tensor]]` into `list[str]` experts = [k for d in self._experts for k in d.keys()] if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") @@ -3513,6 +4157,16 @@ def set_vocab(self): self.gguf_writer.add_add_bos_token(True) self.gguf_writer.add_add_eos_token(True) + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + # if name starts with "bert.", remove the prefix + # e.g. https://huggingface.co/jinaai/jina-reranker-v1-tiny-en + if name.startswith("bert."): + name = name[5:] + + return super().modify_tensors(data_torch, name, bid) + @Model.register("OpenELMForCausalLM") class OpenELMModel(Model): @@ -3520,9 +4174,9 @@ class OpenELMModel(Model): @staticmethod def _make_divisible(v: float | int, divisor: int) -> int: - + # ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38 new_v = max(divisor, int(v + divisor / 2) // divisor * divisor) - + # Make sure that round down does not go down by more than 10%. if new_v < 0.9 * v: new_v += divisor return new_v @@ -3546,6 +4200,7 @@ def __init__(self, *args, **kwargs): self._num_query_heads[0], int ) + # Uses the tokenizer from meta-llama/Llama-2-7b-hf def set_vocab(self): try: self._set_vocab_sentencepiece() @@ -3567,7 +4222,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_head_count(self._num_query_heads) self.gguf_writer.add_head_count_kv(self._num_kv_heads) self.gguf_writer.add_rope_freq_base(self.hparams["rope_freq_constant"]) - + # https://huggingface.co/apple/OpenELM-270M-Instruct/blob/c401df2/modeling_openelm.py#L30 self.gguf_writer.add_layer_norm_rms_eps(1e-6) self.gguf_writer.add_rope_dimension_count(int(rot_pct * head_dim)) self.gguf_writer.add_key_length(head_dim) @@ -3584,6 +4239,7 @@ def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: + # split ff if bid is not None and name == f"transformer.layers.{bid}.ffn.proj_1.weight": ff_dim = self._ffn_dims[bid] yield ( @@ -3604,7 +4260,9 @@ class ArcticModel(Model): model_arch = gguf.MODEL_ARCH.ARCTIC def set_vocab(self): - + # The reason for using a custom implementation here is that the + # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from + # tokenizer.model and used them as BOS and EOS instead of adding new tokens. 
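+ # Rough illustration (values are made up, not copied from the Arctic repo) of the kind of
+ # "added_tokens_decoder" entry in tokenizer_config.json that this custom set_vocab consumes:
+ #   "added_tokens_decoder": {
+ #     "31998": { "content": "<|im_start|>", "special": true },
+ #     "31999": { "content": "<|im_end|>", "special": true }
+ #   }
+ # Entries flagged "special" are remapped below to CONTROL (or UNKNOWN for the unk_token) with
+ # score 0.0; other added tokens become USER_DEFINED with a score of -10000.0.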
from sentencepiece import SentencePieceProcessor tokenizer_path = self.dir_model / "tokenizer.model" @@ -3613,6 +4271,7 @@ def set_vocab(self): logger.error(f"Error: Missing {tokenizer_path}") sys.exit(1) + # Read the whole vocabulary from the tokenizer.model file tokenizer = SentencePieceProcessor() tokenizer.LoadFromFile(str(tokenizer_path)) @@ -3642,6 +4301,8 @@ def set_vocab(self): scores[token_id] = score toktypes[token_id] = toktype + # Use the added_tokens_decoder field from tokeniser_config.json as the source + # of information about added/redefined tokens and modify them accordingly. tokenizer_config_file = self.dir_model / "tokenizer_config.json" if tokenizer_config_file.is_file(): with open(tokenizer_config_file, "r", encoding="utf-8") as f: @@ -3661,6 +4322,8 @@ def set_vocab(self): token_type = SentencePieceTokenTypes.USER_DEFINED token_score = -10000.0 + # Map unk_token to UNKNOWN, other special tokens to CONTROL + # Set the score to 0.0 as in the original tokenizer.model if ("special" in token_json) and token_json["special"]: if token_content == tokenizer_config_json["unk_token"]: token_type = SentencePieceTokenTypes.UNKNOWN @@ -3705,6 +4368,7 @@ def modify_tensors( if name.endswith("k_proj.weight"): data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + # process the experts separately if name.find("block_sparse_moe.experts") != -1: n_experts = self.hparams["num_local_experts"] @@ -3718,6 +4382,7 @@ def modify_tensors( if len(self._experts[bid]) >= n_experts * 3: tensors: list[tuple[str, Tensor]] = [] + # merge the experts into a single 3d tensor for wid in ["w1", "w2", "w3"]: datas: list[Tensor] = [] @@ -3743,13 +4408,113 @@ def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@Model.register("DeepseekForCausalLM") +class DeepseekModel(Model): + model_arch = gguf.MODEL_ARCH.DEEPSEEK + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + if "head_dim" in hparams: + rope_dim = hparams["head_dim"] + else: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + + self.gguf_writer.add_rope_dimension_count(rope_dim) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_expert_feed_forward_length( + hparams["moe_intermediate_size"] + ) + self.gguf_writer.add_expert_weights_scale(1.0) + self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) + self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) + + _experts: list[dict[str, Tensor]] | None = None + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return ( + weights.reshape( + n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] + ) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + + if 
name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = DeepseekModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head) + + # process the experts separately + if name.find("mlp.experts") != -1: + n_experts = self.hparams["n_routed_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` experts = [k for d in self._experts for k in d.keys()] if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") @Model.register("DeepseekV2ForCausalLM") +@Model.register("DeepseekV3ForCausalLM") class DeepseekV2Model(Model): model_arch = gguf.MODEL_ARCH.DEEPSEEK2 @@ -3775,6 +4540,17 @@ def set_gguf_parameters(self): self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"]) + self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"]) + + if hparams["scoring_func"] == "sigmoid": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + elif hparams["scoring_func"] == "softmax": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) + else: + raise ValueError( + f"Unsupported scoring_func value: {hparams['scoring_func']}" + ) + self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) if ( @@ -3798,7 +4574,17 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: + # rename e_score_correction_bias tensors + if name.endswith("e_score_correction_bias"): + name = name.replace("e_score_correction_bias", "e_score_correction.bias") + + # skip Multi-Token Prediction (MTP) layers + block_count = self.hparams["num_hidden_layers"] + match = re.match(r"model.layers.(\d+)", name) + if match and int(match.group(1)) >= block_count: + return [] + # process the experts separately if name.find("mlp.experts") != -1: n_experts = self.hparams["n_routed_experts"] assert bid is not None @@ -3811,6 +4597,7 @@ def modify_tensors( if len(self._experts[bid]) >= n_experts * 3: tensors: list[tuple[str, Tensor]] = [] + # merge the experts into a single 3d tensor for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] @@ -3836,7 +4623,7 @@ def prepare_tensors(self): super().prepare_tensors() if self._experts is not None: - + # flatten `list[dict[str, Tensor]]` into `list[str]` experts = [k for d in self._experts for k in d.keys()] if len(experts) > 0: raise 
ValueError(f"Unprocessed experts: {experts}") @@ -3854,28 +4641,33 @@ def __init__(self, *args, **kwargs): self.shared_token_embeddings_found = False def set_vocab(self): - + # to avoid TypeError: Descriptors cannot be created directly + # exception when importing sentencepiece_model_pb2 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" from sentencepiece import SentencePieceProcessor from sentencepiece import sentencepiece_model_pb2 as model tokenizer_path = self.dir_model / "tokenizer.model" + # many older models use spiece.model tokenizer model filename if not tokenizer_path.is_file(): tokenizer_path = self.dir_model / "spiece.model" if not tokenizer_path.is_file(): raise FileNotFoundError(f"File not found: {tokenizer_path}") - sentencepiece_model = model.ModelProto() + sentencepiece_model = ( + model.ModelProto() + ) # pyright: ignore[reportAttributeAccessIssue] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - if sentencepiece_model.trainer_spec.model_type == 2: - + # some models like Pile-T5 family use BPE tokenizer instead of Unigram + if sentencepiece_model.trainer_spec.model_type == 2: # BPE + # assure the tokenizer model file name is correct assert tokenizer_path.name == "tokenizer.model" return self._set_vocab_sentencepiece() else: - assert sentencepiece_model.trainer_spec.model_type == 1 + assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix remove_whitespaces = ( @@ -3979,8 +4771,12 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused + # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", + # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored + # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder + # and decoder and ignore the remaining ones. 
if name in [ "decoder.embed_tokens.weight", "encoder.embed_tokens.weight", @@ -4007,28 +4803,33 @@ def __init__(self, *args, **kwargs): self.shared_token_embeddings_found = False def set_vocab(self): - + # to avoid TypeError: Descriptors cannot be created directly + # exception when importing sentencepiece_model_pb2 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" from sentencepiece import SentencePieceProcessor from sentencepiece import sentencepiece_model_pb2 as model tokenizer_path = self.dir_model / "tokenizer.model" + # many older models use spiece.model tokenizer model filename if not tokenizer_path.is_file(): tokenizer_path = self.dir_model / "spiece.model" if not tokenizer_path.is_file(): raise FileNotFoundError(f"File not found: {tokenizer_path}") - sentencepiece_model = model.ModelProto() + sentencepiece_model = ( + model.ModelProto() + ) # pyright: ignore[reportAttributeAccessIssue] sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) - if sentencepiece_model.trainer_spec.model_type == 2: - + # some models like Pile-T5 family use BPE tokenizer instead of Unigram + if sentencepiece_model.trainer_spec.model_type == 2: # BPE + # assure the tokenizer model file name is correct assert tokenizer_path.name == "tokenizer.model" return self._set_vocab_sentencepiece() else: - assert sentencepiece_model.trainer_spec.model_type == 1 + assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix remove_whitespaces = ( @@ -4129,8 +4930,12 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused + # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", + # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored + # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder + # and decoder and ignore the remaining ones. 
if name in [ "decoder.embed_tokens.weight", "encoder.embed_tokens.weight", @@ -4155,15 +4960,14 @@ class JaisModel(Model): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + # SwigLU activation assert self.hparams["activation_function"] == "swiglu" - + # ALiBi position embedding assert self.hparams["position_embedding_type"] == "alibi" + # Embeddings scale self.embeddings_scale = 1.0 - - self.output_is_wte = False if "mup_embeddings_scale" in self.hparams: - self.output_is_wte = True self.embeddings_scale = self.hparams["mup_embeddings_scale"] elif "embeddings_scale" in self.hparams: self.embeddings_scale = self.hparams["embeddings_scale"] @@ -4198,15 +5002,19 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused tensors: list[tuple[str, Tensor]] = [] + # we don't need these if name.endswith((".attn.bias")): return tensors if name.endswith(("relative_pe.slopes")): - + # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation) + # Some other models has max_alibi_bias spelled out explicitly in the hyperparams, + # but Jais's PyTorch model simply precalculates the slope values and places them + # in relative_pes.slopes n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"])) first_val = float(data_torch[0].item()) self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2) @@ -4222,15 +5030,7 @@ def modify_tensors( if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): tensors.append((new_name, data_torch * self.embeddings_scale)) - if self.output_is_wte: - tensors.append( - ( - self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), - data_torch * self.width_scale, - ) - ) elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT): - assert not self.output_is_wte tensors.append((new_name, data_torch * self.width_scale)) else: tensors.append((new_name, data_torch)) @@ -4282,7 +5082,8 @@ def set_vocab_chatglm3(self): text = piece.encode("utf-8") score = 0.0 - + # Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py), + # it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size() if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size(): score = tokenizer.tokenizer.sp_model.get_score(token_id) @@ -4314,7 +5115,8 @@ def set_vocab_chatglm3(self): toktypes.append(toktype) self.gguf_writer.add_tokenizer_model("llama") - + # glm3 needs prefix and suffix formatted as: + # prompt = "[gMASK]sop<|user|>\n" + prompt + "<|assistant|>" self.gguf_writer.add_tokenizer_pre("chatglm-spm") self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_scores(scores) @@ -4382,6 +5184,7 @@ def set_vocab(self): assert len(merged) >= 2 and len(merged) <= 7 merges.append(" ".join(map(ChatGLMModel.token_bytes_to_string, merged))) + # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined added_vocab = tokenizer.get_added_vocab() reverse_vocab = { id_: encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items() @@ -4408,12 +5211,12 @@ def set_vocab(self): special_vocab = gguf.SpecialVocab(dir_model, load_merges=False) special_vocab.merges = merges - + # only add special tokens when they were not already loaded from config.json special_vocab._set_special_token( "eos", tokenizer.get_added_vocab()["<|endoftext|>"] ) special_vocab._set_special_token("eot", 
tokenizer.get_added_vocab()["<|user|>"]) - + # this one is usually not in config.json anyway special_vocab._set_special_token( "unk", tokenizer.get_added_vocab()["<|endoftext|>"] ) @@ -4443,7 +5246,7 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - del bid + del bid # unused if name.endswith(".rotary_pos_emb.inv_freq"): return [] @@ -4471,6 +5274,7 @@ def set_gguf_parameters(self): ) self.gguf_writer.add_layer_norm_eps(f_norm_eps) + # * Partial RoPE rot_pct = self.find_hparam( ["partial_rotary_factor", "rope_pct", "rope_percent"] ) @@ -4478,6 +5282,7 @@ def set_gguf_parameters(self): n_head = self.find_hparam(["num_attention_heads", "n_head"]) self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) + # * RopeScaling for Nemotron if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None: self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) else: @@ -4487,7 +5292,10 @@ def set_gguf_parameters(self): def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - + # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side + # model.layers.{l}.input_layernorm.weight + # model.layers.{l}.post_attention_layernorm.weight + # model.norm.weight if name.endswith("norm.weight"): data_torch = data_torch + 1 @@ -4514,7 +5322,10 @@ def set_gguf_parameters(self): else 4 * embed_dim ) num_layers = hparams["num_layers"] - + # ignore for now as EXAONE-3.0-7.8B-Instruct attentino_dropout is 0.0 + # attention_dropout_rate = hparams["attention_dropout"] + # ignore for now as EXAONE-3.0-7.8B-Instruct embed_dropout is 0.0 + # embed_dropout_rate = hparams["embed_dropout"] self.gguf_writer.add_embedding_length(embed_dim) self.gguf_writer.add_head_count(num_heads) self.gguf_writer.add_head_count_kv(num_kv_heads) @@ -4546,7 +5357,7 @@ def set_gguf_parameters(self): hparams["rope_scaling"]["factor"] ) - def prepare_tensors(self): + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): if rope_scaling.get("rope_type", "").lower() == "llama3": base = self.hparams.get("rope_theta", 10000.0) @@ -4582,13 +5393,10 @@ def prepare_tensors(self): ) rope_factors.append(1 / ((1 - smooth) / factor + smooth)) - if not self.is_lora: - self.gguf_writer.add_tensor( - self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), - np.array(rope_factors, dtype=np.float32), - ) - - super().prepare_tensors() + yield ( + self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), + torch.tensor(rope_factors, dtype=torch.float32), + ) @Model.register("GraniteForCausalLM") @@ -4610,151 +5418,128 @@ def set_gguf_parameters(self): if head_dim := self.hparams.pop("head_dim", None): logger.warning("Ignoring head_dim (%s) from config for Granite", head_dim) super().set_gguf_parameters() - + # NOTE: Convert _multiplier params to _scale params for naming + # consistency if attention_scale := self.hparams.get("attention_multiplier"): self.gguf_writer.add_attention_scale(attention_scale) + logger.info("gguf: (granite) attention_scale = %s", attention_scale) if embedding_scale := self.hparams.get("embedding_multiplier"): self.gguf_writer.add_embedding_scale(embedding_scale) + logger.info("gguf: (granite) embedding_scale = %s", embedding_scale) if residual_scale := self.hparams.get("residual_multiplier"): 
self.gguf_writer.add_residual_scale(residual_scale) - if logits_scaling := self.hparams.get("logits_scaling"): - self.gguf_writer.add_logit_scale(logits_scaling) + logger.info("gguf: (granite) residual_scale = %s", residual_scale) + if logits_scale := self.hparams.get("logits_scaling"): + self.gguf_writer.add_logit_scale(logits_scale) + logger.info("gguf: (granite) logits_scale = %s", logits_scale) -@Model.register("JambaForCausalLM") -class JambaModel(Model): - model_arch = gguf.MODEL_ARCH.JAMBA +@Model.register("GraniteMoeForCausalLM") +class GraniteMoeModel(GraniteModel): + """Conversion for IBM's GraniteMoeForCausalLM""" - def get_vocab_base_pre(self, tokenizer) -> str: - del tokenizer - - return "gpt-2" - - def set_vocab(self): - if (self.dir_model / "tokenizer.model").is_file(): - - self._set_vocab_sentencepiece() - else: - - self._set_vocab_gpt2() - - def set_gguf_parameters(self): - d_model = self.find_hparam(["hidden_size", "mamba_d_model"]) - d_conv = self.find_hparam(["mamba_d_conv"], optional=True) or 4 - d_inner = self.hparams["mamba_expand"] * d_model - d_state = self.find_hparam(["mamba_d_state"], optional=True) or 16 - - dt_rank = self.find_hparam(["mamba_dt_rank"], optional=True) or -( - d_model // -16 - ) - rms_norm_eps = ( - self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) - or 1e-6 - ) - n_kv_head = self.hparams["num_key_value_heads"] - attn_offset = self.hparams["attn_layer_offset"] - attn_period = self.hparams["attn_layer_period"] - n_kv_vec = [0 for _ in range(attn_offset)] + [ - n_kv_head if (i - attn_offset) % attn_period == 0 else 0 - for i in range(attn_offset, self.block_count) - ] - - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_context_length( - self.find_hparam(["max_position_embeddings", "n_ctx"]) - ) - self.gguf_writer.add_embedding_length(d_model) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(n_kv_vec) - self.gguf_writer.add_ssm_conv_kernel(d_conv) - self.gguf_writer.add_ssm_inner_size(d_inner) - self.gguf_writer.add_ssm_state_size(d_state) - self.gguf_writer.add_ssm_time_step_rank(dt_rank) - self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) - self.gguf_writer.add_expert_count(self.hparams["num_experts"]) - self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"]) - self.gguf_writer.add_file_type(self.ftype) - - _experts: list[dict[str, Tensor]] | None = None + model_arch = gguf.MODEL_ARCH.GRANITE_MOE def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: + """In modeling_granitemoe, the JetMoe implementation of parallel experts + is used. This essentially merges w1 and w3 into a single tensor with 2x + the hidden size that is then split during forward. To keep compatibility + with existing mixtral support, we pull them apart here. + """ - name = name.replace(".moe.", ".feed_forward.") - if bid is not None: - moe_offset = self.hparams["expert_layer_offset"] - moe_period = self.hparams["expert_layer_period"] - - if not (bid >= moe_offset and (bid - moe_offset) % moe_period == 0): - name = name.replace(".experts.0.", ".") - - if ".feed_forward.experts." 
in name: - n_experts = self.hparams["num_experts"] - - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: + if name.endswith("block_sparse_moe.input_linear.weight"): + ffn_dim = self.hparams["intermediate_size"] + assert ( + data_torch.shape[-2] == 2 * ffn_dim + ), "Merged FFN tensor size must be 2 * intermediate_size" + gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :] + return [ + (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate), + (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up), + ] - for wid in ["down_proj", "gate_proj", "up_proj"]: - datas: list[Tensor] = [] + return super().modify_tensors(data_torch, name, bid) - for xid in range(n_experts): - ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{wid}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - data_torch = torch.stack(datas, dim=0) +@Model.register("ChameleonForConditionalGeneration") +@Model.register("ChameleonForCausalLM") # obsolete +class ChameleonModel(Model): + model_arch = gguf.MODEL_ARCH.CHAMELEON - merged_name = f"model.layers.{bid}.mlp.experts.{wid}.weight" + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_swin_norm(self.hparams.get("swin_norm", False)) - new_name = self.map_tensor_name(merged_name) + def set_vocab(self): + self._set_vocab_gpt2() - yield new_name, data_torch - return + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + # ignore image tokenizer for now + # TODO: remove this once image support is implemented for Chameleon + if name.startswith("model.vqmodel"): + return [] - new_name = self.map_tensor_name(name) + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + hidden_dim = self.hparams.get("hidden_size") - if name.endswith(".A_log"): - logger.debug("A_log --> A ==> " + new_name) - data_torch = -torch.exp(data_torch) + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + if name.endswith(("q_norm.weight", "q_norm.bias")): + data_torch = ChameleonModel._reverse_hf_permute( + data_torch, n_head, hidden_dim + ) + if name.endswith(("k_norm.weight", "k_norm.bias")): + data_torch = ChameleonModel._reverse_hf_permute( + data_torch, n_kv_head, hidden_dim + ) - yield new_name, data_torch + return [(self.map_tensor_name(name), data_torch)] - def prepare_tensors(self): - super().prepare_tensors() + # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203 + @staticmethod + def _reverse_hf_permute(data_torch, n_heads, hidden_dim): + head_dim = hidden_dim // n_heads + data_torch = data_torch[0].view(2, head_dim // 2).t().reshape(1, -1) + data_torch = data_torch.repeat_interleave(n_heads, 0) + return data_torch - if self._experts is not None: - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") +###### CONVERSION LOGIC ###### +# tree of lazy tensors class LazyTorchTensor(gguf.LazyBase): _tensor_type = torch.Tensor - + # to keep the type-checker happy 
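+ # (The class-level annotations below mirror torch.Tensor's attributes so that static type
+ # checkers accept LazyTorchTensor wherever a Tensor is expected. A minimal hedged sketch of
+ # the lazy wrapper in use, assuming an eager tensor `t`:
+ #   lazy = LazyTorchTensor.from_eager(t)   # wrap; data is not copied or converted yet
+ #   lazy.dtype, lazy.shape                 # metadata is available without materializing
+ # Actual evaluation is deferred until the GGUF writer consumes the tensor.)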
dtype: torch.dtype shape: torch.Size + # only used when converting a torch.Tensor to a np.ndarray _dtype_map: dict[torch.dtype, type] = { torch.float16: np.float16, torch.float32: np.float32, } + # used for safetensors slices + # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046 + # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734 _dtype_str_map: dict[str, torch.dtype] = { "F64": torch.float64, "F32": torch.float32, "BF16": torch.bfloat16, "F16": torch.float16, + # "U64": torch.uint64, "I64": torch.int64, + # "U32": torch.uint32, "I32": torch.int32, + # "U16": torch.uint16, "I16": torch.int16, "U8": torch.uint8, "I8": torch.int8, @@ -4790,7 +5575,7 @@ def from_safetensors_slice(cls, st_slice: Any) -> Tensor: @classmethod def __torch_function__(cls, func, types, args=(), kwargs=None): - del types + del types # unused if kwargs is None: kwargs = {} @@ -4808,60 +5593,77 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--vocab-only", action="store_true", + help="extract only the vocab", ) parser.add_argument( "--outfile", type=Path, + help="path to write to; default: based on input. {ftype} will be replaced by the outtype.", ) parser.add_argument( "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16", + help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", ) parser.add_argument( "--bigendian", action="store_true", + help="model is executed on big endian machine", ) parser.add_argument( "model", type=Path, + help="directory containing model file", + ) + parser.add_argument( + "--use-temp-file", + action="store_true", + help="use the tempfile library while processing (helpful when running out of memory, process killed)", ) - parser.add_argument("--use-temp-file", action="store_true") parser.add_argument( "--no-lazy", action="store_true", + help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)", ) parser.add_argument( "--model-name", type=str, default=None, + help="name of the model", ) parser.add_argument( "--verbose", action="store_true", + help="increase output verbosity", ) parser.add_argument( "--split-max-tensors", type=int, default=0, + help="max tensors in each split", ) parser.add_argument( "--split-max-size", type=str, default="0", + help="max size per split N(M|G)", ) parser.add_argument( "--dry-run", action="store_true", + help="only print out a split plan and exit, without writing any new files", ) parser.add_argument( "--no-tensor-first-split", action="store_true", + help="do not add tensors to the first split (disabled by default)", ) parser.add_argument( "--metadata", type=Path, + help="Specify the path for an authorship metadata override file", ) return parser.parse_args() diff --git a/src/convert_lora_to_gguf.py b/src/convert_lora_to_gguf.py index d5354d2..e132412 100644 --- a/src/convert_lora_to_gguf.py +++ b/src/convert_lora_to_gguf.py @@ -18,14 +18,16 @@ SupportsIndex, cast, ) +from transformers import AutoConfig import torch if TYPE_CHECKING: from torch import Tensor -from gguf.constants import * +import gguf +# reuse model definitions from convert_hf_to_gguf.py from convert_hf_to_gguf import LazyTorchTensor, Model logger = logging.getLogger("lora-to-gguf") @@ -37,9 +39,10 @@ class 
PartialLoraTensor: B: Tensor | None = None +# magic to support tensor shape modifications and splitting class LoraTorchTensor: - _lora_A: Tensor - _lora_B: Tensor + _lora_A: Tensor # (n_rank, row_size) + _lora_B: Tensor # (col_size, n_rank) _rank: int def __init__(self, A: Tensor, B: Tensor): @@ -57,14 +60,20 @@ def get_lora_A_B(self) -> tuple[Tensor, Tensor]: def __getitem__( self, - indices: SupportsIndex | slice | tuple[SupportsIndex | slice | Tensor, ...], + indices: ( + SupportsIndex + | slice + | tuple[ + SupportsIndex | slice | Tensor, ... + ] # TODO: add ellipsis in the type signature + ), ) -> LoraTorchTensor: shape = self.shape if isinstance(indices, SupportsIndex): if len(shape) > 2: return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices]) else: - raise NotImplementedError + raise NotImplementedError # can't return a vector elif isinstance(indices, slice): if len(shape) > 2: return LoraTorchTensor(self._lora_A[indices], self._lora_B[indices]) @@ -74,7 +83,7 @@ def __getitem__( assert len(indices) > 0 if indices[-1] is Ellipsis: return self[indices[:-1]] - + # expand ellipsis indices = tuple( u for v in ( @@ -94,6 +103,7 @@ def __getitem__( *(slice(None, None) for _ in range(len(indices), len(shape))), ) + # TODO: make sure this is correct indices_A = ( *( ( @@ -109,7 +119,7 @@ def __getitem__( indices_B = indices[:-1] return LoraTorchTensor(self._lora_A[indices_A], self._lora_B[indices_B]) else: - raise NotImplementedError + raise NotImplementedError # unknown indice type @property def dtype(self) -> torch.dtype: @@ -132,8 +142,9 @@ def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor: new_shape = cast(tuple[int, ...], shape) orig_shape = self.shape if len(new_shape) < 2: - raise NotImplementedError + raise NotImplementedError # can't become a vector + # expand -1 in the shape if any(dim == -1 for dim in new_shape): n_elems = prod(orig_shape) n_new_elems = prod(dim if dim != -1 else 1 for dim in new_shape) @@ -143,7 +154,7 @@ def reshape(self, *shape: int | tuple[int, ...]) -> LoraTorchTensor: ) if new_shape[-1] != orig_shape[-1]: - raise NotImplementedError + raise NotImplementedError # can't reshape the row size trivially shape_A = (*(1 for _ in new_shape[:-2]), self._rank, orig_shape[-1]) shape_B = (*new_shape[:-1], self._rank) @@ -162,7 +173,7 @@ def permute(self, *dims: int) -> LoraTorchTensor: shape = self.shape dims = tuple(dim - len(shape) if dim >= 0 else dim for dim in dims) if dims[-1] == -1: - + # TODO: support higher dimensional A shapes bigger than 1 assert all(dim == 1 for dim in self._lora_A.shape[:-2]) return LoraTorchTensor(self._lora_A, self._lora_B.permute(*dims)) if len(shape) == 2 and dims[-1] == -2 and dims[-2] == -1: @@ -170,7 +181,7 @@ def permute(self, *dims: int) -> LoraTorchTensor: self._lora_B.permute(*dims), self._lora_A.permute(*dims) ) else: - + # TODO: compose the above two raise NotImplementedError def transpose(self, dim0: int, dim1: int) -> LoraTorchTensor: @@ -189,7 +200,7 @@ def to(self, *args, **kwargs): @classmethod def __torch_function__(cls, func: Callable, types, args=(), kwargs=None): - del types + del types # unused if kwargs is None: kwargs = {} @@ -230,28 +241,73 @@ def get_base_tensor_name(lora_tensor_name: str) -> str: base_name = lora_tensor_name.replace("base_model.model.", "") base_name = base_name.replace(".lora_A.weight", ".weight") base_name = base_name.replace(".lora_B.weight", ".weight") + # models produced by mergekit-extract-lora have token embeddings in the adapter + base_name = 
base_name.replace(".lora_embedding_A", ".weight") + base_name = base_name.replace(".lora_embedding_B", ".weight") return base_name def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser() - parser.add_argument("--outfile", type=Path) + parser = argparse.ArgumentParser( + description="Convert a Hugging Face PEFT LoRA adapter to a GGUF file" + ) + parser.add_argument( + "--outfile", + type=Path, + help="path to write to; default: based on input. {ftype} will be replaced by the outtype.", + ) parser.add_argument( "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16", + help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", + ) + parser.add_argument( + "--bigendian", + action="store_true", + help="model is executed on big endian machine", + ) + parser.add_argument( + "--no-lazy", + action="store_true", + help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="increase output verbosity", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="only print out what will be done, without writing any new files", + ) + parser.add_argument( + "--base", + type=Path, + help="directory containing Hugging Face model config files (config.json, tokenizer.json) for the base model that the adapter is based on - only config is needed, actual model weights are not required. If base model is unspecified, it will be loaded from Hugging Face hub based on the adapter config", + ) + parser.add_argument( + "--base-model-id", + type=str, + help="the model ID of the base model, if it is not available locally or in the adapter config. If specified, it will ignore --base and load the base model config from the Hugging Face hub (Example: 'meta-llama/Llama-3.2-1B-Instruct')", + ) + parser.add_argument( + "lora_path", + type=Path, + help="directory containing Hugging Face PEFT LoRA config (adapter_model.json) and weights (adapter_model.safetensors or adapter_model.bin)", ) - parser.add_argument("--bigendian", action="store_true") - parser.add_argument("--no-lazy", action="store_true") - parser.add_argument("--verbose", action="store_true") - parser.add_argument("--dry-run", action="store_true") - parser.add_argument("--base", type=Path, required=True) - parser.add_argument("lora_path", type=Path) return parser.parse_args() +def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]: + # normally, adapter does not come with base model config, we need to load it from AutoConfig + config = AutoConfig.from_pretrained(hf_model_id) + return config.to_dict() + + if __name__ == "__main__": args = parse_args() logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) @@ -266,19 +322,20 @@ def parse_args() -> argparse.Namespace: ftype = ftype_map[args.outtype] - dir_base_model: Path = args.base + dir_base_model: Path | None = args.base dir_lora: Path = args.lora_path + base_model_id: str | None = args.base_model_id lora_config = dir_lora / "adapter_config.json" input_model = dir_lora / "adapter_model.safetensors" if args.outfile is not None: fname_out = args.outfile else: - + # output in the same directory as the model by default fname_out = dir_lora if os.path.exists(input_model): - + # lazy import load_file only if lora is in safetensors format. 
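+ # For reference, an example invocation of this converter (paths and output name are
+ # hypothetical; the base-model ID is the one quoted in the --base-model-id help text):
+ #   python convert_lora_to_gguf.py ./my-adapter --outtype f16 --outfile my-adapter-f16.gguf \
+ #       --base-model-id meta-llama/Llama-3.2-1B-Instruct
+ # When neither --base nor --base-model-id is given, the base config is resolved from
+ # "base_model_name_or_path" in adapter_config.json via load_hparams_from_hf().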
from safetensors.torch import load_file lora_model = load_file(input_model, device="cpu") @@ -286,8 +343,38 @@ def parse_args() -> argparse.Namespace: input_model = os.path.join(dir_lora, "adapter_model.bin") lora_model = torch.load(input_model, map_location="cpu", weights_only=True) - logger.info(f"Loading base model: {dir_base_model.name}") - hparams = Model.load_hparams(dir_base_model) + # load LoRA config + with open(lora_config, "r") as f: + lparams: dict[str, Any] = json.load(f) + + # load base model + if base_model_id is not None: + logger.info(f"Loading base model from Hugging Face: {base_model_id}") + hparams = load_hparams_from_hf(base_model_id) + elif dir_base_model is None: + if "base_model_name_or_path" in lparams: + model_id = lparams["base_model_name_or_path"] + logger.info(f"Loading base model from Hugging Face: {model_id}") + try: + hparams = load_hparams_from_hf(model_id) + except OSError as e: + logger.error(f"Failed to load base model config: {e}") + logger.error( + "Please try downloading the base model and add its path to --base" + ) + sys.exit(1) + else: + logger.error( + "'base_model_name_or_path' is not found in adapter_config.json" + ) + logger.error( + "Base model config is required. Please download the base model and add its path to --base" + ) + sys.exit(1) + else: + logger.info(f"Loading base model: {dir_base_model.name}") + hparams = Model.load_hparams(dir_base_model) + with torch.inference_mode(): try: model_class = Model.from_model_architecture(hparams["architectures"][0]) @@ -309,6 +396,9 @@ def __init__( self.dir_model_card = dir_lora_model self.lora_alpha = float(lora_alpha) + def set_vocab(self): + pass + def set_type(self): self.gguf_writer.add_type(gguf.GGUFType.ADAPTER) self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora") @@ -317,7 +407,10 @@ def set_gguf_parameters(self): self.gguf_writer.add_float32( gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha ) - super().set_gguf_parameters() + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + # Never add extra tensors (e.g. rope_freqs) for LoRA adapters + return () def get_tensors(self) -> Iterator[tuple[str, Tensor]]: tensor_map: dict[str, PartialLoraTensor] = {} @@ -326,14 +419,26 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: if self.lazy: tensor = LazyTorchTensor.from_eager(tensor) base_name = get_base_tensor_name(name) - is_lora_a = ".lora_A.weight" in name - is_lora_b = ".lora_B.weight" in name + # note: mergekit-extract-lora also adds token embeddings to the adapter + is_lora_a = ".lora_A.weight" in name or ".lora_embedding_A" in name + is_lora_b = ".lora_B.weight" in name or ".lora_embedding_B" in name if not is_lora_a and not is_lora_b: if ".base_layer.weight" in name: continue + # mergekit-extract-lora add these layernorm to the adapter, we need to keep them + if "_layernorm" in name or ".norm" in name: + yield (base_name, tensor) + continue logger.error( f"Unexpected name '{name}': Not a lora_A or lora_B tensor" ) + if ".embed_tokens.weight" in name or ".lm_head.weight" in name: + logger.error( + "Embeddings is present in the adapter. 
This can be due to new tokens added during fine tuning" + ) + logger.error( + "Please refer to https://github.com/ggerganov/llama.cpp/pull/9948" + ) sys.exit(1) if base_name in tensor_map: @@ -358,17 +463,34 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: def modify_tensors( self, data_torch: Tensor, name: str, bid: int | None ) -> Iterable[tuple[str, Tensor]]: - dest = super().modify_tensors(data_torch, name, bid) + dest = list(super().modify_tensors(data_torch, name, bid)) + # some archs may have the same tensor for lm_head and output (tie word embeddings) + # in this case, adapters targeting lm_head will fail when using llama-export-lora + # therefore, we ignore them for now + # see: https://github.com/ggerganov/llama.cpp/issues/9065 + if name == "lm_head.weight" and len(dest) == 0: + raise ValueError( + "lm_head is present in adapter, but is ignored in base model" + ) for dest_name, dest_data in dest: + # mergekit-extract-lora add these layernorm to the adapter + if "_norm" in dest_name: + assert dest_data.dim() == 1 + yield (dest_name, dest_data) + continue + + # otherwise, we must get the lora_A and lora_B tensors assert isinstance(dest_data, LoraTorchTensor) lora_a, lora_b = dest_data.get_lora_A_B() + # note: mergekit-extract-lora flip and transpose A and B + # here we only need to transpose token_embd.lora_a, see llm_build_inp_embd() + if "token_embd.weight" in dest_name: + lora_a = lora_a.T + yield (dest_name + ".lora_a", lora_a) yield (dest_name + ".lora_b", lora_b) - with open(lora_config, "r") as f: - lparams: dict[str, Any] = json.load(f) - alpha: float = lparams["lora_alpha"] model_instance = LoraModel( @@ -381,7 +503,7 @@ def modify_tensors( dry_run=args.dry_run, dir_lora_model=dir_lora, lora_alpha=alpha, - is_lora=True, + hparams=hparams, ) logger.info("Exporting model...") diff --git a/src/gguf/constants.py b/src/gguf/constants.py index fe7c905..6e9ee09 100644 --- a/src/gguf/constants.py +++ b/src/gguf/constants.py @@ -3,10 +3,18 @@ from enum import Enum, IntEnum, auto from typing import Any -GGUF_MAGIC = 0x46554747 +# +# constants +# + +GGUF_MAGIC = 0x46554747 # "GGUF" GGUF_VERSION = 3 GGUF_DEFAULT_ALIGNMENT = 32 -GGML_QUANT_VERSION = 2 +GGML_QUANT_VERSION = 2 # GGML_QNT_VERSION from ggml.h + +# +# metadata keys +# class Keys: @@ -17,6 +25,7 @@ class General: ALIGNMENT = "general.alignment" FILE_TYPE = "general.file_type" + # Authorship Metadata NAME = "general.name" AUTHOR = "general.author" VERSION = "general.version" @@ -30,38 +39,62 @@ class General: SIZE_LABEL = "general.size_label" + # Licensing details LICENSE = "general.license" LICENSE_NAME = "general.license.name" LICENSE_LINK = "general.license.link" - URL = "general.url" + # Typically represents the converted GGUF repo (Unless native) + URL = "general.url" # Model Website/Paper DOI = "general.doi" UUID = "general.uuid" - REPO_URL = "general.repo_url" + REPO_URL = "general.repo_url" # Model Source Repository (git/svn/etc...) - SOURCE_URL = "general.source.url" + # Model Source during conversion + SOURCE_URL = "general.source.url" # Model Website/Paper SOURCE_DOI = "general.source.doi" SOURCE_UUID = "general.source.uuid" - SOURCE_REPO_URL = "general.source.repo_url" + SOURCE_REPO_URL = ( + "general.source.repo_url" # Model Source Repository (git/svn/etc...) + ) + # Base Model Source. There can be more than one source if it's a merged + # model like with 'Mistral-7B-Merge-14-v0.1'. This will assist in + # tracing linage of models as it is finetuned or merged over time. 
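# Reviewer note: the {id} placeholder in the keys below is expanded once per
# base-model entry by the matching GGUFWriter.add_base_model_* helpers added
# later in this patch. A commented sketch of the resulting key names, assuming
# zero-based ids from enumerating the metadata list:
#
#   for i in range(2):  # two hypothetical base models of a merge
#       print(Keys.General.BASE_MODEL_NAME.format(id=i))
#   # -> general.base_model.0.name
#   # -> general.base_model.1.name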
BASE_MODEL_COUNT = "general.base_model.count" BASE_MODEL_NAME = "general.base_model.{id}.name" BASE_MODEL_AUTHOR = "general.base_model.{id}.author" BASE_MODEL_VERSION = "general.base_model.{id}.version" BASE_MODEL_ORGANIZATION = "general.base_model.{id}.organization" - BASE_MODEL_URL = "general.base_model.{id}.url" + BASE_MODEL_DESCRIPTION = "general.base_model.{id}.description" + BASE_MODEL_URL = "general.base_model.{id}.url" # Model Website/Paper BASE_MODEL_DOI = "general.base_model.{id}.doi" BASE_MODEL_UUID = "general.base_model.{id}.uuid" - BASE_MODEL_REPO_URL = "general.base_model.{id}.repo_url" + BASE_MODEL_REPO_URL = "general.base_model.{id}.repo_url" # Model Source Repository (git/svn/etc...) + + # Dataset Source + DATASET_COUNT = "general.dataset.count" + DATASET_NAME = "general.dataset.{id}.name" + DATASET_AUTHOR = "general.dataset.{id}.author" + DATASET_VERSION = "general.dataset.{id}.version" + DATASET_ORGANIZATION = "general.dataset.{id}.organization" + DATASET_DESCRIPTION = "general.dataset.{id}.description" + DATASET_URL = "general.dataset.{id}.url" # Model Website/Paper + DATASET_DOI = "general.dataset.{id}.doi" + DATASET_UUID = "general.dataset.{id}.uuid" + DATASET_REPO_URL = ( + "general.dataset.{id}.repo_url" # Model Source Repository (git/svn/etc...) + ) + # Array based KV stores TAGS = "general.tags" LANGUAGES = "general.languages" - DATASETS = "general.datasets" class LLM: VOCAB_SIZE = "{arch}.vocab_size" CONTEXT_LENGTH = "{arch}.context_length" EMBEDDING_LENGTH = "{arch}.embedding_length" + FEATURES_LENGTH = "{arch}.features_length" BLOCK_COUNT = "{arch}.block_count" LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count" FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" @@ -73,11 +106,14 @@ class LLM: EXPERT_USED_COUNT = "{arch}.expert_used_count" EXPERT_SHARED_COUNT = "{arch}.expert_shared_count" EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale" + EXPERT_WEIGHTS_NORM = "{arch}.expert_weights_norm" + EXPERT_GATING_FUNC = "{arch}.expert_gating_func" POOLING_TYPE = "{arch}.pooling_type" LOGIT_SCALE = "{arch}.logit_scale" DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id" ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping" FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping" + SWIN_NORM = "{arch}.swin_norm" RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers" TIME_MIX_EXTRA_DIM = "{arch}.time_mix_extra_dim" TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim" @@ -93,6 +129,8 @@ class Attention: VALUE_LENGTH = "{arch}.attention.value_length" LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon" + GROUPNORM_EPS = "{arch}.attention.group_norm_epsilon" + GROUPNORM_GROUPS = "{arch}.attention.group_norm_groups" CAUSAL = "{arch}.attention.causal" Q_LORA_RANK = "{arch}.attention.q_lora_rank" KV_LORA_RANK = "{arch}.attention.kv_lora_rank" @@ -102,6 +140,7 @@ class Attention: class Rope: DIMENSION_COUNT = "{arch}.rope.dimension_count" + DIMENSION_SECTIONS = "{arch}.rope.dimension_sections" FREQ_BASE = "{arch}.rope.freq_base" SCALING_TYPE = "{arch}.rope.scaling.type" SCALING_FACTOR = "{arch}.rope.scaling.factor" @@ -125,16 +164,28 @@ class SSM: class WKV: HEAD_SIZE = "{arch}.wkv.head_size" + class PosNet: + EMBEDDING_LENGTH = "{arch}.posnet.embedding_length" + BLOCK_COUNT = "{arch}.posnet.block_count" + + class ConvNext: + EMBEDDING_LENGTH = "{arch}.convnext.embedding_length" + BLOCK_COUNT = "{arch}.convnext.block_count" + class Tokenizer: MODEL = "tokenizer.ggml.model" PRE = 
"tokenizer.ggml.pre" LIST = "tokenizer.ggml.tokens" TOKEN_TYPE = "tokenizer.ggml.token_type" - TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count" + TOKEN_TYPE_COUNT = ( + "tokenizer.ggml.token_type_count" # for BERT-style token types + ) SCORES = "tokenizer.ggml.scores" MERGES = "tokenizer.ggml.merges" BOS_ID = "tokenizer.ggml.bos_token_id" EOS_ID = "tokenizer.ggml.eos_token_id" + EOT_ID = "tokenizer.ggml.eot_token_id" + EOM_ID = "tokenizer.ggml.eom_token_id" UNK_ID = "tokenizer.ggml.unknown_token_id" SEP_ID = "tokenizer.ggml.seperator_token_id" PAD_ID = "tokenizer.ggml.padding_token_id" @@ -150,18 +201,28 @@ class Tokenizer: CHAT_TEMPLATE = "tokenizer.chat_template" CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}" CHAT_TEMPLATES = "tokenizer.chat_templates" - + # FIM/Infill special tokens constants + FIM_PRE_ID = "tokenizer.ggml.fim_pre_token_id" + FIM_SUF_ID = "tokenizer.ggml.fim_suf_token_id" + FIM_MID_ID = "tokenizer.ggml.fim_mid_token_id" + FIM_PAD_ID = "tokenizer.ggml.fim_pad_token_id" + FIM_REP_ID = "tokenizer.ggml.fim_rep_token_id" + FIM_SEP_ID = "tokenizer.ggml.fim_sep_token_id" + # deprecated: PREFIX_ID = "tokenizer.ggml.prefix_token_id" SUFFIX_ID = "tokenizer.ggml.suffix_token_id" MIDDLE_ID = "tokenizer.ggml.middle_token_id" - EOT_ID = "tokenizer.ggml.eot_token_id" - EOM_ID = "tokenizer.ggml.eom_token_id" class Adapter: TYPE = "adapter.type" LORA_ALPHA = "adapter.lora.alpha" +# +# recommended mapping of model tensor names for storage in gguf +# + + class GGUFType: MODEL = "model" ADAPTER = "adapter" @@ -169,6 +230,7 @@ class GGUFType: class MODEL_ARCH(IntEnum): LLAMA = auto() + DECI = auto() FALCON = auto() BAICHUAN = auto() GROK = auto() @@ -186,6 +248,7 @@ class MODEL_ARCH(IntEnum): QWEN = auto() QWEN2 = auto() QWEN2MOE = auto() + QWEN2VL = auto() PHI2 = auto() PHI3 = auto() PLAMO = auto() @@ -199,14 +262,16 @@ class MODEL_ARCH(IntEnum): STARCODER2 = auto() RWKV6 = auto() MAMBA = auto() - JAMBA = auto() XVERSE = auto() COMMAND_R = auto() + COHERE2 = auto() DBRX = auto() OLMO = auto() + OLMO2 = auto() OLMOE = auto() OPENELM = auto() ARCTIC = auto() + DEEPSEEK = auto() DEEPSEEK2 = auto() CHATGLM = auto() BITNET = auto() @@ -216,6 +281,9 @@ class MODEL_ARCH(IntEnum): NEMOTRON = auto() EXAONE = auto() GRANITE = auto() + GRANITE_MOE = auto() + CHAMELEON = auto() + WAVTOKENIZER_DEC = auto() class MODEL_TENSOR(IntEnum): @@ -254,6 +322,7 @@ class MODEL_TENSOR(IntEnum): FFN_GATE_SHEXP = auto() FFN_DOWN_SHEXP = auto() FFN_UP_SHEXP = auto() + FFN_EXP_PROBS_B = auto() ATTN_Q_NORM = auto() ATTN_K_NORM = auto() LAYER_OUT_NORM = auto() @@ -261,10 +330,7 @@ class MODEL_TENSOR(IntEnum): SSM_CONV1D = auto() SSM_X = auto() SSM_DT = auto() - SSM_DT_NORM = auto() SSM_A = auto() - SSM_B_NORM = auto() - SSM_C_NORM = auto() SSM_D = auto() SSM_OUT = auto() TIME_MIX_W1 = auto() @@ -326,10 +392,29 @@ class MODEL_TENSOR(IntEnum): ENC_FFN_DOWN = auto() ENC_FFN_UP = auto() ENC_OUTPUT_NORM = auto() + CLS = auto() # classifier + CLS_OUT = auto() # classifier output projection + CONV1D = auto() + CONVNEXT_DW = auto() + CONVNEXT_NORM = auto() + CONVNEXT_PW1 = auto() + CONVNEXT_PW2 = auto() + CONVNEXT_GAMMA = auto() + POSNET_CONV1 = auto() + POSNET_CONV2 = auto() + POSNET_NORM = auto() + POSNET_NORM1 = auto() + POSNET_NORM2 = auto() + POSNET_ATTN_NORM = auto() + POSNET_ATTN_Q = auto() + POSNET_ATTN_K = auto() + POSNET_ATTN_V = auto() + POSNET_ATTN_OUT = auto() MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.LLAMA: "llama", + MODEL_ARCH.DECI: "deci", MODEL_ARCH.FALCON: "falcon", MODEL_ARCH.BAICHUAN: 
"baichuan", MODEL_ARCH.GROK: "grok", @@ -347,6 +432,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.QWEN: "qwen", MODEL_ARCH.QWEN2: "qwen2", MODEL_ARCH.QWEN2MOE: "qwen2moe", + MODEL_ARCH.QWEN2VL: "qwen2vl", MODEL_ARCH.PHI2: "phi2", MODEL_ARCH.PHI3: "phi3", MODEL_ARCH.PLAMO: "plamo", @@ -360,14 +446,16 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.RWKV6: "rwkv6", MODEL_ARCH.MAMBA: "mamba", - MODEL_ARCH.JAMBA: "jamba", MODEL_ARCH.XVERSE: "xverse", MODEL_ARCH.COMMAND_R: "command-r", + MODEL_ARCH.COHERE2: "cohere2", MODEL_ARCH.DBRX: "dbrx", MODEL_ARCH.OLMO: "olmo", + MODEL_ARCH.OLMO2: "olmo2", MODEL_ARCH.OLMOE: "olmoe", MODEL_ARCH.OPENELM: "openelm", MODEL_ARCH.ARCTIC: "arctic", + MODEL_ARCH.DEEPSEEK: "deepseek", MODEL_ARCH.DEEPSEEK2: "deepseek2", MODEL_ARCH.CHATGLM: "chatglm", MODEL_ARCH.BITNET: "bitnet", @@ -377,6 +465,9 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.NEMOTRON: "nemotron", MODEL_ARCH.EXAONE: "exaone", MODEL_ARCH.GRANITE: "granite", + MODEL_ARCH.GRANITE_MOE: "granitemoe", + MODEL_ARCH.CHAMELEON: "chameleon", + MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -417,15 +508,13 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps", MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps", MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps", + MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b", MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d", MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x", MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt", - MODEL_TENSOR.SSM_DT_NORM: "blk.{bid}.ssm_dt_norm", MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a", - MODEL_TENSOR.SSM_B_NORM: "blk.{bid}.ssm_b_norm", - MODEL_TENSOR.SSM_C_NORM: "blk.{bid}.ssm_c_norm", MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1", @@ -487,6 +576,24 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down", MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up", MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm", + MODEL_TENSOR.CLS: "cls", + MODEL_TENSOR.CLS_OUT: "cls.output", + MODEL_TENSOR.CONV1D: "conv1d", + MODEL_TENSOR.CONVNEXT_DW: "convnext.{bid}.dw", + MODEL_TENSOR.CONVNEXT_NORM: "convnext.{bid}.norm", + MODEL_TENSOR.CONVNEXT_PW1: "convnext.{bid}.pw1", + MODEL_TENSOR.CONVNEXT_PW2: "convnext.{bid}.pw2", + MODEL_TENSOR.CONVNEXT_GAMMA: "convnext.{bid}.gamma", + MODEL_TENSOR.POSNET_CONV1: "posnet.{bid}.conv1", + MODEL_TENSOR.POSNET_CONV2: "posnet.{bid}.conv2", + MODEL_TENSOR.POSNET_NORM: "posnet.{bid}.norm", + MODEL_TENSOR.POSNET_NORM1: "posnet.{bid}.norm1", + MODEL_TENSOR.POSNET_NORM2: "posnet.{bid}.norm2", + MODEL_TENSOR.POSNET_ATTN_NORM: "posnet.{bid}.attn_norm", + MODEL_TENSOR.POSNET_ATTN_Q: "posnet.{bid}.attn_q", + MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k", + MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v", + MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -510,6 +617,26 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.DECI: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_GATE_INP, + 
MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + ], MODEL_ARCH.GROK: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -596,6 +723,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, MODEL_TENSOR.LAYER_OUT_NORM, + MODEL_TENSOR.CLS, + MODEL_TENSOR.CLS_OUT, ], MODEL_ARCH.NOMIC_BERT: [ MODEL_TENSOR.TOKEN_EMBD, @@ -627,6 +756,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.LAYER_OUT_NORM, + MODEL_TENSOR.CLS, ], MODEL_ARCH.MPT: [ MODEL_TENSOR.TOKEN_EMBD, @@ -713,6 +843,21 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_UP, ], MODEL_ARCH.QWEN2: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.QWEN2VL: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT, @@ -790,6 +935,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FACTORS_LONG, + MODEL_TENSOR.ROPE_FACTORS_SHORT, MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_QKV, MODEL_TENSOR.ATTN_Q, @@ -849,6 +996,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.OUTPUT, MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ROPE_FACTORS_LONG, + MODEL_TENSOR.ROPE_FACTORS_SHORT, MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, @@ -868,6 +1017,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FACTORS_LONG, + MODEL_TENSOR.ROPE_FACTORS_SHORT, MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_Q_A, MODEL_TENSOR.ATTN_Q_B, @@ -968,51 +1119,37 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.SSM_D, MODEL_TENSOR.SSM_OUT, ], - MODEL_ARCH.JAMBA: [ + MODEL_ARCH.XVERSE: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, MODEL_TENSOR.ATTN_V, MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.SSM_IN, - MODEL_TENSOR.SSM_CONV1D, - MODEL_TENSOR.SSM_X, - MODEL_TENSOR.SSM_DT, - MODEL_TENSOR.SSM_DT_NORM, - MODEL_TENSOR.SSM_A, - MODEL_TENSOR.SSM_B_NORM, - MODEL_TENSOR.SSM_C_NORM, - MODEL_TENSOR.SSM_D, - MODEL_TENSOR.SSM_OUT, - MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.ATTN_ROT_EMBD, MODEL_TENSOR.FFN_NORM, MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, - MODEL_TENSOR.FFN_GATE_EXP, - MODEL_TENSOR.FFN_DOWN_EXP, - MODEL_TENSOR.FFN_UP_EXP, ], - MODEL_ARCH.XVERSE: [ + MODEL_ARCH.COMMAND_R: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, - MODEL_TENSOR.OUTPUT, - MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, MODEL_TENSOR.ATTN_V, MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.ATTN_ROT_EMBD, - MODEL_TENSOR.FFN_NORM, MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_Q_NORM, ], - MODEL_ARCH.COMMAND_R: [ + MODEL_ARCH.COHERE2: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.ATTN_NORM, @@ -1023,8 +1160,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, - MODEL_TENSOR.ATTN_K_NORM, - MODEL_TENSOR.ATTN_Q_NORM, ], MODEL_ARCH.DBRX: [ 
MODEL_TENSOR.TOKEN_EMBD, @@ -1050,6 +1185,22 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.OLMO2: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.FFN_POST_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], MODEL_ARCH.OLMOE: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -1101,6 +1252,29 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.DEEPSEEK: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_UP_SHEXP, + ], MODEL_ARCH.DEEPSEEK2: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -1127,6 +1301,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_GATE_SHEXP, MODEL_TENSOR.FFN_DOWN_SHEXP, MODEL_TENSOR.FFN_UP_SHEXP, + MODEL_TENSOR.FFN_EXP_PROBS_B, ], MODEL_ARCH.CHATGLM: [ MODEL_TENSOR.TOKEN_EMBD, @@ -1248,6 +1423,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.GRANITE: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, @@ -1258,13 +1434,72 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.GRANITE_MOE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + ], + MODEL_ARCH.CHAMELEON: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], + MODEL_ARCH.WAVTOKENIZER_DEC: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.TOKEN_EMBD_NORM, + MODEL_TENSOR.CONV1D, + MODEL_TENSOR.CONVNEXT_DW, + MODEL_TENSOR.CONVNEXT_NORM, + MODEL_TENSOR.CONVNEXT_PW1, + MODEL_TENSOR.CONVNEXT_PW2, + MODEL_TENSOR.CONVNEXT_GAMMA, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.POSNET_CONV1, + MODEL_TENSOR.POSNET_CONV2, + MODEL_TENSOR.POSNET_NORM, + MODEL_TENSOR.POSNET_NORM1, + MODEL_TENSOR.POSNET_NORM2, + MODEL_TENSOR.POSNET_ATTN_NORM, + MODEL_TENSOR.POSNET_ATTN_Q, + MODEL_TENSOR.POSNET_ATTN_K, + MODEL_TENSOR.POSNET_ATTN_V, + MODEL_TENSOR.POSNET_ATTN_OUT, + ], + # TODO } +# tensors that will not be serialized MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_ARCH.LLAMA: [ MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_ROT_EMBD, ], + MODEL_ARCH.DECI: [ + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_ROT_EMBD, + ], MODEL_ARCH.BAICHUAN: [ 
MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_ROT_EMBD, @@ -1289,6 +1524,10 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_ROT_EMBD, ], + MODEL_ARCH.DEEPSEEK: [ + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_ROT_EMBD, + ], MODEL_ARCH.DEEPSEEK2: [ MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_ROT_EMBD, @@ -1302,6 +1541,10 @@ class MODEL_TENSOR(IntEnum): ], } +# +# types +# + class TokenType(IntEnum): NORMAL = 1 @@ -1316,6 +1559,7 @@ class RopeScalingType(Enum): NONE = "none" LINEAR = "linear" YARN = "yarn" + LONGROPE = "longrope" class PoolingType(IntEnum): @@ -1354,52 +1598,61 @@ class GGMLQuantizationType(IntEnum): F64 = 28 IQ1_M = 29 BF16 = 30 - Q4_0_4_4 = 31 - Q4_0_4_8 = 32 - Q4_0_8_8 = 33 TQ1_0 = 34 TQ2_0 = 35 +class ExpertGatingFuncType(IntEnum): + SOFTMAX = 1 + SIGMOID = 2 + + +# TODO: add GGMLFileType from ggml_ftype in ggml.h + + +# from llama_ftype in llama.h +# ALL VALUES SHOULD BE THE SAME HERE AS THEY ARE OVER THERE. class LlamaFileType(IntEnum): ALL_F32 = 0 - MOSTLY_F16 = 1 - MOSTLY_Q4_0 = 2 - MOSTLY_Q4_1 = 3 - - MOSTLY_Q8_0 = 7 - MOSTLY_Q5_0 = 8 - MOSTLY_Q5_1 = 9 - MOSTLY_Q2_K = 10 - MOSTLY_Q3_K_S = 11 - MOSTLY_Q3_K_M = 12 - MOSTLY_Q3_K_L = 13 - MOSTLY_Q4_K_S = 14 - MOSTLY_Q4_K_M = 15 - MOSTLY_Q5_K_S = 16 - MOSTLY_Q5_K_M = 17 - MOSTLY_Q6_K = 18 - MOSTLY_IQ2_XXS = 19 - MOSTLY_IQ2_XS = 20 - MOSTLY_Q2_K_S = 21 - MOSTLY_IQ3_XS = 22 - MOSTLY_IQ3_XXS = 23 - MOSTLY_IQ1_S = 24 - MOSTLY_IQ4_NL = 25 - MOSTLY_IQ3_S = 26 - MOSTLY_IQ3_M = 27 - MOSTLY_IQ2_S = 28 - MOSTLY_IQ2_M = 29 - MOSTLY_IQ4_XS = 30 - MOSTLY_IQ1_M = 31 - MOSTLY_BF16 = 32 - MOSTLY_Q4_0_4_4 = 33 - MOSTLY_Q4_0_4_8 = 34 - MOSTLY_Q4_0_8_8 = 35 - MOSTLY_TQ1_0 = 36 - MOSTLY_TQ2_0 = 37 - - GUESSED = 1024 + MOSTLY_F16 = 1 # except 1d tensors + MOSTLY_Q4_0 = 2 # except 1d tensors + MOSTLY_Q4_1 = 3 # except 1d tensors + # MOSTLY_Q4_1_SOME_F16 = 4 # tok_embeddings.weight and output.weight are F16 + # MOSTLY_Q4_2 = 5 # support has been removed + # MOSTLY_Q4_3 = 6 # support has been removed + MOSTLY_Q8_0 = 7 # except 1d tensors + MOSTLY_Q5_0 = 8 # except 1d tensors + MOSTLY_Q5_1 = 9 # except 1d tensors + MOSTLY_Q2_K = 10 # except 1d tensors + MOSTLY_Q3_K_S = 11 # except 1d tensors + MOSTLY_Q3_K_M = 12 # except 1d tensors + MOSTLY_Q3_K_L = 13 # except 1d tensors + MOSTLY_Q4_K_S = 14 # except 1d tensors + MOSTLY_Q4_K_M = 15 # except 1d tensors + MOSTLY_Q5_K_S = 16 # except 1d tensors + MOSTLY_Q5_K_M = 17 # except 1d tensors + MOSTLY_Q6_K = 18 # except 1d tensors + MOSTLY_IQ2_XXS = 19 # except 1d tensors + MOSTLY_IQ2_XS = 20 # except 1d tensors + MOSTLY_Q2_K_S = 21 # except 1d tensors + MOSTLY_IQ3_XS = 22 # except 1d tensors + MOSTLY_IQ3_XXS = 23 # except 1d tensors + MOSTLY_IQ1_S = 24 # except 1d tensors + MOSTLY_IQ4_NL = 25 # except 1d tensors + MOSTLY_IQ3_S = 26 # except 1d tensors + MOSTLY_IQ3_M = 27 # except 1d tensors + MOSTLY_IQ2_S = 28 # except 1d tensors + MOSTLY_IQ2_M = 29 # except 1d tensors + MOSTLY_IQ4_XS = 30 # except 1d tensors + MOSTLY_IQ1_M = 31 # except 1d tensors + MOSTLY_BF16 = 32 # except 1d tensors + # MOSTLY_Q4_0_4_4 = 33 # removed from gguf files, use Q4_0 and runtime repack + # MOSTLY_Q4_0_4_8 = 34 # removed from gguf files, use Q4_0 and runtime repack + # MOSTLY_Q4_0_8_8 = 35 # removed from gguf files, use Q4_0 and runtime repack + MOSTLY_TQ1_0 = 36 # except 1d tensors + MOSTLY_TQ2_0 = 37 # except 1d tensors + + GUESSED = 1024 # not specified in the model file class GGUFEndian(IntEnum): @@ -1434,11 +1687,12 @@ def get_type(val: Any) -> GGUFValueType: return GGUFValueType.BOOL elif 
isinstance(val, int): return GGUFValueType.INT32 - + # TODO: need help with 64-bit types in Python else: raise ValueError(f"Unknown type: {type(val)}") +# Items here are (block size, type size) QK_K = 256 GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = { GGMLQuantizationType.F32: (1, 4), @@ -1470,13 +1724,14 @@ def get_type(val: Any) -> GGUFValueType: GGMLQuantizationType.F64: (1, 8), GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32), GGMLQuantizationType.BF16: (1, 2), - GGMLQuantizationType.Q4_0_4_4: (32, 2 + 16), - GGMLQuantizationType.Q4_0_4_8: (32, 2 + 16), - GGMLQuantizationType.Q4_0_8_8: (32, 2 + 16), GGMLQuantizationType.TQ1_0: (256, 2 + 4 * 13), GGMLQuantizationType.TQ2_0: (256, 2 + 64), } + +# Aliases for backward compatibility. + +# general KEY_GENERAL_ARCHITECTURE = Keys.General.ARCHITECTURE KEY_GENERAL_QUANTIZATION_VERSION = Keys.General.QUANTIZATION_VERSION KEY_GENERAL_ALIGNMENT = Keys.General.ALIGNMENT @@ -1488,6 +1743,7 @@ def get_type(val: Any) -> GGUFValueType: KEY_GENERAL_SOURCE_URL = Keys.General.SOURCE_URL KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE +# LLM KEY_VOCAB_SIZE = Keys.LLM.VOCAB_SIZE KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH @@ -1496,6 +1752,7 @@ def get_type(val: Any) -> GGUFValueType: KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL KEY_TENSOR_DATA_LAYOUT = Keys.LLM.TENSOR_DATA_LAYOUT +# attention KEY_ATTENTION_HEAD_COUNT = Keys.Attention.HEAD_COUNT KEY_ATTENTION_HEAD_COUNT_KV = Keys.Attention.HEAD_COUNT_KV KEY_ATTENTION_MAX_ALIBI_BIAS = Keys.Attention.MAX_ALIBI_BIAS @@ -1503,6 +1760,7 @@ def get_type(val: Any) -> GGUFValueType: KEY_ATTENTION_LAYERNORM_EPS = Keys.Attention.LAYERNORM_EPS KEY_ATTENTION_LAYERNORM_RMS_EPS = Keys.Attention.LAYERNORM_RMS_EPS +# RoPE KEY_ROPE_DIMENSION_COUNT = Keys.Rope.DIMENSION_COUNT KEY_ROPE_FREQ_BASE = Keys.Rope.FREQ_BASE KEY_ROPE_SCALING_TYPE = Keys.Rope.SCALING_TYPE @@ -1510,12 +1768,14 @@ def get_type(val: Any) -> GGUFValueType: KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED +# SSM KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK KEY_SSM_DT_B_C_RMS = Keys.SSM.DT_B_C_RMS +# tokenization KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL KEY_TOKENIZER_PRE = Keys.Tokenizer.PRE KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST @@ -1524,6 +1784,8 @@ def get_type(val: Any) -> GGUFValueType: KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID +KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID +KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID @@ -1531,8 +1793,15 @@ def get_type(val: Any) -> GGUFValueType: KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV -KEY_TOKENIZER_PRIFIX_ID = Keys.Tokenizer.PREFIX_ID + +KEY_TOKENIZER_FIM_PRE_ID = Keys.Tokenizer.FIM_PRE_ID +KEY_TOKENIZER_FIM_SUF_ID = Keys.Tokenizer.FIM_SUF_ID +KEY_TOKENIZER_FIM_MID_ID = Keys.Tokenizer.FIM_MID_ID +KEY_TOKENIZER_FIM_PAD_ID = Keys.Tokenizer.FIM_PAD_ID +KEY_TOKENIZER_FIM_REP_ID = Keys.Tokenizer.FIM_REP_ID +KEY_TOKENIZER_FIM_SEP_ID = Keys.Tokenizer.FIM_SEP_ID + +# deprecated +KEY_TOKENIZER_PREFIX_ID = 
Keys.Tokenizer.PREFIX_ID KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID -KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID -KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID diff --git a/src/gguf/gguf_reader.py b/src/gguf/gguf_reader.py index a5fc908..962c43e 100644 --- a/src/gguf/gguf_reader.py +++ b/src/gguf/gguf_reader.py @@ -169,11 +169,10 @@ def _get( count = int(count) itemsize = int(np.empty([], dtype=dtype).itemsize) end_offs = offset + itemsize * count - return ( - self.data[offset:end_offs] - .view(dtype=dtype)[:count] - .newbyteorder(override_order or self.byte_order) - ) + arr = self.data[offset:end_offs].view(dtype=dtype)[:count] + if override_order is None: + return arr + return arr.view(arr.dtype.newbyteorder(override_order)) def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int: if field.name in self.fields: diff --git a/src/gguf/gguf_writer.py b/src/gguf/gguf_writer.py index a8754cf..267ea6c 100644 --- a/src/gguf/gguf_writer.py +++ b/src/gguf/gguf_writer.py @@ -26,12 +26,14 @@ RopeScalingType, PoolingType, TokenType, + ExpertGatingFuncType, ) from .quants import quant_shape_from_byte_shape logger = logging.getLogger(__name__) + SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf" @@ -135,7 +137,7 @@ def get_total_parameter_count(self) -> tuple[int, int, int, int]: continue elif name.endswith(".lora_b"): if last_lora_a is None or last_lora_a[0] != name[:-1] + "a": - + # Bail when the LoRA pair can't be found trivially logger.warning( "can't measure LoRA size correctly, tensor order is unusual" ) @@ -154,11 +156,14 @@ def get_total_parameter_count(self) -> tuple[int, int, int, int]: total_params += size + # Hopefully this should work even for variable-expert-count models expert_count = (expert_sum // n_expert_tensors) if n_expert_tensors > 0 else 0 + # Negate the total to signal it's likely not exact if last_lora_a is not None: total_params = -total_params + # NOTE: keep the output in the same order as accepted by 'size_label' in gguf-py/gguf/utility.py return total_params, shared_params, expert_params, expert_count def format_shard_names(self, path: Path) -> list[Path]: @@ -177,7 +182,7 @@ def open_output_file(self, path: Path | None = None) -> None: and self.fout is not None and (path is None or path == self.path) ): - + # allow calling this multiple times as long as the path is the same return if self.state is not WriterState.NO_FILE: @@ -206,7 +211,7 @@ def print_plan(self) -> list[Path]: if self.dry_run: logger.info("Dry run, not writing files") for name in filenames: - print(name) + print(name) # noqa: NP100 exit() return filenames @@ -390,11 +395,12 @@ def add_tensor_info( if tensor_dtype == np.uint8: tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype) + # make sure there is at least one tensor before splitting if len(self.tensors[-1]) > 0: - if ( + if ( # split when over tensor limit self.split_max_tensors != 0 and len(self.tensors[-1]) >= self.split_max_tensors - ) or ( + ) or ( # split when over size limit self.split_max_size != 0 and sum(ti.nbytes for ti in self.tensors[-1].values()) + tensor_nbytes > self.split_max_size @@ -460,6 +466,8 @@ def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None: fout = self.fout[file_id] + # pop the first tensor info + # TODO: cleaner way to get the first key first_tensor_name = [ name for name, _ in zip(self.tensors[file_id].keys(), range(1)) ][0] @@ -506,8 +514,11 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None: total = 
sum(ti.nbytes for ti in tensors.values()) shard_bar.reset(total=(total if total > 0 else None)) + # relying on the fact that Python dicts preserve insertion order (since 3.7) for ti in tensors.values(): - assert ti.tensor is not None + assert ( + ti.tensor is not None + ) # can only iterate once over the tensors assert ti.tensor.nbytes == ti.nbytes ti.tensor.tofile(fout) if shard_bar is not None: @@ -631,6 +642,11 @@ def add_base_model_organization(self, source_id: int, organization: str) -> None Keys.General.BASE_MODEL_ORGANIZATION.format(id=source_id), organization ) + def add_base_model_description(self, source_id: int, description: str) -> None: + self.add_string( + Keys.General.BASE_MODEL_DESCRIPTION.format(id=source_id), description + ) + def add_base_model_url(self, source_id: int, url: str) -> None: self.add_string(Keys.General.BASE_MODEL_URL.format(id=source_id), url) @@ -643,15 +659,46 @@ def add_base_model_uuid(self, source_id: int, uuid: str) -> None: def add_base_model_repo_url(self, source_id: int, repo_url: str) -> None: self.add_string(Keys.General.BASE_MODEL_REPO_URL.format(id=source_id), repo_url) + def add_dataset_count(self, source_count: int) -> None: + self.add_uint32(Keys.General.DATASET_COUNT, source_count) + + def add_dataset_name(self, source_id: int, name: str) -> None: + self.add_string(Keys.General.DATASET_NAME.format(id=source_id), name) + + def add_dataset_author(self, source_id: int, author: str) -> None: + self.add_string(Keys.General.DATASET_AUTHOR.format(id=source_id), author) + + def add_dataset_version(self, source_id: int, version: str) -> None: + self.add_string(Keys.General.DATASET_VERSION.format(id=source_id), version) + + def add_dataset_organization(self, source_id: int, organization: str) -> None: + self.add_string( + Keys.General.DATASET_ORGANIZATION.format(id=source_id), organization + ) + + def add_dataset_description(self, source_id: int, description: str) -> None: + self.add_string( + Keys.General.DATASET_DESCRIPTION.format(id=source_id), description + ) + + def add_dataset_url(self, source_id: int, url: str) -> None: + self.add_string(Keys.General.DATASET_URL.format(id=source_id), url) + + def add_dataset_doi(self, source_id: int, doi: str) -> None: + self.add_string(Keys.General.DATASET_DOI.format(id=source_id), doi) + + def add_dataset_uuid(self, source_id: int, uuid: str) -> None: + self.add_string(Keys.General.DATASET_UUID.format(id=source_id), uuid) + + def add_dataset_repo_url(self, source_id: int, repo_url: str) -> None: + self.add_string(Keys.General.DATASET_REPO_URL.format(id=source_id), repo_url) + def add_tags(self, tags: Sequence[str]) -> None: self.add_array(Keys.General.TAGS, tags) def add_languages(self, languages: Sequence[str]) -> None: self.add_array(Keys.General.LANGUAGES, languages) - def add_datasets(self, datasets: Sequence[str]) -> None: - self.add_array(Keys.General.DATASETS, datasets) - def add_tensor_data_layout(self, layout: str) -> None: self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout) @@ -664,6 +711,21 @@ def add_context_length(self, length: int) -> None: def add_embedding_length(self, length: int) -> None: self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length) + def add_features_length(self, length: int) -> None: + self.add_uint32(Keys.LLM.FEATURES_LENGTH.format(arch=self.arch), length) + + def add_posnet_embedding_length(self, length: int) -> None: + self.add_uint32(Keys.PosNet.EMBEDDING_LENGTH.format(arch=self.arch), length) + + def add_posnet_block_count(self, 
length: int) -> None: + self.add_uint32(Keys.PosNet.BLOCK_COUNT.format(arch=self.arch), length) + + def add_convnext_embedding_length(self, length: int) -> None: + self.add_uint32(Keys.ConvNext.EMBEDDING_LENGTH.format(arch=self.arch), length) + + def add_convnext_block_count(self, length: int) -> None: + self.add_uint32(Keys.ConvNext.BLOCK_COUNT.format(arch=self.arch), length) + def add_block_count(self, length: int) -> None: self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length) @@ -739,6 +801,15 @@ def add_expert_shared_count(self, count: int) -> None: def add_expert_weights_scale(self, value: float) -> None: self.add_float32(Keys.LLM.EXPERT_WEIGHTS_SCALE.format(arch=self.arch), value) + def add_expert_weights_norm(self, value: bool) -> None: + self.add_bool(Keys.LLM.EXPERT_WEIGHTS_NORM.format(arch=self.arch), value) + + def add_expert_gating_func(self, value: ExpertGatingFuncType) -> None: + self.add_uint32(Keys.LLM.EXPERT_GATING_FUNC.format(arch=self.arch), value.value) + + def add_swin_norm(self, value: bool) -> None: + self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value) + def add_rescale_every_n_layers(self, count: int) -> None: self.add_uint32(Keys.LLM.RESCALE_EVERY_N_LAYERS.format(arch=self.arch), count) @@ -763,6 +834,12 @@ def add_layer_norm_eps(self, value: float) -> None: def add_layer_norm_rms_eps(self, value: float) -> None: self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value) + def add_group_norm_eps(self, value: float) -> None: + self.add_float32(Keys.Attention.GROUPNORM_EPS.format(arch=self.arch), value) + + def add_group_norm_groups(self, value: int) -> None: + self.add_uint32(Keys.Attention.GROUPNORM_GROUPS.format(arch=self.arch), value) + def add_causal_attention(self, value: bool) -> None: self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value) @@ -787,6 +864,9 @@ def add_pooling_type(self, value: PoolingType) -> None: def add_rope_dimension_count(self, count: int) -> None: self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count) + def add_rope_dimension_sections(self, dims: Sequence[int]) -> None: + self.add_array(Keys.Rope.DIMENSION_SECTIONS.format(arch=self.arch), dims) + def add_rope_freq_base(self, value: float) -> None: self.add_float32(Keys.Rope.FREQ_BASE.format(arch=self.arch), value) @@ -893,6 +973,7 @@ def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None: name = choice.get("name", "") template = choice.get("template") + # Allowing non-alphanumerical characters in template name is probably not a good idea, so filter it name = "".join( (c if c in ascii_letters + digits else "_" for c in name) ) @@ -916,15 +997,6 @@ def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None: self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value) - def add_prefix_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.PREFIX_ID, id) - - def add_suffix_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.SUFFIX_ID, id) - - def add_middle_token_id(self, id: int) -> None: - self.add_uint32(Keys.Tokenizer.MIDDLE_ID, id) - def add_eot_token_id(self, id: int) -> None: self.add_uint32(Keys.Tokenizer.EOT_ID, id) diff --git a/src/gguf/lazy.py b/src/gguf/lazy.py index 831e3dc..f3273f5 100644 --- a/src/gguf/lazy.py +++ b/src/gguf/lazy.py @@ -12,6 +12,7 @@ class LazyMeta(ABCMeta): + def __new__( cls, name: str, bases: tuple[type, ...], namespace: dict[str, Any], **kwargs ): @@ -34,7 +35,7 @@ def __getattr__(self, name: str) -> Any: # need to make a 
builder for the wrapped wrapper to copy the name, # or else it fails with very cryptic error messages, - # because somehow the same string would end up in every closure + # because somehow the same string would end up in every closures def mk_wrap(op_name: str, *, meta_noop: bool = False): # need to wrap the wrapper to get self def wrapped_special_op(self, *args, **kwargs): @@ -254,6 +255,8 @@ def from_eager(cls, t: Any) -> Any: class LazyNumpyTensor(LazyBase): _tensor_type = np.ndarray + shape: tuple[int, ...] # Makes the type checker happy in quants.py + @classmethod def meta_with_dtype_and_shape( cls, dtype: DTypeLike, shape: tuple[int, ...] diff --git a/src/gguf/metadata.py b/src/gguf/metadata.py index 6d39f5a..c9046eb 100644 --- a/src/gguf/metadata.py +++ b/src/gguf/metadata.py @@ -41,7 +41,7 @@ class Metadata: base_models: Optional[list[dict]] = None tags: Optional[list[str]] = None languages: Optional[list[str]] = None - datasets: Optional[list[str]] = None + datasets: Optional[list[dict]] = None @staticmethod def load( @@ -50,7 +50,7 @@ def load( model_name: Optional[str] = None, total_params: int = 0, ) -> Metadata: - # This grabs as much contextual authorship metadata as possible from the model repository + # This grabs as many contextual authorship metadata as possible from the model repository # making any conversion as required to match the gguf kv store metadata format # as well as giving users the ability to override any authorship metadata that may be incorrect @@ -126,13 +126,13 @@ def load( "general.base_models", metadata.base_models ) + # Datasets is received here as an array of datasets + metadata.datasets = metadata_override.get("general.datasets", metadata.datasets) + metadata.tags = metadata_override.get(Keys.General.TAGS, metadata.tags) metadata.languages = metadata_override.get( Keys.General.LANGUAGES, metadata.languages ) - metadata.datasets = metadata_override.get( - Keys.General.DATASETS, metadata.datasets - ) # Direct Metadata Override (via direct cli argument) if model_name is not None: @@ -228,7 +228,11 @@ def get_model_id_components( org_component, model_full_name_component = None, model_id # Check if we erroneously matched against './' or '../' etc... - if org_component is not None and org_component[0] == ".": + if ( + org_component is not None + and len(org_component) > 0 + and org_component[0] == "." 
+ ): org_component = None name_parts: list[str] = model_full_name_component.split("-") @@ -387,27 +391,86 @@ def apply_metadata_heuristic( ######################## if model_card is not None: - if "model_name" in model_card and metadata.name is None: - # Not part of huggingface model card standard but notice some model creator using it - # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF' - metadata.name = model_card.get("model_name") + def use_model_card_metadata(metadata_key: str, model_card_key: str): + if ( + model_card_key in model_card + and getattr(metadata, metadata_key, None) is None + ): + setattr(metadata, metadata_key, model_card.get(model_card_key)) - if "model_creator" in model_card and metadata.author is None: - # Not part of huggingface model card standard but notice some model creator using it - # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF' - metadata.author = model_card.get("model_creator") + def use_array_model_card_metadata(metadata_key: str, model_card_key: str): + # Note: Will append rather than replace if already exist + tags_value = model_card.get(model_card_key, None) + if tags_value is None: + return - if "model_type" in model_card and metadata.basename is None: - # Not part of huggingface model card standard but notice some model creator using it - # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF' - metadata.basename = model_card.get("model_type") + current_value = getattr(metadata, metadata_key, None) + if current_value is None: + current_value = [] - if "base_model" in model_card: + if isinstance(tags_value, str): + current_value.append(tags_value) + elif isinstance(tags_value, list): + current_value.extend(tags_value) + + setattr(metadata, metadata_key, current_value) + + # LLAMA.cpp's direct internal convention + # (Definitely not part of hugging face formal/informal standard) + ######################################### + use_model_card_metadata("name", "name") + use_model_card_metadata("author", "author") + use_model_card_metadata("version", "version") + use_model_card_metadata("organization", "organization") + use_model_card_metadata("description", "description") + use_model_card_metadata("finetune", "finetune") + use_model_card_metadata("basename", "basename") + use_model_card_metadata("size_label", "size_label") + use_model_card_metadata("source_url", "url") + use_model_card_metadata("source_doi", "doi") + use_model_card_metadata("source_uuid", "uuid") + use_model_card_metadata("source_repo_url", "repo_url") + + # LLAMA.cpp's huggingface style convention + # (Definitely not part of hugging face formal/informal standard... 
but with model_ appended to match their style) + ########################################### + use_model_card_metadata("name", "model_name") + use_model_card_metadata("author", "model_author") + use_model_card_metadata("version", "model_version") + use_model_card_metadata("organization", "model_organization") + use_model_card_metadata("description", "model_description") + use_model_card_metadata("finetune", "model_finetune") + use_model_card_metadata("basename", "model_basename") + use_model_card_metadata("size_label", "model_size_label") + use_model_card_metadata("source_url", "model_url") + use_model_card_metadata("source_doi", "model_doi") + use_model_card_metadata("source_uuid", "model_uuid") + use_model_card_metadata("source_repo_url", "model_repo_url") + + # Hugging Face Direct Convention + ################################# + + # Not part of huggingface model card standard but notice some model creator using it + # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF' + use_model_card_metadata("name", "model_name") + use_model_card_metadata("author", "model_creator") + use_model_card_metadata("basename", "model_type") + + if ( + "base_model" in model_card + or "base_models" in model_card + or "base_model_sources" in model_card + ): # This represents the parent models that this is based on # Example: stabilityai/stable-diffusion-xl-base-1.0. Can also be a list (for merges) # Example of merges: https://huggingface.co/EmbeddedLLM/Mistral-7B-Merge-14-v0.1/blob/main/README.md metadata_base_models = [] - base_model_value = model_card.get("base_model", None) + base_model_value = model_card.get( + "base_model", + model_card.get( + "base_models", model_card.get("base_model_sources", None) + ), + ) if base_model_value is not None: if isinstance(base_model_value, str): @@ -420,86 +483,195 @@ def apply_metadata_heuristic( for model_id in metadata_base_models: # NOTE: model size of base model is assumed to be similar to the size of the current model - ( - model_full_name_component, - org_component, - basename, - finetune, - version, - size_label, - ) = Metadata.get_model_id_components(model_id, total_params) base_model = {} - if model_full_name_component is not None: - base_model["name"] = Metadata.id_to_title( - model_full_name_component - ) - if org_component is not None: - base_model["organization"] = Metadata.id_to_title(org_component) - if version is not None: - base_model["version"] = version - if ( - org_component is not None - and model_full_name_component is not None - ): - base_model["repo_url"] = ( - f"https://huggingface.co/{org_component}/{model_full_name_component}" + if isinstance(model_id, str): + if ( + model_id.startswith("http://") + or model_id.startswith("https://") + or model_id.startswith("ssh://") + ): + base_model["repo_url"] = model_id + + # Check if Hugging Face ID is present in URL + if "huggingface.co" in model_id: + match = re.match( + r"https?://huggingface.co/([^/]+/[^/]+)$", model_id + ) + if match: + model_id_component = match.group(1) + ( + model_full_name_component, + org_component, + basename, + finetune, + version, + size_label, + ) = Metadata.get_model_id_components( + model_id_component, total_params + ) + + # Populate model dictionary with extracted components + if model_full_name_component is not None: + base_model["name"] = Metadata.id_to_title( + model_full_name_component + ) + if org_component is not None: + base_model["organization"] = ( + Metadata.id_to_title(org_component) + ) + if version is not None: + base_model["version"] = version + + 
else: + # Likely a Hugging Face ID + ( + model_full_name_component, + org_component, + basename, + finetune, + version, + size_label, + ) = Metadata.get_model_id_components(model_id, total_params) + + # Populate model dictionary with extracted components + if model_full_name_component is not None: + base_model["name"] = Metadata.id_to_title( + model_full_name_component + ) + if org_component is not None: + base_model["organization"] = Metadata.id_to_title( + org_component + ) + if version is not None: + base_model["version"] = version + if ( + org_component is not None + and model_full_name_component is not None + ): + base_model["repo_url"] = ( + f"https://huggingface.co/{org_component}/{model_full_name_component}" + ) + + elif isinstance(model_id, dict): + base_model = model_id + + else: + logger.error( + f"base model entry '{str(model_id)}' not in a known format" ) - metadata.base_models.append(base_model) - - if "license" in model_card and metadata.license is None: - metadata.license = model_card.get("license") - - if "license_name" in model_card and metadata.license_name is None: - metadata.license_name = model_card.get("license_name") - - if "license_link" in model_card and metadata.license_link is None: - metadata.license_link = model_card.get("license_link") - - tags_value = model_card.get("tags", None) - if tags_value is not None: - - if metadata.tags is None: - metadata.tags = [] - - if isinstance(tags_value, str): - metadata.tags.append(tags_value) - elif isinstance(tags_value, list): - metadata.tags.extend(tags_value) - pipeline_tags_value = model_card.get("pipeline_tag", None) - if pipeline_tags_value is not None: + metadata.base_models.append(base_model) - if metadata.tags is None: - metadata.tags = [] + if ( + "datasets" in model_card + or "dataset" in model_card + or "dataset_sources" in model_card + ): + # This represents the datasets that this was trained from + metadata_datasets = [] + dataset_value = model_card.get( + "datasets", + model_card.get("dataset", model_card.get("dataset_sources", None)), + ) - if isinstance(pipeline_tags_value, str): - metadata.tags.append(pipeline_tags_value) - elif isinstance(pipeline_tags_value, list): - metadata.tags.extend(pipeline_tags_value) + if dataset_value is not None: + if isinstance(dataset_value, str): + metadata_datasets.append(dataset_value) + elif isinstance(dataset_value, list): + metadata_datasets.extend(dataset_value) - language_value = model_card.get( - "languages", model_card.get("language", None) - ) - if language_value is not None: + if metadata.datasets is None: + metadata.datasets = [] - if metadata.languages is None: - metadata.languages = [] + for dataset_id in metadata_datasets: + # NOTE: model size of base model is assumed to be similar to the size of the current model + dataset = {} + if isinstance(dataset_id, str): + if dataset_id.startswith(("http://", "https://", "ssh://")): + dataset["repo_url"] = dataset_id + + # Check if Hugging Face ID is present in URL + if "huggingface.co" in dataset_id: + match = re.match( + r"https?://huggingface.co/([^/]+/[^/]+)$", + dataset_id, + ) + if match: + dataset_id_component = match.group(1) + ( + dataset_name_component, + org_component, + basename, + finetune, + version, + size_label, + ) = Metadata.get_model_id_components( + dataset_id_component, total_params + ) + + # Populate dataset dictionary with extracted components + if dataset_name_component is not None: + dataset["name"] = Metadata.id_to_title( + dataset_name_component + ) + if org_component is not None: + 
dataset["organization"] = Metadata.id_to_title( + org_component + ) + if version is not None: + dataset["version"] = version + + else: + # Likely a Hugging Face ID + ( + dataset_name_component, + org_component, + basename, + finetune, + version, + size_label, + ) = Metadata.get_model_id_components( + dataset_id, total_params + ) + + # Populate dataset dictionary with extracted components + if dataset_name_component is not None: + dataset["name"] = Metadata.id_to_title( + dataset_name_component + ) + if org_component is not None: + dataset["organization"] = Metadata.id_to_title( + org_component + ) + if version is not None: + dataset["version"] = version + if ( + org_component is not None + and dataset_name_component is not None + ): + dataset["repo_url"] = ( + f"https://huggingface.co/{org_component}/{dataset_name_component}" + ) + + elif isinstance(dataset_id, dict): + dataset = dataset_id + + else: + logger.error( + f"dataset entry '{str(dataset_id)}' not in a known format" + ) - if isinstance(language_value, str): - metadata.languages.append(language_value) - elif isinstance(language_value, list): - metadata.languages.extend(language_value) + metadata.datasets.append(dataset) - dataset_value = model_card.get("datasets", model_card.get("dataset", None)) - if dataset_value is not None: + use_model_card_metadata("license", "license") + use_model_card_metadata("license_name", "license_name") + use_model_card_metadata("license_link", "license_link") - if metadata.datasets is None: - metadata.datasets = [] + use_array_model_card_metadata("tags", "tags") + use_array_model_card_metadata("tags", "pipeline_tag") - if isinstance(dataset_value, str): - metadata.datasets.append(dataset_value) - elif isinstance(dataset_value, list): - metadata.datasets.extend(dataset_value) + use_array_model_card_metadata("languages", "languages") + use_array_model_card_metadata("languages", "language") # Hugging Face Parameter Heuristics #################################### @@ -508,7 +680,7 @@ def apply_metadata_heuristic( hf_name_or_path = hf_params.get("_name_or_path") if hf_name_or_path is not None and hf_name_or_path.count("/") <= 1: - # Use _name_or_path only if it's actually a model name and not some computer path + # Use _name_or_path only if its actually a model name and not some computer path # e.g. 
'meta-llama/Llama-2-7b-hf' model_id = hf_name_or_path ( @@ -584,7 +756,10 @@ def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter): gguf_writer.add_size_label(self.size_label) if self.license is not None: - gguf_writer.add_license(self.license) + if isinstance(self.license, list): + gguf_writer.add_license(",".join(self.license)) + else: + gguf_writer.add_license(self.license) if self.license_name is not None: gguf_writer.add_license_name(self.license_name) if self.license_link is not None: @@ -621,6 +796,10 @@ def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter): gguf_writer.add_base_model_organization( key, base_model_entry["organization"] ) + if "description" in base_model_entry: + gguf_writer.add_base_model_description( + key, base_model_entry["description"] + ) if "url" in base_model_entry: gguf_writer.add_base_model_url(key, base_model_entry["url"]) if "doi" in base_model_entry: @@ -632,9 +811,33 @@ def set_gguf_meta_model(self, gguf_writer: gguf.GGUFWriter): key, base_model_entry["repo_url"] ) + if self.datasets is not None: + gguf_writer.add_dataset_count(len(self.datasets)) + for key, dataset_entry in enumerate(self.datasets): + if "name" in dataset_entry: + gguf_writer.add_dataset_name(key, dataset_entry["name"]) + if "author" in dataset_entry: + gguf_writer.add_dataset_author(key, dataset_entry["author"]) + if "version" in dataset_entry: + gguf_writer.add_dataset_version(key, dataset_entry["version"]) + if "organization" in dataset_entry: + gguf_writer.add_dataset_organization( + key, dataset_entry["organization"] + ) + if "description" in dataset_entry: + gguf_writer.add_dataset_description( + key, dataset_entry["description"] + ) + if "url" in dataset_entry: + gguf_writer.add_dataset_url(key, dataset_entry["url"]) + if "doi" in dataset_entry: + gguf_writer.add_dataset_doi(key, dataset_entry["doi"]) + if "uuid" in dataset_entry: + gguf_writer.add_dataset_uuid(key, dataset_entry["uuid"]) + if "repo_url" in dataset_entry: + gguf_writer.add_dataset_repo_url(key, dataset_entry["repo_url"]) + if self.tags is not None: gguf_writer.add_tags(self.tags) if self.languages is not None: gguf_writer.add_languages(self.languages) - if self.datasets is not None: - gguf_writer.add_datasets(self.datasets) diff --git a/src/gguf/quants.py b/src/gguf/quants.py index a5d8b44..445c43e 100644 --- a/src/gguf/quants.py +++ b/src/gguf/quants.py @@ -1,15 +1,19 @@ from __future__ import annotations -from typing import Callable, Sequence +from abc import ABC, abstractmethod +from typing import Any, Callable, Sequence +from math import log2, ceil from numpy.typing import DTypeLike -from .constants import GGML_QUANT_SIZES, GGMLQuantizationType +from .constants import GGML_QUANT_SIZES, GGMLQuantizationType, QK_K from .lazy import LazyNumpyTensor import numpy as np -def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType): +def quant_shape_to_byte_shape( + shape: Sequence[int], quant_type: GGMLQuantizationType +) -> tuple[int, ...]: block_size, type_size = GGML_QUANT_SIZES[quant_type] if shape[-1] % block_size != 0: raise ValueError( @@ -18,7 +22,9 @@ def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantization return (*shape[:-1], shape[-1] // block_size * type_size) -def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType): +def quant_shape_from_byte_shape( + shape: Sequence[int], quant_type: GGMLQuantizationType +) -> tuple[int, ...]: block_size, type_size = GGML_QUANT_SIZES[quant_type] if shape[-1] % type_size != 
0: raise ValueError( @@ -27,22 +33,8 @@ def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizati return (*shape[:-1], shape[-1] // type_size * block_size) -# same as ggml_compute_fp32_to_bf16 in ggml-impl.h -def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray: - n = n.astype(np.float32, copy=False).view(np.uint32) - # force nan to quiet - n = np.where( - (n & 0x7FFFFFFF) > 0x7F800000, - (n & np.uint32(0xFFFF0000)) | np.uint32(64 << 16), - n, - ) - # round to nearest even - n = (np.uint64(n) + (0x7FFF + ((n >> 16) & 1))) >> 16 - return n.astype(np.uint16) - - # This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time -def __apply_over_grouped_rows( +def _apply_over_grouped_rows( func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, @@ -63,85 +55,1398 @@ def __apply_over_grouped_rows( return out.reshape(oshape) -def __quantize_bf16_array(n: np.ndarray) -> np.ndarray: - return __apply_over_grouped_rows( - __compute_fp32_to_bf16, arr=n, otype=np.uint16, oshape=n.shape - ) +# round away from zero +# ref: https://stackoverflow.com/a/59143326/22827863 +def np_roundf(n: np.ndarray) -> np.ndarray: + a = abs(n) + floored = np.floor(a) + b = floored + np.floor(2 * (a - floored)) + return np.sign(n) * b + + +class QuantError(Exception): ... -__quantize_bf16_lazy = LazyNumpyTensor._wrap_fn( - __quantize_bf16_array, meta_noop=np.uint16 -) +_type_traits: dict[GGMLQuantizationType, type[__Quant]] = {} -def quantize_bf16(n: np.ndarray): - if type(n) is LazyNumpyTensor: - return __quantize_bf16_lazy(n) +def quantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray: + if qtype == GGMLQuantizationType.F32: + return data.astype(np.float32, copy=False) + elif qtype == GGMLQuantizationType.F16: + return data.astype(np.float16, copy=False) + elif (q := _type_traits.get(qtype)) is not None: + return q.quantize(data) else: - return __quantize_bf16_array(n) + raise NotImplementedError( + f"Quantization for {qtype.name} is not yet implemented" + ) -__q8_block_size, __q8_type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q8_0] +def dequantize(data: np.ndarray, qtype: GGMLQuantizationType) -> np.ndarray: + if qtype == GGMLQuantizationType.F32: + return data.view(np.float32) + elif qtype == GGMLQuantizationType.F16: + return data.view(np.float16).astype(np.float32) + elif (q := _type_traits.get(qtype)) is not None: + return q.dequantize(data) + else: + raise NotImplementedError( + f"Dequantization for {qtype.name} is not yet implemented" + ) -def can_quantize_to_q8_0(n: np.ndarray) -> bool: - return n.shape[-1] % __q8_block_size == 0 +class __Quant(ABC): + qtype: GGMLQuantizationType + block_size: int + type_size: int + grid: np.ndarray[Any, np.dtype[np.float32]] | None = None + grid_shape: tuple[int, int] = (0, 0) + grid_map: tuple[int | float, ...] 
= () + grid_hex: bytes | None = None -# round away from zero -# ref: https://stackoverflow.com/a/59143326/22827863 -def np_roundf(n: np.ndarray) -> np.ndarray: - a = abs(n) - floored = np.floor(a) - b = floored + np.floor(2 * (a - floored)) - return np.sign(n) * b + def __init__(self): + return TypeError("Quant conversion classes can't have instances") + + def __init_subclass__(cls, qtype: GGMLQuantizationType) -> None: + cls.qtype = qtype + cls.block_size, cls.type_size = GGML_QUANT_SIZES[qtype] + cls.__quantize_lazy = LazyNumpyTensor._wrap_fn( + cls.__quantize_array, meta_noop=(np.uint8, cls.__shape_to_bytes) + ) + cls.__dequantize_lazy = LazyNumpyTensor._wrap_fn( + cls.__dequantize_array, meta_noop=(np.float32, cls.__shape_from_bytes) + ) + assert qtype not in _type_traits + _type_traits[qtype] = cls + + @classmethod + def init_grid(cls): + if cls.grid is not None or cls.grid_hex is None: + return + + bits_per_elem = ceil(log2(len(cls.grid_map))) + assert bits_per_elem != 0, cls.qtype.name + elems_per_byte = 8 // bits_per_elem + + grid = np.frombuffer(cls.grid_hex, dtype=np.uint8) + # decode hexadecimal chars from grid + grid = grid.reshape((-1, 2)) + grid = (np.where(grid > 0x40, grid + 9, grid) & 0x0F) << np.array( + [4, 0], dtype=np.uint8 + ).reshape((1, 2)) + grid = grid[..., 0] | grid[..., 1] + # unpack the grid values + grid = grid.reshape((-1, 1)) >> np.array( + [i for i in range(0, 8, 8 // elems_per_byte)], dtype=np.uint8 + ).reshape((1, elems_per_byte)) + grid = (grid & ((1 << bits_per_elem) - 1)).reshape((-1, 1)) + grid_map = np.array(cls.grid_map, dtype=np.float32).reshape((1, -1)) + grid = np.take_along_axis(grid_map, grid, axis=-1) + cls.grid = grid.reshape((1, 1, *cls.grid_shape)) + + @classmethod + @abstractmethod + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + raise NotImplementedError + + @classmethod + @abstractmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + raise NotImplementedError + + @classmethod + def quantize_rows(cls, rows: np.ndarray) -> np.ndarray: + rows = rows.astype(np.float32, copy=False) + shape = rows.shape + n_blocks = rows.size // cls.block_size + blocks = rows.reshape((n_blocks, cls.block_size)) + blocks = cls.quantize_blocks(blocks) + assert blocks.dtype == np.uint8 + assert blocks.shape[-1] == cls.type_size + return blocks.reshape(cls.__shape_to_bytes(shape)) + + @classmethod + def dequantize_rows(cls, rows: np.ndarray) -> np.ndarray: + rows = rows.view(np.uint8) + shape = rows.shape + n_blocks = rows.size // cls.type_size + blocks = rows.reshape((n_blocks, cls.type_size)) + blocks = cls.dequantize_blocks(blocks) + assert blocks.dtype == np.float32 + assert blocks.shape[-1] == cls.block_size + return blocks.reshape(cls.__shape_from_bytes(shape)) + + @classmethod + def __shape_to_bytes(cls, shape: Sequence[int]): + return quant_shape_to_byte_shape(shape, cls.qtype) + + @classmethod + def __shape_from_bytes(cls, shape: Sequence[int]): + return quant_shape_from_byte_shape(shape, cls.qtype) + + @classmethod + def __quantize_array(cls, array: np.ndarray) -> np.ndarray: + return _apply_over_grouped_rows( + cls.quantize_rows, + arr=array, + otype=np.uint8, + oshape=cls.__shape_to_bytes(array.shape), + ) + + @classmethod + def __dequantize_array(cls, array: np.ndarray) -> np.ndarray: + cls.init_grid() + return _apply_over_grouped_rows( + cls.dequantize_rows, + arr=array, + otype=np.float32, + oshape=cls.__shape_from_bytes(array.shape), + ) + + @classmethod + def __quantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) 
-> Any: + pass + + @classmethod + def __dequantize_lazy(cls, lazy_tensor: LazyNumpyTensor, /) -> Any: + pass + + @classmethod + def can_quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> bool: + return tensor.shape[-1] % cls.block_size == 0 + + @classmethod + def quantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray: + if not cls.can_quantize(tensor): + raise QuantError( + f"Can't quantize tensor with shape {tensor.shape} to {cls.qtype.name}" + ) + if isinstance(tensor, LazyNumpyTensor): + return cls.__quantize_lazy(tensor) + else: + return cls.__quantize_array(tensor) + + @classmethod + def dequantize(cls, tensor: np.ndarray | LazyNumpyTensor) -> np.ndarray: + if isinstance(tensor, LazyNumpyTensor): + return cls.__dequantize_lazy(tensor) + else: + return cls.__dequantize_array(tensor) + + +class BF16(__Quant, qtype=GGMLQuantizationType.BF16): + @classmethod + # same as ggml_compute_fp32_to_bf16 in ggml-impl.h + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n = blocks.view(np.uint32) + # force nan to quiet + n = np.where( + (n & 0x7FFFFFFF) > 0x7F800000, + (n & np.uint32(0xFFFF0000)) | np.uint32(64 << 16), + n, + ) + # round to nearest even + n = (np.uint64(n) + (0x7FFF + ((n >> 16) & 1))) >> 16 + return n.astype(np.uint16).view(np.uint8) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + return (blocks.view(np.int16).astype(np.int32) << 16).view(np.float32) + + +class Q4_0(__Quant, qtype=GGMLQuantizationType.Q4_0): + @classmethod + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + imax = abs(blocks).argmax(axis=-1, keepdims=True) + max = np.take_along_axis(blocks, imax, axis=-1) + + d = max / -8 + with np.errstate(divide="ignore"): + id = np.where(d == 0, 0, 1 / d) + # FIXME: Q4_0's reference rounding is cursed and depends on FMA + qs = ( + np.trunc( + (np.float64(blocks) * np.float64(id)) + np.float64(8.5), + dtype=np.float32, + ) + .astype(np.uint8) + .clip(0, 15) + ) + + qs = qs.reshape((n_blocks, 2, cls.block_size // 2)) + qs = qs[..., 0, :] | (qs[..., 1, :] << np.uint8(4)) + + d = d.astype(np.float16).view(np.uint8) + + return np.concatenate([d, qs], axis=-1) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, qs = np.hsplit(blocks, [2]) + + d = d.view(np.float16).astype(np.float32) + + qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array( + [0, 4], dtype=np.uint8 + ).reshape((1, 1, 2, 1)) + qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1)).astype(np.int8) - np.int8(8) + + return d * qs.astype(np.float32) + + +class Q4_1(__Quant, qtype=GGMLQuantizationType.Q4_1): + @classmethod + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + max = blocks.max(axis=-1, keepdims=True) + min = blocks.min(axis=-1, keepdims=True) + + d = (max - min) / 15 + with np.errstate(divide="ignore"): + id = np.where(d == 0, 0, 1 / d) + qs = ( + np.trunc((blocks - min) * id + np.float32(0.5), dtype=np.float32) + .astype(np.uint8) + .clip(0, 15) + ) + + qs = qs.reshape((n_blocks, 2, cls.block_size // 2)) + qs = qs[..., 0, :] | (qs[..., 1, :] << np.uint8(4)) + + d = d.astype(np.float16).view(np.uint8) + m = min.astype(np.float16).view(np.uint8) + + return np.concatenate([d, m, qs], axis=-1) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + m, qs = np.hsplit(rest, [2]) + + d = 
d.view(np.float16).astype(np.float32) + m = m.view(np.float16).astype(np.float32) + + qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array( + [0, 4], dtype=np.uint8 + ).reshape((1, 1, 2, 1)) + qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1)).astype(np.float32) + + return (d * qs) + m + + +class Q5_0(__Quant, qtype=GGMLQuantizationType.Q5_0): + @classmethod + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + imax = abs(blocks).argmax(axis=-1, keepdims=True) + max = np.take_along_axis(blocks, imax, axis=-1) + + d = max / -16 + with np.errstate(divide="ignore"): + id = np.where(d == 0, 0, 1 / d) + # FIXME: Q5_0's reference rounding is cursed and depends on FMA + q = ( + np.trunc( + (np.float64(blocks) * np.float64(id)) + np.float64(16.5), + dtype=np.float32, + ) + .astype(np.uint8) + .clip(0, 31) + ) + + qs = q.reshape((n_blocks, 2, cls.block_size // 2)) + qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4)) + + qh = np.packbits( + q.reshape((n_blocks, 1, 32)) >> np.uint8(4), axis=-1, bitorder="little" + ).reshape(n_blocks, 4) + + d = d.astype(np.float16).view(np.uint8) + + return np.concatenate([d, qh, qs], axis=-1) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + qh, qs = np.hsplit(rest, [4]) + + d = d.view(np.float16).astype(np.float32) + qh = qh.view(np.uint32) + + qh = qh.reshape((n_blocks, 1)) >> np.array( + [i for i in range(32)], dtype=np.uint32 + ).reshape((1, 32)) + ql = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array( + [0, 4], dtype=np.uint8 + ).reshape((1, 1, 2, 1)) + qh = (qh & np.uint32(0x01)).astype(np.uint8) + ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1)) + + qs = (ql | (qh << np.uint8(4))).astype(np.int8) - np.int8(16) + + return d * qs.astype(np.float32) + + +class Q5_1(__Quant, qtype=GGMLQuantizationType.Q5_1): + @classmethod + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + max = blocks.max(axis=-1, keepdims=True) + min = blocks.min(axis=-1, keepdims=True) + + d = (max - min) / 31 + with np.errstate(divide="ignore"): + id = np.where(d == 0, 0, 1 / d) + q = ( + np.trunc((blocks - min) * id + np.float32(0.5), dtype=np.float32) + .astype(np.uint8) + .clip(0, 31) + ) + + qs = q.reshape((n_blocks, 2, cls.block_size // 2)) + qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4)) + + qh = np.packbits( + q.reshape((n_blocks, 1, 32)) >> np.uint8(4), axis=-1, bitorder="little" + ).reshape(n_blocks, 4) + + d = d.astype(np.float16).view(np.uint8) + m = min.astype(np.float16).view(np.uint8) + + return np.concatenate([d, m, qh, qs], axis=-1) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + m, rest = np.hsplit(rest, [2]) + qh, qs = np.hsplit(rest, [4]) + + d = d.view(np.float16).astype(np.float32) + m = m.view(np.float16).astype(np.float32) + qh = qh.view(np.uint32) + + qh = qh.reshape((n_blocks, 1)) >> np.array( + [i for i in range(32)], dtype=np.uint32 + ).reshape((1, 32)) + ql = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array( + [0, 4], dtype=np.uint8 + ).reshape((1, 1, 2, 1)) + qh = (qh & np.uint32(0x01)).astype(np.uint8) + ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1)) + + qs = (ql | (qh << np.uint8(4))).astype(np.float32) + + return (d * qs) + m + + +class Q8_0(__Quant, qtype=GGMLQuantizationType.Q8_0): + 
@classmethod + # Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + + d = abs(blocks).max(axis=1, keepdims=True) / 127 + with np.errstate(divide="ignore"): + id = np.where(d == 0, 0, 1 / d) + qs = np_roundf(blocks * id) + + # (n_blocks, 2) + d = d.astype(np.float16).view(np.uint8) + # (n_blocks, block_size) + qs = qs.astype(np.int8).view(np.uint8) + + return np.concatenate([d, qs], axis=1) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + d, x = np.split(blocks, [2], axis=1) + d = d.view(np.float16).astype(np.float32) + x = x.view(np.int8).astype(np.float32) + + return x * d + + +class Q2_K(__Quant, qtype=GGMLQuantizationType.Q2_K): + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + scales, rest = np.hsplit(blocks, [QK_K // 16]) + qs, rest = np.hsplit(rest, [QK_K // 4]) + d, dmin = np.hsplit(rest, [2]) + + d = d.view(np.float16).astype(np.float32) + dmin = dmin.view(np.float16).astype(np.float32) + + # (n_blocks, 16, 1) + dl = (d * (scales & 0xF).astype(np.float32)).reshape((n_blocks, QK_K // 16, 1)) + ml = (dmin * (scales >> 4).astype(np.float32)).reshape( + (n_blocks, QK_K // 16, 1) + ) + + shift = np.array([0, 2, 4, 6], dtype=np.uint8).reshape((1, 1, 4, 1)) + + qs = (qs.reshape((n_blocks, -1, 1, 32)) >> shift) & np.uint8(3) + + qs = qs.reshape((n_blocks, QK_K // 16, 16)).astype(np.float32) + + qs = dl * qs - ml + + return qs.reshape((n_blocks, -1)) + + +class Q3_K(__Quant, qtype=GGMLQuantizationType.Q3_K): + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + hmask, rest = np.hsplit(blocks, [QK_K // 8]) + qs, rest = np.hsplit(rest, [QK_K // 4]) + scales, d = np.hsplit(rest, [12]) + + d = d.view(np.float16).astype(np.float32) + + # The scales are packed at 6-bit each in this pattern: + # 0: IIIIAAAA + # 1: JJJJBBBB + # 2: KKKKCCCC + # 3: LLLLDDDD + # 4: MMMMEEEE + # 5: NNNNFFFF + # 6: OOOOGGGG + # 7: PPPPHHHH + # 8: MMIIEEAA + # 9: NNJJFFBB + # 10: OOKKGGCC + # 11: PPLLHHDD + lscales, hscales = np.hsplit(scales, [8]) + lscales = lscales.reshape((n_blocks, 1, 8)) >> np.array( + [0, 4], dtype=np.uint8 + ).reshape((1, 2, 1)) + lscales = lscales.reshape((n_blocks, 16)) + hscales = hscales.reshape((n_blocks, 1, 4)) >> np.array( + [0, 2, 4, 6], dtype=np.uint8 + ).reshape((1, 4, 1)) + hscales = hscales.reshape((n_blocks, 16)) + scales = (lscales & np.uint8(0x0F)) | ( + (hscales & np.uint8(0x03)) << np.uint8(4) + ) + scales = (scales.astype(np.int8) - np.int8(32)).astype(np.float32) + + dl = (d * scales).reshape((n_blocks, 16, 1)) + + ql = qs.reshape((n_blocks, -1, 1, 32)) >> np.array( + [0, 2, 4, 6], dtype=np.uint8 + ).reshape((1, 1, 4, 1)) + qh = hmask.reshape(n_blocks, -1, 1, 32) >> np.array( + [i for i in range(8)], dtype=np.uint8 + ).reshape((1, 1, 8, 1)) + ql = ql.reshape((n_blocks, 16, QK_K // 16)) & np.uint8(3) + qh = qh.reshape((n_blocks, 16, QK_K // 16)) & np.uint8(1) + qh = qh ^ np.uint8(1) # strangely, the offset is zero when the bitmask is 1 + q = (ql.astype(np.int8) - (qh << np.uint8(2)).astype(np.int8)).astype( + np.float32 + ) + + return (dl * q).reshape((n_blocks, QK_K)) + + +class Q4_K(__Quant, qtype=GGMLQuantizationType.Q4_K): + K_SCALE_SIZE = 12 + + @staticmethod + def get_scale_min(scales: np.ndarray) -> tuple[np.ndarray, np.ndarray]: + n_blocks = scales.shape[0] + scales = scales.view(np.uint8) + ### Unpacking the 
following: ### + # 0 EEAAAAAA + # 1 FFBBBBBB + # 2 GGCCCCCC + # 3 HHDDDDDD + # 4 eeaaaaaa + # 5 ffbbbbbb + # 6 ggcccccc + # 7 hhdddddd + # 8 eeeeEEEE + # 9 ffffFFFF + # 10 ggggGGGG + # 11 hhhhHHHH + scales = scales.reshape((n_blocks, 3, 4)) + d, m, m_d = np.split(scales, 3, axis=-2) + sc = np.concatenate([d & 0x3F, (m_d & 0x0F) | ((d >> 2) & 0x30)], axis=-1) + min = np.concatenate([m & 0x3F, (m_d >> 4) | ((m >> 2) & 0x30)], axis=-1) -def __quantize_q8_0_shape_change(s: tuple[int, ...]) -> tuple[int, ...]: - return (*s[:-1], s[-1] // __q8_block_size * __q8_type_size) + return (sc.reshape((n_blocks, 8)), min.reshape((n_blocks, 8))) + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] -# Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c -def __quantize_q8_0_rows(n: np.ndarray) -> np.ndarray: - shape = n.shape - assert shape[-1] % __q8_block_size == 0 + d, rest = np.hsplit(blocks, [2]) + dmin, rest = np.hsplit(rest, [2]) + scales, qs = np.hsplit(rest, [cls.K_SCALE_SIZE]) - n_blocks = n.size // __q8_block_size + d = d.view(np.float16).astype(np.float32) + dmin = dmin.view(np.float16).astype(np.float32) - blocks = n.reshape((n_blocks, __q8_block_size)).astype(np.float32, copy=False) + sc, m = Q4_K.get_scale_min(scales) - d = abs(blocks).max(axis=1, keepdims=True) / 127 - with np.errstate(divide="ignore"): - id = np.where(d == 0, 0, 1 / d) - qs = np_roundf(blocks * id) + d = (d * sc.astype(np.float32)).reshape((n_blocks, -1, 1)) + dm = (dmin * m.astype(np.float32)).reshape((n_blocks, -1, 1)) - # (n_blocks, 2) - d = d.astype(np.float16).view(np.uint8) - # (n_blocks, block_size) - qs = qs.astype(np.int8).view(np.uint8) + qs = qs.reshape((n_blocks, -1, 1, 32)) >> np.array( + [0, 4], dtype=np.uint8 + ).reshape((1, 1, 2, 1)) + qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1, 32)).astype(np.float32) - assert d.shape[1] + qs.shape[1] == __q8_type_size + return (d * qs - dm).reshape((n_blocks, QK_K)) - return np.concatenate([d, qs], axis=1).reshape(__quantize_q8_0_shape_change(shape)) +class Q5_K(__Quant, qtype=GGMLQuantizationType.Q5_K): + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] -def __quantize_q8_0_array(n: np.ndarray) -> np.ndarray: - return __apply_over_grouped_rows( - __quantize_q8_0_rows, - arr=n, - otype=np.uint8, - oshape=__quantize_q8_0_shape_change(n.shape), + d, rest = np.hsplit(blocks, [2]) + dmin, rest = np.hsplit(rest, [2]) + scales, rest = np.hsplit(rest, [Q4_K.K_SCALE_SIZE]) + qh, qs = np.hsplit(rest, [QK_K // 8]) + + d = d.view(np.float16).astype(np.float32) + dmin = dmin.view(np.float16).astype(np.float32) + + sc, m = Q4_K.get_scale_min(scales) + + d = (d * sc.astype(np.float32)).reshape((n_blocks, -1, 1)) + dm = (dmin * m.astype(np.float32)).reshape((n_blocks, -1, 1)) + + ql = qs.reshape((n_blocks, -1, 1, 32)) >> np.array( + [0, 4], dtype=np.uint8 + ).reshape((1, 1, 2, 1)) + qh = qh.reshape((n_blocks, -1, 1, 32)) >> np.array( + [i for i in range(8)], dtype=np.uint8 + ).reshape((1, 1, 8, 1)) + ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1, 32)) + qh = (qh & np.uint8(0x01)).reshape((n_blocks, -1, 32)) + q = (ql | (qh << np.uint8(4))).astype(np.float32) + + return (d * q - dm).reshape((n_blocks, QK_K)) + + +class Q6_K(__Quant, qtype=GGMLQuantizationType.Q6_K): + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + ql, rest = np.hsplit(blocks, [QK_K // 2]) + qh, 
rest = np.hsplit(rest, [QK_K // 4]) + scales, d = np.hsplit(rest, [QK_K // 16]) + + scales = scales.view(np.int8).astype(np.float32) + d = d.view(np.float16).astype(np.float32) + d = (d * scales).reshape((n_blocks, QK_K // 16, 1)) + + ql = ql.reshape((n_blocks, -1, 1, 64)) >> np.array( + [0, 4], dtype=np.uint8 + ).reshape((1, 1, 2, 1)) + ql = (ql & np.uint8(0x0F)).reshape((n_blocks, -1, 32)) + qh = qh.reshape((n_blocks, -1, 1, 32)) >> np.array( + [0, 2, 4, 6], dtype=np.uint8 + ).reshape((1, 1, 4, 1)) + qh = (qh & np.uint8(0x03)).reshape((n_blocks, -1, 32)) + q = (ql | (qh << np.uint8(4))).astype(np.int8) - np.int8(32) + q = q.reshape((n_blocks, QK_K // 16, -1)).astype(np.float32) + + return (d * q).reshape((n_blocks, QK_K)) + + +class TQ1_0(__Quant, qtype=GGMLQuantizationType.TQ1_0): + @classmethod + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d = abs(blocks).max(axis=-1, keepdims=True) + with np.errstate(divide="ignore"): + id = np.where(d == 0, 0, 1 / d) + qs = np_roundf(blocks * id) + qs = (qs.astype(np.int8) + np.int8(1)).astype(np.uint8) + + qs0, qs1, qh = ( + qs[..., : (32 * 5)], + qs[..., (32 * 5) : (48 * 5)], + qs[..., (48 * 5) :], + ) + qs0 = qs0.reshape((n_blocks, -1, 5, 32)) * np.array( + [81, 27, 9, 3, 1], dtype=np.uint8 + ).reshape((1, 1, 5, 1)) + qs0 = np.sum(qs0, axis=-2).reshape((n_blocks, -1)) + qs1 = qs1.reshape((n_blocks, -1, 5, 16)) * np.array( + [81, 27, 9, 3, 1], dtype=np.uint8 + ).reshape((1, 1, 5, 1)) + qs1 = np.sum(qs1, axis=-2).reshape((n_blocks, -1)) + qh = qh.reshape((n_blocks, -1, 4, 4)) * np.array( + [81, 27, 9, 3], dtype=np.uint8 + ).reshape((1, 1, 4, 1)) + qh = np.sum(qh, axis=-2).reshape((n_blocks, -1)) + qs = np.concatenate([qs0, qs1, qh], axis=-1) + qs = (qs.astype(np.uint16) * 256 + (243 - 1)) // 243 + + qs = qs.astype(np.uint8) + d = d.astype(np.float16).view(np.uint8) + + return np.concatenate([qs, d], axis=-1) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + qs, rest = np.hsplit(blocks, [(QK_K - 4 * QK_K // 64) // 5]) + qh, d = np.hsplit(rest, [QK_K // 64]) + + d = d.view(np.float16).astype(np.float32) + + qs0, qs1 = qs[..., :32], qs[..., 32:] + qs0 = qs0.reshape((n_blocks, -1, 1, 32)) * np.array( + [1, 3, 9, 27, 81], dtype=np.uint8 + ).reshape((1, 1, 5, 1)) + qs0 = qs0.reshape((n_blocks, -1)) + qs1 = qs1.reshape((n_blocks, -1, 1, 16)) * np.array( + [1, 3, 9, 27, 81], dtype=np.uint8 + ).reshape((1, 1, 5, 1)) + qs1 = qs1.reshape((n_blocks, -1)) + qh = qh.reshape((n_blocks, -1, 1, 4)) * np.array( + [1, 3, 9, 27], dtype=np.uint8 + ).reshape((1, 1, 4, 1)) + qh = qh.reshape((n_blocks, -1)) + qs = np.concatenate([qs0, qs1, qh], axis=-1) + qs = ((qs.astype(np.uint16) * 3) >> 8).astype(np.int8) - np.int8(1) + + return d * qs.astype(np.float32) + + +class TQ2_0(__Quant, qtype=GGMLQuantizationType.TQ2_0): + @classmethod + def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d = abs(blocks).max(axis=-1, keepdims=True) + with np.errstate(divide="ignore"): + id = np.where(d == 0, 0, 1 / d) + qs = np_roundf(blocks * id) + qs = (qs.astype(np.int8) + np.int8(1)).astype(np.uint8) + + qs = qs.reshape((n_blocks, -1, 4, 32)) << np.array( + [0, 2, 4, 6], dtype=np.uint8 + ).reshape((1, 1, 4, 1)) + qs = qs[..., 0, :] | qs[..., 1, :] | qs[..., 2, :] | qs[..., 3, :] + qs = qs.reshape((n_blocks, -1)) + + d = d.astype(np.float16).view(np.uint8) + + return np.concatenate([qs, d], axis=-1) + + @classmethod + def 
dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + qs, d = np.hsplit(blocks, [QK_K // 4]) + + d = d.view(np.float16).astype(np.float32) + + qs = qs.reshape((n_blocks, -1, 1, 32)) >> np.array( + [0, 2, 4, 6], dtype=np.uint8 + ).reshape((1, 1, 4, 1)) + qs = (qs & 0x03).reshape((n_blocks, -1)).astype(np.int8) - np.int8(1) + + return d * qs.astype(np.float32) + + +class IQ2_XXS(__Quant, qtype=GGMLQuantizationType.IQ2_XXS): + ksigns: bytes = ( + b"\x00\x81\x82\x03\x84\x05\x06\x87\x88\x09\x0a\x8b\x0c\x8d\x8e\x0f" + b"\x90\x11\x12\x93\x14\x95\x96\x17\x18\x99\x9a\x1b\x9c\x1d\x1e\x9f" + b"\xa0\x21\x22\xa3\x24\xa5\xa6\x27\x28\xa9\xaa\x2b\xac\x2d\x2e\xaf" + b"\x30\xb1\xb2\x33\xb4\x35\x36\xb7\xb8\x39\x3a\xbb\x3c\xbd\xbe\x3f" + b"\xc0\x41\x42\xc3\x44\xc5\xc6\x47\x48\xc9\xca\x4b\xcc\x4d\x4e\xcf" + b"\x50\xd1\xd2\x53\xd4\x55\x56\xd7\xd8\x59\x5a\xdb\x5c\xdd\xde\x5f" + b"\x60\xe1\xe2\x63\xe4\x65\x66\xe7\xe8\x69\x6a\xeb\x6c\xed\xee\x6f" + b"\xf0\x71\x72\xf3\x74\xf5\xf6\x77\x78\xf9\xfa\x7b\xfc\x7d\x7e\xff" ) + # iq2xxs_grid, but with each byte of the original packed in 2 bits, + # by mapping 0x08 to 0, 0x19 to 1, and 0x2b to 2. + grid_shape = (256, 8) + grid_map = (0x08, 0x19, 0x2B) + grid_hex = ( + b"00000200050008000a00110014002000220028002a0041004400500058006100" + b"6400800082008a00a20001010401100115014001840198010002020222028202" + b"010404041004210424044004420448046004810484049004a404000502050805" + b"200546056905800591050906100640068406a406000805080808140828084108" + b"440850085208880804094009020a140a01100410101021104010601084109010" + b"951000110811201150115a118011241245120014081420142514491480141815" + b"6215001616160118041810184018811800190519a019511a002002200a204420" + b"6120802082202921482100220222012404241024402456240025412564259026" + b"082820289428442a014004401040184021402440404048405640604081408440" + b"9040004120416141804185410142104248425642684200440844204480449944" + b"124524450046014804481048404845480049584961498249454a904a00500850" + b"1150195020508050885004514251a4519152905492540a550156545600581158" + b"195864584059085a046010604060686000615561186260620064056410651265" + b"84654268008002800a8041808280048118814081118201840484108415844084" + b"608400854685948509864086608602880489118a0490109024904090a1901691" + b"8091459200942294449451958198209902a050a085a009a100a218a450a804a9" + ) -__quantize_q8_0_lazy = LazyNumpyTensor._wrap_fn( - __quantize_q8_0_array, - meta_noop=(np.uint8, __quantize_q8_0_shape_change), -) + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + d, qs = np.hsplit(blocks, [2]) -def quantize_q8_0(data: np.ndarray): - if type(data) is LazyNumpyTensor: - return __quantize_q8_0_lazy(data) - else: - return __quantize_q8_0_array(data) + d = d.view(np.float16).astype(np.float32) + + qs = qs.view(np.uint32).reshape(n_blocks, -1, 2) + + db = ( + d + * (np.float32(0.5) + (qs[..., 1] >> 28).astype(np.float32)) + * np.float32(0.25) + ) + db = db.reshape((n_blocks, -1, 1, 1)) + + # get the sign indices and unpack the bits + signs = qs[..., 1].reshape((n_blocks, -1, 1)) >> np.array( + [0, 7, 14, 21], dtype=np.uint32 + ).reshape((1, 1, 4)) + ksigns = np.frombuffer(cls.ksigns, dtype=np.uint8).reshape((1, 1, 1, 128)) + signs = (signs & np.uint32(0x7F)).reshape((n_blocks, -1, 4, 1)) + signs = np.take_along_axis(ksigns, signs, axis=-1) + signs = signs.reshape((n_blocks, -1, 4, 1)) >> np.array( + [i for i in range(8)], dtype=np.uint8 + ).reshape((1, 1, 1, 8)) + signs = signs & np.uint8(0x01) + 
signs = np.where(signs == 0, np.float32(1), np.float32(-1)) + signs = signs.reshape((n_blocks, -1, 4, 8)) + + assert cls.grid is not None + grid = np.take_along_axis( + cls.grid, + qs[..., 0].copy().view(np.uint8).reshape((n_blocks, -1, 1, 1)), + axis=-2, + ) + grid = grid.reshape((n_blocks, -1, 4, 8)) + + return (db * grid * signs).reshape((n_blocks, -1)) + + +class IQ2_XS(__Quant, qtype=GGMLQuantizationType.IQ2_XS): + # iq2xs_grid, but with each byte of the original packed in 2 bits, + # by mapping 0x08 to 0, 0x19 to 1, and 0x2b to 2. + grid_shape = (512, 8) + grid_map = (0x08, 0x19, 0x2B) + grid_hex = ( + b"00000200050008000a0011001400160019002000220025002800410044004600" + b"49005000520055005800610064008000820085008800910094009900a0000101" + b"04010601090110011201150118011a0121012401400142014501480151015401" + b"6001680181018401900100020202050208021102140220024102440250025502" + b"80028a0201040404060409041004120415041804210424044004420445044804" + b"5104540456046004810484049004000502050505080511051405200541054405" + b"500561058005010604061006260640064206840600080208050808080a081108" + b"14082008250841084408500858088008a008aa08010904091009400981098909" + b"000a200a280a960aa00a01100410061009101010121015101810211024104010" + b"4210451048105110541060106a10811084109010001102110511081111111411" + b"2011411144115011801194119611011204120612101240126012001402140514" + b"0814111414142014411444144914501464148014011504151015401500161416" + b"49160118041810181218401854188618001905196619511aa91a002002200520" + b"08200a201120142020204120442050208020a020012104211021402148216521" + b"002222228022a82201240424102429244024002541255225992501261a26a626" + b"002808280a28202855288828a22868299029082a202a822a882a8a2a01400440" + b"0640094010401240154018402140244040404240454048404a40514054406040" + b"6540814084409040004102410541084111411441204141414441504180418541" + b"a241014204421042124229424042004402440544084411441444194420444144" + b"4444504480449444014504451045244540459a4500460a464446504601480448" + b"1048404845485448624800491149444950496949044a00500250055008501150" + b"145020502850415044505050805001510451105115514051425100524452aa52" + b"0154045410542154405460548154a154005508558055885521566856a1560058" + b"14584158505899581a5940594259855a0160046010604060546062608660a960" + b"006124624a62926200641664106540654565a46501686a682569066a546a626a" + b"00800280058008801180148020802a8041804480508080808280a880aa800181" + b"0481068110814081518159810082208280828282a082a8820184048410841284" + b"158440846084898400854485a58518866a860088088825885a8880888288a888" + b"0689228a808a888a968aa88a0190049010904090569084900091229164915692" + b"89920094059444945094589429959095929541965198a6984999159a609a00a0" + b"02a008a00aa020a02aa0a0a051a159a1a6a100a202a208a22aa280a2a0a240a4" + b"95a465a698a60aa820a822a828a8a0a8a8a804a984a986a928aa2aaa91aaaaaa" + ) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + qs, scales = np.hsplit(rest, [2 * QK_K // 8]) + + d = d.view(np.float16).astype(np.float32) + qs = qs.view(np.uint16) + + scales = scales.reshape((n_blocks, -1, 1)) >> np.array( + [0, 4], dtype=np.uint8 + ).reshape((1, 1, 2)) + scales = (scales & 0x0F).reshape((n_blocks, -1)) + db = d * (np.float32(0.5) + scales) * np.float32(0.25) + db = db.reshape((n_blocks, -1, 1, 1)) + + # get the sign indices and unpack the bits + signs = np.frombuffer(IQ2_XXS.ksigns, dtype=np.uint8).reshape(1, 1, 128) + signs = np.take_along_axis(signs, (qs >> 
9).reshape((n_blocks, -1, 1)), axis=-1) + signs = signs.reshape((n_blocks, -1, 1)) >> np.array( + [i for i in range(8)], dtype=np.uint8 + ).reshape((1, 1, 8)) + signs = signs & np.uint8(0x01) + signs = np.where(signs == 0, np.float32(1), np.float32(-1)) + signs = signs.reshape((n_blocks, -1, 2, 8)) + + assert cls.grid is not None + grid = np.take_along_axis( + cls.grid, (qs & np.uint16(511)).reshape((n_blocks, -1, 1, 1)), axis=-2 + ) + grid = grid.reshape((n_blocks, -1, 2, 8)) + + return (db * grid * signs).reshape((n_blocks, -1)) + + +class IQ2_S(__Quant, qtype=GGMLQuantizationType.IQ2_S): + # iq2s_grid, but with each byte of the original packed in 2 bits, + # by mapping 0x08 to 0, 0x19 to 1, and 0x2b to 2. + grid_shape = (1024, 8) + grid_map = (0x08, 0x19, 0x2B) + grid_hex = ( + b"00000200050008000a0011001400160019002000220025002800410044004600" + b"490050005200550058006100640066006900800082008500880091009400a000" + b"a500aa0001010401060109011001120115011801210124014001420145014801" + b"510154015601590160016501680181018401900192019501a101a40100020202" + b"050208021102140220022a02410244024602490250025502800285028a029402" + b"a202010404040604090410041204150418042104240426042904400442044504" + b"48044a0451045404560459046004620465048104840486048904900495049804" + b"a104a40400050205050508050a05110514051605190520052505280541054405" + b"46054905500552055505580561056405800582058505880591059405a0050106" + b"0406060609061006150640064506480651065406600681068406900600080208" + b"050808081108140816081908200825082a084108440846084908500852085508" + b"580861086408800885089408aa08010904091009120915091809210940094509" + b"480951095409600981099009000a110a140a220a280a2a0a500a990a01100410" + b"0610091010101210151018102110241026104010421045104810511054105610" + b"59106010621065106810811084108610901095109810a110a410001102110511" + b"08110a1111111411161119112011221125112811411144114611491150115211" + b"5511581161116411801182118511881191119411011204120912101215122112" + b"2412401245125112541281128412901200140214051408141114141416141914" + b"2014251428144114441446144914501452145514581461146414801482148514" + b"881491149414a014011504150615091510151215151518152115241540154215" + b"4515481551155415601581158415901500160516081611161416201641164416" + b"50168016aa160118041806180918101815181818211840184218451848185118" + b"541860188118841800190219051908191119141920194119441950196919a219" + b"041a101a401a561a00200220052008201120142016201920202025202a204120" + b"4420502052205520642080208a209420aa200121042110211221152121214021" + b"4221452151215421602181218421902100220a22222228222a22442250228822" + b"8a22a82201240424062409241024152418242124242440244224452448245124" + b"5424602481248424902400250525082511251425202541254425502566258025" + b"0126042610264026592600280528112814284128442850288a28aa2801290429" + b"102995290a2a222a642a882a8a2a014004400640094010401240154018401a40" + b"21402440264040404240454048404a4051405440564059406040624065408140" + b"8440904095409840a140a4400041024105410841114114411641194120412241" + b"2541414144414641494150415241554158416141644180418241854188419141" + b"9441a04101420442104212421542184224424042454248425142544260428142" + b"844200440244054408440a441144144416441944204422442544284441444444" + b"46444944504452445544584461446444804482448544884491449444a0440145" + b"0445064509451045124515451845214524454045424545454845514554456045" + b"6a4581458445904500460246054608461146144620464146444650468046a546" + b"0148044809481048124815481848214824484048424845484848514854486048" + 
b"84489048004902490549084911491449204941494449504980499649014a044a" + b"104a404a00500250055008501150145016501950205022502550285041504450" + b"4650495050505250555058506150645080508250855088509150945001510451" + b"0651095110511251155118512151245140514251455148515151545160518151" + b"8451905100520552085211521452205241524452505269528052015404540654" + b"0954105412541554185421542454405442544554485451545454605481548454" + b"9054005502550555085511551455205541554455505580550156045610562656" + b"405600580258055808581158145820584158445850585a588058015904591059" + b"4059005a195a855aa85a01600460066010601260156018602160246040604560" + b"4860516054606060846090600061026105610861116114612061416144615061" + b"806199610462106240625662a162006405640864116414642064416444645064" + b"806401650465106540654a656865926500669466016804681068656898680069" + b"2a69426aa16a0080028005800880118014801980208025804180448050805280" + b"5580588061808080858091809480018104810981108112811581188121812481" + b"408142814581488151815481818184819081a981008205820a82118214824182" + b"4482508201840484068409841084128415841884218440844284458448845184" + b"5484608481848484908400850285058508851185148520854185448550858085" + b"8a85018604861086298640860088058811881488418844885088a28801890489" + b"40896589228a588a5a8a828aa28a019004900990109012901590189024904090" + b"4290459048905190549060908190849090900091059111911491419144915091" + b"5a910192049210924092a6920094029405940894119414942094419444945094" + b"8094969401950495109540959895a19500964696649601980498109826984098" + b"a998009949995299909a00a005a00aa014a022a02aa041a044a050a0a2a0aaa0" + b"40a165a102a20aa222a228a22aa282a288a28aa2a8a201a404a410a440a489a4" + b"a4a400a519a551a60aa828a8a2a854a986a908aa0aaa20aa22aa28aa88aaaaaa" + ) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + qs, rest = np.hsplit(rest, [QK_K // 8]) + signs, rest = np.hsplit(rest, [QK_K // 8]) + qh, scales = np.hsplit(rest, [QK_K // 32]) + + d = d.view(np.float16).astype(np.float32) + + scales = scales.reshape((n_blocks, -1, 1)) >> np.array( + [0, 4], dtype=np.uint8 + ).reshape((1, 1, 2)) + scales = (scales & 0x0F).reshape((n_blocks, -1)) + db = d * (np.float32(0.5) + scales) * np.float32(0.25) + db = db.reshape((n_blocks, -1, 1, 1)) + + # unpack the sign bits + signs = signs.reshape((n_blocks, -1, 1)) >> np.array( + [i for i in range(8)], dtype=np.uint8 + ).reshape((1, 1, 8)) + signs = signs & np.uint8(0x01) + signs = np.where(signs == 0, np.float32(1), np.float32(-1)) + signs = signs.reshape((n_blocks, -1, 2, 8)) + + qh = qh.reshape((n_blocks, -1, 1)) >> np.array( + [0, 2, 4, 6], dtype=np.uint8 + ).reshape((1, 1, 4)) + qs = qs.astype(np.uint16) | ((qh & 0x03).astype(np.uint16) << 8).reshape( + (n_blocks, -1) + ) + + assert cls.grid is not None + grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2) + grid = grid.reshape((n_blocks, -1, 2, 8)) + + return (db * grid * signs).reshape((n_blocks, -1)) + + +class IQ3_XXS(__Quant, qtype=GGMLQuantizationType.IQ3_XXS): + grid_shape = (256, 4) + grid_map = (0x04, 0x0C, 0x14, 0x1C, 0x24, 0x2C, 0x34, 0x3E) + grid_hex = ( + b"0000020004001100130017002000220031004200730075000101030110011201" + b"2101250130013201410154017001000202020402110220022202310233023702" + b"5102570275020103070310031203250370031304370444045704730475040105" + b"0705320552053506640610071407160743076107011003101010121021102310" + b"3010321034104710501000110211111120112211011203121012121221123012" + 
b"7212001302132013311346136613011405145014201524154615711505162217" + b"4017002002201120132020202220262031204220012103210521102112212121" + b"3021632167217021002202221122172220222222372240225522012310231423" + b"7023742335245324032527254125742501270327162745270130103012302130" + b"2330503065307230003102312031313144314631013203321032253252327232" + b"1133333330344734723400350635223555351436363663363337603704401740" + b"3540374053405740744120423742404260426642074345430444514464442545" + b"4345704505471047124730471250415070500051065126515551145232527252" + b"0253535310542354275472540255315550562457425724604460466064602161" + b"6161176264623063366344640565526533660367216703700570077010703270" + b"5270267140711272457252720073157333736073217441740075027524753076" + ) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + qs, scales = np.hsplit(rest, [QK_K // 4]) + + d = d.view(np.float16).astype(np.float32) + scales = scales.view(np.uint32) + + db = d * (np.float32(0.5) + (scales >> 28).astype(np.float32)) * np.float32(0.5) + db = db.reshape((n_blocks, -1, 1, 1)) + + # get the sign indices and unpack the bits + signs = scales.reshape((n_blocks, -1, 1)) >> np.array( + [0, 7, 14, 21], dtype=np.uint32 + ).reshape((1, 1, 4)) + ksigns = np.frombuffer(IQ2_XXS.ksigns, dtype=np.uint8).reshape((1, 1, 1, 128)) + signs = (signs & np.uint32(0x7F)).reshape((n_blocks, -1, 4, 1)) + signs = np.take_along_axis(ksigns, signs, axis=-1) + signs = signs.reshape((n_blocks, -1, 4, 1)) >> np.array( + [i for i in range(8)], dtype=np.uint8 + ).reshape((1, 1, 1, 8)) + signs = signs & np.uint8(0x01) + signs = np.where(signs == 0, np.float32(1), np.float32(-1)) + signs = signs.reshape((n_blocks, -1, 4, 8)) + + assert cls.grid is not None + grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2) + grid = grid.reshape((n_blocks, -1, 4, 8)) + + return (db * grid * signs).reshape((n_blocks, -1)) + + +class IQ3_S(__Quant, qtype=GGMLQuantizationType.IQ3_S): + grid_shape = (512, 4) + grid_map = (0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F) + grid_hex = ( + b"0000010002000500070010001100120014001600200021002500330040004200" + b"4500470051005300600062007100740077000001010102010401100111011501" + b"2001230127013101350144016101650172010002010205020702100213021602" + b"2102250230023402420245024702510253027002730203031103150320032203" + b"3103330336034403500352036703710375030004130417042104240432044004" + b"4304510470040205040520052205260533054105450547056605730506061106" + b"1306310652067106000702070407200722072607330750075407001001100210" + b"0410101011101310151017102010221031103410361054105610611072100011" + b"0111031106111011141121113011331141115011521170117611001212121512" + b"1712201224123212401243125512601272120113041307131013131321132713" + b"3013341341136213701303140514121414143114331442144614501454140115" + b"1015131521153015321551152016241627164416461601170317101712172117" + b"3517411762177017002001200320052007201020122014201620212023202720" + b"3020322041204320452050205220672070207320752000210221102113211721" + b"2221252131213421422151210122042207222122232230223722412253225722" + b"7122742200230223052311232223242331233323422350236623012407242024" + b"2324322435244124722475240425112522253725402553257025002602260726" + b"2126552661260527112726273027432750270230113013301530173022303130" + b"3330353042304430473051306330713001310331053114312131233140316031" + 
b"7231763100321232203232323432503201331033143321332333273330334133" + b"4333473355337333033411341634223431345234603464340135103512352535" + b"3235443556357335163641360137033720372237353700400440124020402440" + b"2740324041405040704002410741114113412241304135414341514155410142" + b"0342104215422142334240425742624270420443114313432043224331433543" + b"0044024424443744404471440545074521456245134634466046104715473047" + b"4347514702501050145022504050445047505250665074500151035105511251" + b"2151325172510052115223523052365253520253075310532753445351536553" + b"7353015404542054325446541255265551555355425602570457225711601360" + b"1560316033606060006120612761646112623462426255626262706200631463" + b"2163406325644364626400650365346560650566406611671367007004700770" + b"2070227036704070547062700271117124714371457101720472107216722172" + b"3072517202733273357353730174057413742074507422754275027631760077" + ) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + qs, rest = np.hsplit(rest, [QK_K // 4]) + qh, rest = np.hsplit(rest, [QK_K // 32]) + signs, scales = np.hsplit(rest, [QK_K // 8]) + + d = d.view(np.float16).astype(np.float32) + + scales = scales.reshape((n_blocks, -1, 1)) >> np.array( + [0, 4], dtype=np.uint8 + ).reshape((1, 1, 2)) + scales = (scales & 0x0F).reshape((n_blocks, -1)) + db = d * (1 + 2 * scales) + db = db.reshape((n_blocks, -1, 1, 1)) + + # unpack the sign bits + signs = signs.reshape((n_blocks, -1, 1)) >> np.array( + [i for i in range(8)], dtype=np.uint8 + ).reshape((1, 1, 8)) + signs = signs & np.uint8(0x01) + signs = np.where(signs == 0, np.float32(1), np.float32(-1)) + signs = signs.reshape((n_blocks, -1, 4, 8)) + + qh = qh.reshape((n_blocks, -1, 1)) >> np.array( + [i for i in range(8)], dtype=np.uint8 + ) + qh = (qh & 0x01).astype(np.uint16).reshape((n_blocks, -1)) + qs = qs.astype(np.uint16) | (qh << 8) + + assert cls.grid is not None + grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2) + grid = grid.reshape((n_blocks, -1, 4, 8)) + + return (db * grid * signs).reshape((n_blocks, -1)) + + +class IQ1_S(__Quant, qtype=GGMLQuantizationType.IQ1_S): + # iq1s_grid, with each byte packed into 2 bits + # -1, 0, 1 <=> 0, 1, 2 + grid_shape = (2048, 8) + grid_map = (-1, 0, 1) + grid_hex = ( + b"00000200050008000a00110015002000220028002a0045005100540056006500" + b"8000820088008a009500a000a200a800aa000401050111011401160119011a01" + b"2501410146014901520155015a0161016401660168018501910194019601a501" + b"0002020208020a0215022002220228022a024502510259026402690280028202" + b"88028a02910295029902a002a202a802aa021104140416042504410449045504" + b"5a046404650491049904a5040105040505050605150518051a05290540054505" + b"4a0550055105540555055605590560056205650568056a058105910595059805" + b"9a05a105a405a505a605a9051406190641064406500652065506580660066106" + b"6606690685069106940699060008020808080a0815082008220828082a084508" + b"5108560865088008820888088a089508a008a208a808aa080509110914091909" + b"2409250941095009510955096109640969099109940996099909a509000a020a" + b"080a0a0a150a200a220a280a2a0a450a510a590a610a650a800a820a850a880a" + b"8a0a950aa00aa20aa80aaa0a1010111014101910241025104110441050105510" + b"58106110641065106910911094109610a110a510011104110611091110111211" + b"1511181121112411291145114a11501151115211541155115611591160116511" + b"841192119511a111a41111121412161225124012461249125212551258125a12" + b"641266128512911294129612a512011406140914141415141814191421142614" + 
b"41144514461448144a1451145414551456145914621465146814841489149014" + b"94149514981499149a14a114a414a514a914021505150a151115141515151615" + b"191520152215251528152a154115441545154615511552155415551556155915" + b"5a1561156415651566156915801582158415851588158a159015911594159515" + b"961599159a15a015a215a51501160416051606161516161618161a1621162616" + b"401642164416451648164a165116551656165816591661166416651668166916" + b"6a1686168a1692169516a416a916111816182518411844184618491850185518" + b"58185a1860186118641866186918851891189418a5181019121915191a192119" + b"25194219441945194819511954195519561959195a19601965196a1989199119" + b"921995199819a119a619a919091a161a241a261a441a461a491a501a521a551a" + b"581a611a661a691a851a911a961a9a1a0020022008200a201520202022202520" + b"28202a20452051205920612065208020822088208a209520a020a220a520a820" + b"aa2005211121142119212521422144214921552158215a216121642165216621" + b"8521902196219921a521012208220a22112215222022222228222a2245225122" + b"562259226522812288228a2291229522a022a222a822aa220524142416241924" + b"252444244524462449245224552458245a2466248524912494249924a124a524" + b"0925152521252925402545254825512554255525592562256525682589259025" + b"9425952598259a25a125a425a625a92505261026122619262526412649265526" + b"6026612669268426862690269a260028022808280a2815282028222828282a28" + b"45285128542865288028822888288a28a028a228a828aa280929112914291929" + b"2529462949295229552961296429662969298529902996299929a429a529002a" + b"022a082a0a2a202a222a282a2a2a452a512a562a592a652a802a822a882a8a2a" + b"952aa02aa22aa82aaa2a054011401640254049405240554058405a4061406440" + b"664094409940a140a6400041014104410641094112411541164118411a412141" + b"26412941454148414a41514154415541564159415a41654168416a4181418441" + b"8641904192419541a041a141a241054211421442164225424142524255425a42" + b"6442694289429442a5420144154419442944454448444a445144544455445644" + b"61446244654468446a44814486448944904492449544a044a144a94401450245" + b"05450a4511451445154516451945204525452a45414544454545464549455045" + b"5145544555455645584559456145644565456645694582458445854588459145" + b"94459545964599459a45a545a845aa450146054609461446154618461a462146" + b"2446294640464246454648465046514652465546564659466246654668468146" + b"85468a4694469546a146a446a6460548114815481a4825484248494850485548" + b"5848614864486648694885489148944896489948a5480149054906490a491049" + b"144915491849214924492649404945494a495149524954495549564959496049" + b"6249654966496a49864989499249954996499849a149a449a649a949164a444a" + b"464a494a554a584a5a4a644a694a944aa54a0150045005500650095012501550" + b"1a50215024502950405045504850515054505550565059506550685086508950" + b"95509850a050a150a650a9500551085109510a51115114511551165118511951" + b"20512551265128512a5141514451455146514951505151515251545155515651" + b"585159515a51615164516551665169518251855191519451955196519951a051" + b"a551aa5101520652125215521a5221522452425245524a525152545255525652" + b"595262526552855290529252955299529a52a452045405541154145415541654" + b"185419542154255428542a54415444544554465449544a545054515454545554" + b"5654585459545a54615462546454655466546954805488548a54915494549554" + b"96549954a154a454a554aa540155025504550555065509551055115512551455" + b"1555165519551a55215524552555265529554055415542554455455546554855" + b"4955505551555255545555555655585559555a55605561556455655566556855" + b"69556a5581558455855589558a559055915594559555965598559955a155a455" + b"a555a655a9550056015602560456065608560956115614561556185619562056" + 
b"2156225624562556265628562956415645564656485649564a56505651565256" + b"545655565656585659565a566156645665566956825685568656885689568a56" + b"915695569a56a256a556a656a856a95604580558065809581058155818582158" + b"2a58455848584a58515854585558565858585958605862586458655882588958" + b"9058925895589858a158a9580159025905590a59115914591559165919592559" + b"41594459455946594959505951595259545955595659585959595a5961596459" + b"655966596959815985598959915994599559965998599959a559045a085a155a" + b"1a5a205a255a265a295a455a485a495a515a555a565a585a595a625a655a685a" + b"6a5a815a8a5a925a955a965a985a9a5aa15a0560146016601960256044605060" + b"5560566058605a60616064606660696081609660a56001610461066109611261" + b"15612161226126612961456149615161556156615961656166616a6184618a61" + b"92619561a161a661a96111621662196240624162466255625662586260628562" + b"91629662a56211641264156416641a6421642664296440644264456448644a64" + b"516454645564566459645a646064626465648464856489649064926494649564" + b"966498649a64a164a464a964056508650a651165156516651965446545654665" + b"496550655165546555655665596561656465656566656965866589658a659165" + b"9565966599659a65a265a565a665a86502660966156620662666286629664066" + b"456648664a66516654665566566658665a666066656668668066826685668a66" + b"9466966698669966a066a466a666aa661668196825684168526855685a686168" + b"6968856891689868a66801690469106915692169246926692969406941694569" + b"4669486951695469556956695969606965696a69826984698a699569a169a469" + b"a569a969116a166a186a416a446a496a506a556a586a5a6a646a656a696a866a" + b"946a986a9a6aa66a0080028008800a802080228028802a804580508051805480" + b"5680598065808080828088808a809580a080a280a880aa800581118114811681" + b"1981258141814481498150815281558156815881598164816681698185818981" + b"948196819981a5810082028208820a8215822082228228822a82518254825982" + b"65828082828288828a829582a082a282a882aa82148419844184448451845584" + b"5a846184648469849484998401850985128515851a8526852985408541854585" + b"4885518554855585568559855a856585668568856a8581858485868589859085" + b"928595859885a68511861686198625864186448649864a865086558659865a86" + b"618666866a86858691869a86a4860088028808880a8815882088228828882a88" + b"41884588518854885988658869888088828888888a889588a088a288a888aa88" + b"05890689118914891689258941894489468949895089528955895a8961896489" + b"858996899989a589008a028a088a0a8a158a208a228a288a2a8a458a518a548a" + b"568a808a828a888a8a8a958aa08aa28aa88aaa8a059011901690189019902590" + b"419046904990559058905a9069906a9085909190949096909990a59001910491" + b"069109911091159118911a912191249126912991409145915091519154915591" + b"569159916291659184918691929195919891a191a491a691a991059211921492" + b"19922592449246924992509252925592589266926992859294929692a9920194" + b"04940694109415941894269440944a9451945494559456945894599460946194" + b"62946594849486949294949495949894a194a9940095059508950a9510951195" + b"14951595169519952195259529952a9541954495459546954995509551955295" + b"549555955695589559955a956195649565956695699581958595889591959295" + b"94959595969599959a95a095a295a595a895aa95019604961096159619962096" + b"2696299645964896499651965296559656965996659668968296849689968a96" + b"929694969596a496a696a9960598169819982598419846985098529855985698" + b"5a98649865988598919896989998a59804990699099910991299159918991a99" + b"209921992499269940994299459948994a995199549955995699599962996599" + b"66996a99819984999099929995999a99a199a699059a159a259a449a469a499a" + b"509a559a589a619a859a919a949a959a969a00a002a008a00aa015a020a022a0" + 
b"28a02aa045a051a054a056a059a080a082a088a08aa095a0a0a0a2a0a8a0aaa0" + b"05a109a111a114a116a119a11aa146a149a151a155a158a15aa161a164a185a1" + b"90a192a196a199a102a208a20aa210a219a222a228a22aa245a251a256a259a2" + b"65a280a282a288a28aa295a2a0a2a2a2a8a2aaa219a425a441a444a450a454a4" + b"55a458a45aa461a465a466a468a469a485a406a509a510a512a515a518a526a5" + b"29a542a545a551a554a555a556a559a565a56aa581a584a585a586a589a592a5" + b"95a598a505a611a616a61aa621a625a644a646a64aa652a655a656a658a660a6" + b"62a686a690a695a696a699a6a1a6a4a6a6a600a802a808a80aa820a822a828a8" + b"2aa851a854a856a859a880a882a888a88aa895a8a0a8a2a8a8a8aaa805a914a9" + b"19a921a925a941a950a955a95aa961a966a969a990a996a900aa02aa08aa0aaa" + b"20aa22aa28aa2aaa51aa54aa56aa80aa82aa88aa8aaa95aaa0aaa2aaa8aaaaaa" + ) + + delta = np.float32(0.125) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + qs, qh = np.hsplit(rest, [QK_K // 8]) + + d = d.view(np.float16).astype(np.float32) + qh = qh.view(np.uint16) + + dl = d * (2 * ((qh >> 12) & 7) + 1) + dl = dl.reshape((n_blocks, -1, 1, 1)) + delta = np.where((qh & np.uint16(0x8000)) == 0, cls.delta, -cls.delta) + delta = delta.reshape((n_blocks, -1, 1, 1)) + + qh = qh.reshape((n_blocks, -1, 1)) >> np.array( + [0, 3, 6, 9], dtype=np.uint16 + ).reshape((1, 1, 4)) + qs = qs.astype(np.uint16) | ((qh & 7) << 8).reshape((n_blocks, -1)) + + assert cls.grid is not None + grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2) + grid = grid.reshape((n_blocks, -1, 4, 8)) + + return (dl * (grid + delta)).reshape((n_blocks, -1)) + + +class IQ1_M(__Quant, qtype=GGMLQuantizationType.IQ1_M): + grid_shape = IQ1_S.grid_shape + grid_map = IQ1_S.grid_map + grid_hex = IQ1_S.grid_hex + + delta = IQ1_S.delta + + # Okay *this* type is weird. It's the only one which stores the f16 scales in multiple parts. 
+ @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + qs, rest = np.hsplit(blocks, [QK_K // 8]) + qh, scales = np.hsplit(rest, [QK_K // 16]) + + # The f16 scale is packed across multiple bytes + scales = scales.view(np.uint16) + d = (scales.reshape((n_blocks, 4)) & np.uint16(0xF000)) >> np.array( + [12, 8, 4, 0], dtype=np.uint16 + ).reshape((1, 4)) + d = d[..., 0] | d[..., 1] | d[..., 2] | d[..., 3] + d = d.view(np.float16).astype(np.float32).reshape((n_blocks, 1)) + + scales = scales.reshape(n_blocks, -1, 1) >> np.array( + [0, 3, 6, 9], dtype=np.uint16 + ).reshape((1, 1, 4)) + scales = (scales & 0x07).reshape((n_blocks, -1)) + dl = d * (2 * scales + 1) + dl = dl.reshape((n_blocks, -1, 2, 1, 1)) + + qh = qh.reshape((n_blocks, -1, 1)) >> np.array([0, 4], dtype=np.uint8).reshape( + (1, 1, 2) + ) + qs = qs.astype(np.uint16) | ((qh & 0x07).astype(np.uint16) << 8).reshape( + (n_blocks, -1) + ) + + delta = np.where(qh & 0x08 == 0, cls.delta, -cls.delta) + delta = delta.reshape((n_blocks, -1, 2, 2, 1)) + + assert cls.grid is not None + grid = np.take_along_axis(cls.grid, qs.reshape((n_blocks, -1, 1, 1)), axis=-2) + grid = grid.reshape((n_blocks, -1, 2, 2, 8)) + + return (dl * (grid + delta)).reshape((n_blocks, -1)) + + +class IQ4_NL(__Quant, qtype=GGMLQuantizationType.IQ4_NL): + kvalues = (-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113) + + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, qs = np.hsplit(blocks, [2]) + + d = d.view(np.float16).astype(np.float32) + + qs = qs.reshape((n_blocks, -1, 1, cls.block_size // 2)) >> np.array( + [0, 4], dtype=np.uint8 + ).reshape((1, 1, 2, 1)) + + qs = (qs & np.uint8(0x0F)).reshape((n_blocks, -1, 1)) + + kvalues = np.array(cls.kvalues, dtype=np.int8).reshape(1, 1, 16) + qs = ( + np.take_along_axis(kvalues, qs, axis=-1) + .astype(np.float32) + .reshape((n_blocks, -1)) + ) + + return d * qs + + +class IQ4_XS(__Quant, qtype=GGMLQuantizationType.IQ4_XS): + @classmethod + def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray: + n_blocks = blocks.shape[0] + + d, rest = np.hsplit(blocks, [2]) + scales_h, rest = np.hsplit(rest, [2]) + scales_l, qs = np.hsplit(rest, [QK_K // 64]) + + d = d.view(np.float16).astype(np.float32) + scales_h = scales_h.view(np.uint16) + + scales_l = scales_l.reshape((n_blocks, -1, 1)) >> np.array( + [0, 4], dtype=np.uint8 + ).reshape((1, 1, 2)) + scales_h = scales_h.reshape((n_blocks, 1, -1)) >> np.array( + [2 * i for i in range(QK_K // 32)], dtype=np.uint16 + ).reshape((1, -1, 1)) + scales_l = scales_l.reshape((n_blocks, -1)) & np.uint8(0x0F) + scales_h = scales_h.reshape((n_blocks, -1)).astype(np.uint8) & np.uint8(0x03) + + scales = (scales_l | (scales_h << np.uint8(4))).astype(np.int8) - np.int8(32) + dl = (d * scales.astype(np.float32)).reshape((n_blocks, -1, 1)) + + qs = qs.reshape((n_blocks, -1, 1, 16)) >> np.array( + [0, 4], dtype=np.uint8 + ).reshape((1, 1, 2, 1)) + qs = qs.reshape((n_blocks, -1, 32, 1)) & np.uint8(0x0F) + + kvalues = np.array(IQ4_NL.kvalues, dtype=np.int8).reshape((1, 1, 1, -1)) + qs = ( + np.take_along_axis(kvalues, qs, axis=-1) + .astype(np.float32) + .reshape((n_blocks, -1, 32)) + ) + + return (dl * qs).reshape((n_blocks, -1)) diff --git a/src/gguf/tensor_mapping.py b/src/gguf/tensor_mapping.py index 3161173..3de2a6f 100644 --- a/src/gguf/tensor_mapping.py +++ b/src/gguf/tensor_mapping.py @@ -7,463 +7,574 @@ class TensorNameMap: mappings_cfg: 
dict[MODEL_TENSOR, tuple[str, ...]] = { + # Token embeddings MODEL_TENSOR.TOKEN_EMBD: ( - "gpt_neox.embed_in", - "transformer.wte", - "transformer.word_embeddings", - "word_embeddings", - "model.embed_tokens", - "tok_embeddings", - "embeddings.word_embeddings", - "language_model.embedding.word_embeddings", - "wte", - "transformer.embd.wte", - "model.tok_embeddings", - "model.embedding", - "backbone.embedding", - "backbone.embeddings", - "transformer.in_out_embed", - "embedding.word_embeddings", - "transformer.token_embeddings", - "shared", - "rwkv.embeddings", - ), - MODEL_TENSOR.TOKEN_TYPES: ("embeddings.token_type_embeddings",), + "gpt_neox.embed_in", # gptneox + "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais exaone + "transformer.word_embeddings", # falcon + "word_embeddings", # bloom + "model.embed_tokens", # llama-hf nemotron olmoe olmo2 + "tok_embeddings", # llama-pth + "embeddings.word_embeddings", # bert nomic-bert + "language_model.embedding.word_embeddings", # persimmon + "wte", # gpt2 + "transformer.embd.wte", # phi2 + "model.tok_embeddings", # internlm2 + "model.embedding", # mamba-qbert + "backbone.embedding", # mamba + "backbone.embeddings", # mamba-hf + "transformer.in_out_embed", # Grok + "embedding.word_embeddings", # chatglm + "transformer.token_embeddings", # openelm + "shared", # t5 + "rwkv.embeddings", # rwkv + ), + # Token type embeddings + MODEL_TENSOR.TOKEN_TYPES: ( + "embeddings.token_type_embeddings", # bert nomic-bert + ), + # Normalization of token embeddings MODEL_TENSOR.TOKEN_EMBD_NORM: ( - "word_embeddings_layernorm", - "embeddings.LayerNorm", - "emb_ln", - "transformer.norm", - "rwkv.blocks.0.pre_ln", - ), + "word_embeddings_layernorm", # bloom + "embeddings.LayerNorm", # bert + "emb_ln", # nomic-bert + "transformer.norm", # openelm + "rwkv.blocks.0.pre_ln", # rwkv + "backbone.norm", # wavtokenizer + ), + # Position embeddings MODEL_TENSOR.POS_EMBD: ( - "transformer.wpe", - "embeddings.position_embeddings", - "wpe", + "transformer.wpe", # gpt2 + "embeddings.position_embeddings", # bert + "wpe", # gpt2 ), + # Output MODEL_TENSOR.OUTPUT: ( - "embed_out", - "lm_head", - "output", - "word_embeddings_for_head", - "lm_head.linear", - "output_layer", - "head", - ), + "embed_out", # gptneox + "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2 + "output", # llama-pth bloom internlm2 + "word_embeddings_for_head", # persimmon + "lm_head.linear", # phi2 + "output_layer", # chatglm + "head", # rwkv + "head.out", # wavtokenizer + ), + # Output norm MODEL_TENSOR.OUTPUT_NORM: ( - "gpt_neox.final_layer_norm", - "transformer.ln_f", - "model.norm", - "norm", - "transformer.norm_f", - "ln_f", - "language_model.encoder.final_layernorm", - "model.final_layernorm", - "lm_head.ln", - "model.norm_f", - "backbone.norm_f", - "transformer.rms_norm", - "encoder.final_layernorm", - "transformer.norm", - "model.norm", - "rwkv.ln_out", - ), + "gpt_neox.final_layer_norm", # gptneox + "transformer.ln_f", # gpt2 gpt-j falcon jais exaone + "model.norm", # llama-hf baichuan internlm2 olmoe olmo2 + "norm", # llama-pth + "transformer.norm_f", # mpt dbrx + "ln_f", # refact bloom qwen gpt2 + "language_model.encoder.final_layernorm", # persimmon + "model.final_layernorm", # persimmon + "lm_head.ln", # phi2 + "model.norm_f", # mamba-qbert + "backbone.norm_f", # mamba + "transformer.rms_norm", # Grok + "encoder.final_layernorm", # chatglm + "transformer.norm", # openelm + "model.norm", # nemotron + "rwkv.ln_out", # rwkv + 
"backbone.final_layer_norm", # wavtokenizer + ), + # Rope frequencies MODEL_TENSOR.ROPE_FREQS: ( - "rope.freqs", - "rotary_pos_emb.inv_freq", + "rope.freqs", # llama-pth + "rotary_pos_emb.inv_freq", # chatglm ), + MODEL_TENSOR.ROPE_FACTORS_LONG: (), + MODEL_TENSOR.ROPE_FACTORS_SHORT: (), + MODEL_TENSOR.CONV1D: ("backbone.embed",), # roberta } block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { + # Attention norm MODEL_TENSOR.ATTN_NORM: ( - "gpt_neox.layers.{bid}.input_layernorm", - "transformer.h.{bid}.ln_1", - "transformer.blocks.{bid}.norm_1", - "transformer.h.{bid}.input_layernorm", - "h.{bid}.input_layernorm", - "transformer.h.{bid}.ln_mlp", - "model.layers.{bid}.input_layernorm", - "layers.{bid}.attention_norm", - "language_model.encoder.layers.{bid}.input_layernorm", - "model.layers.{bid}.ln1", - "h.{bid}.ln_1", - "transformer.h.{bid}.ln", - "model.layers.layers.{bid}.norm", - "model.layers.{bid}.attention_norm", - "model.layers.{bid}.norm", - "backbone.layers.{bid}.norm", - "transformer.decoder_layer.{bid}.rms_norm", - "transformer.blocks.{bid}.norm_attn_norm.norm_1", - "encoder.layers.{bid}.input_layernorm", - "transformer.layers.{bid}.attn_norm", - "rwkv.blocks.{bid}.ln1", - ), + "gpt_neox.layers.{bid}.input_layernorm", # gptneox + "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen jais exaone + "transformer.blocks.{bid}.norm_1", # mpt + "transformer.h.{bid}.input_layernorm", # falcon7b + "h.{bid}.input_layernorm", # bloom + "transformer.h.{bid}.ln_mlp", # falcon40b + "model.layers.{bid}.input_layernorm", # llama-hf nemotron olmoe + "layers.{bid}.attention_norm", # llama-pth + "language_model.encoder.layers.{bid}.input_layernorm", # persimmon + "model.layers.{bid}.ln1", # yi + "h.{bid}.ln_1", # gpt2 + "transformer.h.{bid}.ln", # phi2 + "model.layers.layers.{bid}.norm", # plamo + "model.layers.{bid}.attention_norm", # internlm2 + "model.layers.{bid}.norm", # mamba-qbert + "backbone.layers.{bid}.norm", # mamba + "transformer.decoder_layer.{bid}.rms_norm", # Grok + "transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx + "encoder.layers.{bid}.input_layernorm", # chatglm + "transformer.layers.{bid}.attn_norm", # openelm + "rwkv.blocks.{bid}.ln1", # rwkv + ), + # Attention norm 2 MODEL_TENSOR.ATTN_NORM_2: ( - "transformer.h.{bid}.ln_attn", - "encoder.layer.{bid}.layer_norm_1", - "rwkv.blocks.{bid}.ln2", + "transformer.h.{bid}.ln_attn", # falcon40b + "encoder.layer.{bid}.layer_norm_1", # jina-v2-code + "rwkv.blocks.{bid}.ln2", # rwkv ), + # Attention query-key-value MODEL_TENSOR.ATTN_QKV: ( - "gpt_neox.layers.{bid}.attention.query_key_value", - "transformer.h.{bid}.attn.c_attn", - "transformer.blocks.{bid}.attn.Wqkv", - "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv", - "transformer.h.{bid}.self_attention.query_key_value", - "h.{bid}.self_attention.query_key_value", - "language_model.encoder.layers.{bid}.self_attention.query_key_value", - "model.layers.{bid}.self_attn.query_key_value", - "h.{bid}.attn.c_attn", - "transformer.h.{bid}.mixer.Wqkv", - "encoder.layers.{bid}.attn.Wqkv", - "model.layers.{bid}.self_attn.qkv_proj", - "encoder.layers.{bid}.self_attention.query_key_value", - "transformer.layers.{bid}.attn.qkv_proj", - ), + "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox + "transformer.h.{bid}.attn.c_attn", # gpt2 qwen jais + "transformer.blocks.{bid}.attn.Wqkv", # mpt + "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv", # dbrx + "transformer.h.{bid}.self_attention.query_key_value", # falcon + "h.{bid}.self_attention.query_key_value", # bloom + 
"language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon + "model.layers.{bid}.self_attn.query_key_value", # persimmon + "h.{bid}.attn.c_attn", # gpt2 + "transformer.h.{bid}.mixer.Wqkv", # phi2 + "encoder.layers.{bid}.attn.Wqkv", # nomic-bert + "model.layers.{bid}.self_attn.qkv_proj", # phi3 + "encoder.layers.{bid}.self_attention.query_key_value", # chatglm + "transformer.layers.{bid}.attn.qkv_proj", # openelm + ), + # Attention query MODEL_TENSOR.ATTN_Q: ( - "model.layers.{bid}.self_attn.q_proj", - "layers.{bid}.attention.wq", - "encoder.layer.{bid}.attention.self.query", - "transformer.h.{bid}.attn.q_proj", - "model.layers.layers.{bid}.self_attn.q_proj", - "model.layers.{bid}.attention.wq", - "transformer.decoder_layer.{bid}.multi_head_attention.query", - "transformer.h.{bid}.attn.attention.q_proj", - ), + "model.layers.{bid}.self_attn.q_proj", # llama-hf nemotron olmoe olmo2 + "model.layers.{bid}.self_attn.q_proj_no_perm", # llama-custom + "layers.{bid}.attention.wq", # llama-pth + "encoder.layer.{bid}.attention.self.query", # bert + "transformer.h.{bid}.attn.q_proj", # gpt-j + "model.layers.layers.{bid}.self_attn.q_proj", # plamo + "model.layers.{bid}.attention.wq", # internlm2 + "transformer.decoder_layer.{bid}.multi_head_attention.query", # Grok + "transformer.h.{bid}.attn.attention.q_proj", # exaone + ), + # Attention key MODEL_TENSOR.ATTN_K: ( - "model.layers.{bid}.self_attn.k_proj", - "layers.{bid}.attention.wk", - "encoder.layer.{bid}.attention.self.key", - "transformer.h.{bid}.attn.k_proj", - "transformer.h.{bid}.attn.k", - "model.layers.layers.{bid}.self_attn.k_proj", - "model.layers.{bid}.attention.wk", - "transformer.decoder_layer.{bid}.multi_head_attention.key", - "transformer.h.{bid}.attn.attention.k_proj", - ), + "model.layers.{bid}.self_attn.k_proj", # llama-hf nemotron olmoe olmo2 + "model.layers.{bid}.self_attn.k_proj_no_perm", # llama-custom + "layers.{bid}.attention.wk", # llama-pth + "encoder.layer.{bid}.attention.self.key", # bert + "transformer.h.{bid}.attn.k_proj", # gpt-j + "transformer.h.{bid}.attn.k", # refact + "model.layers.layers.{bid}.self_attn.k_proj", # plamo + "model.layers.{bid}.attention.wk", # internlm2 + "transformer.decoder_layer.{bid}.multi_head_attention.key", # Grok + "transformer.h.{bid}.attn.attention.k_proj", # exaone + ), + # Attention value MODEL_TENSOR.ATTN_V: ( - "model.layers.{bid}.self_attn.v_proj", - "layers.{bid}.attention.wv", - "encoder.layer.{bid}.attention.self.value", - "transformer.h.{bid}.attn.v_proj", - "transformer.h.{bid}.attn.v", - "model.layers.layers.{bid}.self_attn.v_proj", - "model.layers.{bid}.attention.wv", - "transformer.decoder_layer.{bid}.multi_head_attention.value", - "transformer.h.{bid}.attn.attention.v_proj", - ), + "model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo2 + "layers.{bid}.attention.wv", # llama-pth + "encoder.layer.{bid}.attention.self.value", # bert + "transformer.h.{bid}.attn.v_proj", # gpt-j + "transformer.h.{bid}.attn.v", # refact + "model.layers.layers.{bid}.self_attn.v_proj", # plamo + "model.layers.{bid}.attention.wv", # internlm2 + "transformer.decoder_layer.{bid}.multi_head_attention.value", # Grok + "transformer.h.{bid}.attn.attention.v_proj", # exaone + ), + # Attention output MODEL_TENSOR.ATTN_OUT: ( - "gpt_neox.layers.{bid}.attention.dense", - "transformer.h.{bid}.attn.c_proj", - "transformer.blocks.{bid}.attn.out_proj", - "transformer.h.{bid}.self_attention.dense", - "h.{bid}.self_attention.dense", - "model.layers.{bid}.self_attn.o_proj", - 
"layers.{bid}.attention.wo", - "encoder.layer.{bid}.attention.output.dense", - "transformer.h.{bid}.attn.out_proj", - "language_model.encoder.layers.{bid}.self_attention.dense", - "model.layers.{bid}.self_attn.dense", - "h.{bid}.attn.c_proj", - "transformer.h.{bid}.mixer.out_proj", - "model.layers.layers.{bid}.self_attn.o_proj", - "model.layers.{bid}.attention.wo", - "encoder.layers.{bid}.attn.out_proj", - "transformer.decoder_layer.{bid}.multi_head_attention.linear", - "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", - "encoder.layers.{bid}.self_attention.dense", - "transformer.layers.{bid}.attn.out_proj", - "transformer.h.{bid}.attn.attention.out_proj", - ), + "gpt_neox.layers.{bid}.attention.dense", # gptneox + "transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen jais + "transformer.blocks.{bid}.attn.out_proj", # mpt + "transformer.h.{bid}.self_attention.dense", # falcon + "h.{bid}.self_attention.dense", # bloom + "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2 + "model.layers.{bid}.self_attn.linear_attn", # deci + "layers.{bid}.attention.wo", # llama-pth + "encoder.layer.{bid}.attention.output.dense", # bert + "transformer.h.{bid}.attn.out_proj", # gpt-j + "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon + "model.layers.{bid}.self_attn.dense", # persimmon + "h.{bid}.attn.c_proj", # gpt2 + "transformer.h.{bid}.mixer.out_proj", # phi2 + "model.layers.layers.{bid}.self_attn.o_proj", # plamo + "model.layers.{bid}.attention.wo", # internlm2 + "encoder.layers.{bid}.attn.out_proj", # nomic-bert + "transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok + "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx + "encoder.layers.{bid}.self_attention.dense", # chatglm + "transformer.layers.{bid}.attn.out_proj", # openelm + "transformer.h.{bid}.attn.attention.out_proj", # exaone + ), + # Attention output norm MODEL_TENSOR.ATTN_OUT_NORM: ( - "encoder.layer.{bid}.attention.output.LayerNorm", - "encoder.layers.{bid}.norm1", - "transformer.decoder_layer.{bid}.rms_norm_1", - "transformer.blocks.{bid}.norm_attn_norm.norm_2", + "encoder.layer.{bid}.attention.output.LayerNorm", # bert + "encoder.layers.{bid}.norm1", # nomic-bert + "transformer.decoder_layer.{bid}.rms_norm_1", # Grok + "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx ), - MODEL_TENSOR.ATTN_POST_NORM: ("model.layers.{bid}.post_attention_layernorm",), + MODEL_TENSOR.ATTN_POST_NORM: ( + "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2 + ), + # Rotary embeddings MODEL_TENSOR.ATTN_ROT_EMBD: ( - "model.layers.{bid}.self_attn.rotary_emb.inv_freq", - "layers.{bid}.attention.inner_attention.rope.freqs", - "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", - "transformer.h.{bid}.attn.rotary_emb.inv_freq", + "model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf + "layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth + "model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo + "transformer.h.{bid}.attn.rotary_emb.inv_freq", # codeshell ), + # Feed-forward norm MODEL_TENSOR.FFN_NORM: ( - "gpt_neox.layers.{bid}.post_attention_layernorm", - "transformer.h.{bid}.ln_2", - "h.{bid}.post_attention_layernorm", - "transformer.blocks.{bid}.norm_2", - "model.layers.{bid}.post_attention_layernorm", - "layers.{bid}.ffn_norm", - "language_model.encoder.layers.{bid}.post_attention_layernorm", - "model.layers.{bid}.ln2", - "h.{bid}.ln_2", - "model.layers.{bid}.ffn_norm", - "transformer.decoder_layer.{bid}.rms_norm_2", - 
"encoder.layers.{bid}.post_attention_layernorm", - "transformer.layers.{bid}.ffn_norm", - ), - MODEL_TENSOR.FFN_PRE_NORM: ("model.layers.{bid}.pre_feedforward_layernorm",), - MODEL_TENSOR.FFN_POST_NORM: ("model.layers.{bid}.post_feedforward_layernorm",), + "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox + "transformer.h.{bid}.ln_2", # gpt2 refact qwen jais exaone + "h.{bid}.post_attention_layernorm", # bloom + "transformer.blocks.{bid}.norm_2", # mpt + "model.layers.{bid}.post_attention_layernorm", # llama-hf nemotron olmoe + "layers.{bid}.ffn_norm", # llama-pth + "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon + "model.layers.{bid}.ln2", # yi + "h.{bid}.ln_2", # gpt2 + "model.layers.{bid}.ffn_norm", # internlm2 + "transformer.decoder_layer.{bid}.rms_norm_2", # Grok + "encoder.layers.{bid}.post_attention_layernorm", # chatglm + "transformer.layers.{bid}.ffn_norm", # openelm + ), + # Post feed-forward norm + MODEL_TENSOR.FFN_PRE_NORM: ( + "model.layers.{bid}.pre_feedforward_layernorm", # gemma2 + ), + # Post feed-forward norm + MODEL_TENSOR.FFN_POST_NORM: ( + "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2 + ), MODEL_TENSOR.FFN_GATE_INP: ( - "layers.{bid}.feed_forward.gate", - "model.layers.{bid}.block_sparse_moe.gate", - "model.layers.{bid}.mlp.gate", - "transformer.decoder_layer.{bid}.router", - "transformer.blocks.{bid}.ffn.router.layer", + "layers.{bid}.feed_forward.gate", # mixtral + "model.layers.{bid}.block_sparse_moe.gate", # mixtral + "model.layers.{bid}.mlp.gate", # qwen2moe olmoe + "transformer.decoder_layer.{bid}.router", # Grok + "transformer.blocks.{bid}.ffn.router.layer", # dbrx + "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe ), - MODEL_TENSOR.FFN_GATE_INP_SHEXP: ("model.layers.{bid}.mlp.shared_expert_gate",), + MODEL_TENSOR.FFN_GATE_INP_SHEXP: ( + "model.layers.{bid}.mlp.shared_expert_gate", # qwen2moe + ), + MODEL_TENSOR.FFN_EXP_PROBS_B: ( + "model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3 + ), + # Feed-forward up MODEL_TENSOR.FFN_UP: ( - "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", - "transformer.h.{bid}.mlp.c_fc", - "transformer.blocks.{bid}.ffn.up_proj", - "transformer.h.{bid}.mlp.dense_h_to_4h", - "h.{bid}.mlp.dense_h_to_4h", - "model.layers.{bid}.mlp.up_proj", - "layers.{bid}.feed_forward.w3", - "encoder.layer.{bid}.intermediate.dense", - "transformer.h.{bid}.mlp.fc_in", - "transformer.h.{bid}.mlp.linear_3", - "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", - "model.layers.{bid}.mlp.dense_h_to_4h", - "transformer.h.{bid}.mlp.w1", - "h.{bid}.mlp.c_fc", - "transformer.h.{bid}.mlp.fc1", - "model.layers.{bid}.mlp.fc1", - "model.layers.{bid}.mlp.gate_up_proj", - "model.layers.layers.{bid}.mlp.up_proj", - "model.layers.{bid}.feed_forward.w3", - "encoder.layers.{bid}.mlp.fc11", - "model.layers.{bid}.mlp.c_fc", - "encoder.layer.{bid}.mlp.gated_layers_v", - "model.layers.{bid}.residual_mlp.w3", - "encoder.layers.{bid}.mlp.dense_h_to_4h", - "transformer.h.{bid}.mlp.c_fc_1", + "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox + "transformer.h.{bid}.mlp.c_fc", # gpt2 jais + "transformer.blocks.{bid}.ffn.up_proj", # mpt + "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon + "h.{bid}.mlp.dense_h_to_4h", # bloom + "model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo2 + "layers.{bid}.feed_forward.w3", # llama-pth + "encoder.layer.{bid}.intermediate.dense", # bert + "transformer.h.{bid}.mlp.fc_in", # gpt-j + "transformer.h.{bid}.mlp.linear_3", # refact + 
"language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon + "model.layers.{bid}.mlp.dense_h_to_4h", # persimmon + "transformer.h.{bid}.mlp.w1", # qwen + "h.{bid}.mlp.c_fc", # gpt2 + "transformer.h.{bid}.mlp.fc1", # phi2 + "model.layers.{bid}.mlp.fc1", # phi2 + "model.layers.{bid}.mlp.gate_up_proj", # phi3 + "model.layers.layers.{bid}.mlp.up_proj", # plamo + "model.layers.{bid}.feed_forward.w3", # internlm2 + "encoder.layers.{bid}.mlp.fc11", # nomic-bert + "model.layers.{bid}.mlp.c_fc", # starcoder2 + "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2 + "model.layers.{bid}.residual_mlp.w3", # arctic + "encoder.layers.{bid}.mlp.dense_h_to_4h", # chatglm + "transformer.h.{bid}.mlp.c_fc_1", # exaone ), MODEL_TENSOR.FFN_UP_EXP: ( - "layers.{bid}.feed_forward.experts.w3", - "transformer.decoder_layer.{bid}.moe.linear_v", - "transformer.blocks.{bid}.ffn.experts.mlp.v1", - "model.layers.{bid}.mlp.experts.up_proj", + "layers.{bid}.feed_forward.experts.w3", # mixtral (merged) + "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged) + "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx + "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) ), MODEL_TENSOR.FFN_UP_SHEXP: ( - "model.layers.{bid}.mlp.shared_expert.up_proj", - "model.layers.{bid}.mlp.shared_experts.up_proj", + "model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe + "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2 ), - MODEL_TENSOR.FFN_ACT: ("transformer.blocks.{bid}.ffn.act",), + # AWQ-activation gate + MODEL_TENSOR.FFN_ACT: ("transformer.blocks.{bid}.ffn.act",), # mpt + # Feed-forward gate MODEL_TENSOR.FFN_GATE: ( - "model.layers.{bid}.mlp.gate_proj", - "layers.{bid}.feed_forward.w1", - "transformer.h.{bid}.mlp.w2", - "transformer.h.{bid}.mlp.c_fc2", - "model.layers.layers.{bid}.mlp.gate_proj", - "model.layers.{bid}.feed_forward.w1", - "encoder.layers.{bid}.mlp.fc12", - "encoder.layer.{bid}.mlp.gated_layers_w", - "transformer.h.{bid}.mlp.linear_1", - "model.layers.{bid}.residual_mlp.w1", - "transformer.h.{bid}.mlp.c_fc_0", + "model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo2 + "layers.{bid}.feed_forward.w1", # llama-pth + "transformer.h.{bid}.mlp.w2", # qwen + "transformer.h.{bid}.mlp.c_fc2", # jais + "model.layers.layers.{bid}.mlp.gate_proj", # plamo + "model.layers.{bid}.feed_forward.w1", # internlm2 + "encoder.layers.{bid}.mlp.fc12", # nomic-bert + "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2 + "transformer.h.{bid}.mlp.linear_1", # refact + "model.layers.{bid}.residual_mlp.w1", # arctic + "transformer.h.{bid}.mlp.c_fc_0", # exaone ), MODEL_TENSOR.FFN_GATE_EXP: ( - "layers.{bid}.feed_forward.experts.w1", - "transformer.decoder_layer.{bid}.moe.linear", - "transformer.blocks.{bid}.ffn.experts.mlp.w1", - "model.layers.{bid}.mlp.experts.gate_proj", + "layers.{bid}.feed_forward.experts.w1", # mixtral (merged) + "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) + "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx + "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe olmoe (merged) ), MODEL_TENSOR.FFN_GATE_SHEXP: ( - "model.layers.{bid}.mlp.shared_expert.gate_proj", - "model.layers.{bid}.mlp.shared_experts.gate_proj", + "model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe + "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2 ), + # Feed-forward down MODEL_TENSOR.FFN_DOWN: ( - "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", - "transformer.h.{bid}.mlp.c_proj", - "transformer.blocks.{bid}.ffn.down_proj", - 
"transformer.h.{bid}.mlp.dense_4h_to_h", - "h.{bid}.mlp.dense_4h_to_h", - "model.layers.{bid}.mlp.down_proj", - "layers.{bid}.feed_forward.w2", - "encoder.layer.{bid}.output.dense", - "transformer.h.{bid}.mlp.fc_out", - "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", - "model.layers.{bid}.mlp.dense_4h_to_h", - "h.{bid}.mlp.c_proj", - "transformer.h.{bid}.mlp.fc2", - "model.layers.{bid}.mlp.fc2", - "model.layers.layers.{bid}.mlp.down_proj", - "model.layers.{bid}.feed_forward.w2", - "encoder.layers.{bid}.mlp.fc2", - "model.layers.{bid}.mlp.c_proj", - "encoder.layer.{bid}.mlp.wo", - "transformer.layers.{bid}.ffn.proj_2", - "model.layers.{bid}.residual_mlp.w2", - "encoder.layer.{bid}.mlp.down_layer", - "encoder.layers.{bid}.mlp.dense_4h_to_h", - "model.layers.h.{bid}.mlp.c_proj", + "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox + "transformer.h.{bid}.mlp.c_proj", # gpt2 refact qwen jais + "transformer.blocks.{bid}.ffn.down_proj", # mpt + "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon + "h.{bid}.mlp.dense_4h_to_h", # bloom + "model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo2 + "layers.{bid}.feed_forward.w2", # llama-pth + "encoder.layer.{bid}.output.dense", # bert + "transformer.h.{bid}.mlp.fc_out", # gpt-j + "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon + "model.layers.{bid}.mlp.dense_4h_to_h", # persimmon + "h.{bid}.mlp.c_proj", # gpt2 + "transformer.h.{bid}.mlp.fc2", # phi2 + "model.layers.{bid}.mlp.fc2", # phi2 + "model.layers.layers.{bid}.mlp.down_proj", # plamo + "model.layers.{bid}.feed_forward.w2", # internlm2 + "encoder.layers.{bid}.mlp.fc2", # nomic-bert + "model.layers.{bid}.mlp.c_proj", # starcoder2 + "encoder.layer.{bid}.mlp.wo", # jina-bert-v2 + "transformer.layers.{bid}.ffn.proj_2", # openelm + "model.layers.{bid}.residual_mlp.w2", # arctic + "encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2 + "encoder.layers.{bid}.mlp.dense_4h_to_h", # chatglm + "model.layers.h.{bid}.mlp.c_proj", # exaone ), MODEL_TENSOR.FFN_DOWN_EXP: ( - "layers.{bid}.feed_forward.experts.w2", - "transformer.decoder_layer.{bid}.moe.linear_1", - "transformer.blocks.{bid}.ffn.experts.mlp.w2", - "model.layers.{bid}.mlp.experts.down_proj", + "layers.{bid}.feed_forward.experts.w2", # mixtral (merged) + "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged) + "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx + "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe olmoe (merged) + "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe ), MODEL_TENSOR.FFN_DOWN_SHEXP: ( - "model.layers.{bid}.mlp.shared_expert.down_proj", - "model.layers.{bid}.mlp.shared_experts.down_proj", + "model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe + "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2 ), MODEL_TENSOR.ATTN_Q_NORM: ( "language_model.encoder.layers.{bid}.self_attention.q_layernorm", - "model.layers.{bid}.self_attn.q_layernorm", - "model.layers.{bid}.self_attn.q_norm", - "transformer.blocks.{bid}.attn.q_ln", - "encoder.layer.{bid}.attention.self.layer_norm_q", - "transformer.layers.{bid}.attn.q_norm", + "model.layers.{bid}.self_attn.q_layernorm", # persimmon + "model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon olmo2 + "transformer.blocks.{bid}.attn.q_ln", # sea-lion + "encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2 + "transformer.layers.{bid}.attn.q_norm", # openelm ), MODEL_TENSOR.ATTN_K_NORM: ( "language_model.encoder.layers.{bid}.self_attention.k_layernorm", - 
"model.layers.{bid}.self_attn.k_layernorm", - "model.layers.{bid}.self_attn.k_norm", - "transformer.blocks.{bid}.attn.k_ln", - "encoder.layer.{bid}.attention.self.layer_norm_k", - "transformer.layers.{bid}.attn.k_norm", + "model.layers.{bid}.self_attn.k_layernorm", # persimmon + "model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon olmo2 + "transformer.blocks.{bid}.attn.k_ln", # sea-lion + "encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2 + "transformer.layers.{bid}.attn.k_norm", # openelm ), MODEL_TENSOR.ROPE_FREQS: ( - "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", + "language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon ), MODEL_TENSOR.LAYER_OUT_NORM: ( - "encoder.layer.{bid}.output.LayerNorm", - "encoder.layers.{bid}.norm2", - "transformer.decoder_layer.{bid}.rms_norm_3", - "encoder.layer.{bid}.mlp.layernorm", - "encoder.layer.{bid}.layer_norm_2", + "encoder.layer.{bid}.output.LayerNorm", # bert + "encoder.layers.{bid}.norm2", # nomic-bert + "transformer.decoder_layer.{bid}.rms_norm_3", # Grok + "encoder.layer.{bid}.mlp.layernorm", # jina-bert-v2 + "encoder.layer.{bid}.layer_norm_2", # jina-v2-code ), MODEL_TENSOR.SSM_IN: ( "model.layers.{bid}.in_proj", "backbone.layers.{bid}.mixer.in_proj", - "model.layers.{bid}.mamba.in_proj", ), MODEL_TENSOR.SSM_CONV1D: ( "model.layers.{bid}.conv1d", "backbone.layers.{bid}.mixer.conv1d", - "model.layers.{bid}.mamba.conv1d", ), MODEL_TENSOR.SSM_X: ( "model.layers.{bid}.x_proj", "backbone.layers.{bid}.mixer.x_proj", - "model.layers.{bid}.mamba.x_proj", ), MODEL_TENSOR.SSM_DT: ( "model.layers.{bid}.dt_proj", "backbone.layers.{bid}.mixer.dt_proj", - "model.layers.{bid}.mamba.dt_proj", ), - MODEL_TENSOR.SSM_DT_NORM: ("model.layers.{bid}.mamba.dt_layernorm",), MODEL_TENSOR.SSM_A: ( "model.layers.{bid}.A_log", "backbone.layers.{bid}.mixer.A_log", - "model.layers.{bid}.mamba.A_log", - ), - MODEL_TENSOR.SSM_B_NORM: ( - "model.layers.{bid}.mamba.b_layernorm", - "model.layers.{bid}.mamba.B_layernorm", - ), - MODEL_TENSOR.SSM_C_NORM: ( - "model.layers.{bid}.mamba.c_layernorm", - "model.layers.{bid}.mamba.C_layernorm", ), MODEL_TENSOR.SSM_D: ( "model.layers.{bid}.D", "backbone.layers.{bid}.mixer.D", - "model.layers.{bid}.mamba.D", ), MODEL_TENSOR.SSM_OUT: ( "model.layers.{bid}.out_proj", "backbone.layers.{bid}.mixer.out_proj", - "model.layers.{bid}.mamba.out_proj", - ), - MODEL_TENSOR.TIME_MIX_W1: ("rwkv.blocks.{bid}.attention.time_maa_w1",), - MODEL_TENSOR.TIME_MIX_W2: ("rwkv.blocks.{bid}.attention.time_maa_w2",), - MODEL_TENSOR.TIME_MIX_LERP_X: ("rwkv.blocks.{bid}.attention.time_maa_x",), - MODEL_TENSOR.TIME_MIX_LERP_K: ("rwkv.blocks.{bid}.attention.time_maa_k",), - MODEL_TENSOR.TIME_MIX_LERP_V: ("rwkv.blocks.{bid}.attention.time_maa_v",), - MODEL_TENSOR.TIME_MIX_LERP_R: ("rwkv.blocks.{bid}.attention.time_maa_r",), - MODEL_TENSOR.TIME_MIX_LERP_G: ("rwkv.blocks.{bid}.attention.time_maa_g",), - MODEL_TENSOR.TIME_MIX_LERP_W: ("rwkv.blocks.{bid}.attention.time_maa_w",), - MODEL_TENSOR.TIME_MIX_FIRST: ("rwkv.blocks.{bid}.attention.time_faaaa",), - MODEL_TENSOR.TIME_MIX_DECAY: ("rwkv.blocks.{bid}.attention.time_decay",), - MODEL_TENSOR.TIME_MIX_DECAY_W1: ("rwkv.blocks.{bid}.attention.time_decay_w1",), - MODEL_TENSOR.TIME_MIX_DECAY_W2: ("rwkv.blocks.{bid}.attention.time_decay_w2",), - MODEL_TENSOR.TIME_MIX_KEY: ("rwkv.blocks.{bid}.attention.key",), - MODEL_TENSOR.TIME_MIX_VALUE: ("rwkv.blocks.{bid}.attention.value",), - MODEL_TENSOR.TIME_MIX_RECEPTANCE: 
("rwkv.blocks.{bid}.attention.receptance",), - MODEL_TENSOR.TIME_MIX_GATE: ("rwkv.blocks.{bid}.attention.gate",), - MODEL_TENSOR.TIME_MIX_LN: ("rwkv.blocks.{bid}.attention.ln_x",), - MODEL_TENSOR.TIME_MIX_OUTPUT: ("rwkv.blocks.{bid}.attention.output",), - MODEL_TENSOR.CHANNEL_MIX_LERP_K: ("rwkv.blocks.{bid}.feed_forward.time_maa_k",), - MODEL_TENSOR.CHANNEL_MIX_LERP_R: ("rwkv.blocks.{bid}.feed_forward.time_maa_r",), - MODEL_TENSOR.CHANNEL_MIX_KEY: ("rwkv.blocks.{bid}.feed_forward.key",), + ), + MODEL_TENSOR.TIME_MIX_W1: ( + "rwkv.blocks.{bid}.attention.time_maa_w1", # rwkv v6 + ), + MODEL_TENSOR.TIME_MIX_W2: ( + "rwkv.blocks.{bid}.attention.time_maa_w2", # rwkv v6 + ), + MODEL_TENSOR.TIME_MIX_LERP_X: ( + "rwkv.blocks.{bid}.attention.time_maa_x", # rwkv v6 + ), + MODEL_TENSOR.TIME_MIX_LERP_K: ( + "rwkv.blocks.{bid}.attention.time_maa_k", # rwkv v6 + ), + MODEL_TENSOR.TIME_MIX_LERP_V: ( + "rwkv.blocks.{bid}.attention.time_maa_v", # rwkv v6 + ), + MODEL_TENSOR.TIME_MIX_LERP_R: ( + "rwkv.blocks.{bid}.attention.time_maa_r", # rwkv v6 + ), + MODEL_TENSOR.TIME_MIX_LERP_G: ( + "rwkv.blocks.{bid}.attention.time_maa_g", # rwkv v6 + ), + MODEL_TENSOR.TIME_MIX_LERP_W: ( + "rwkv.blocks.{bid}.attention.time_maa_w", # rwkv v6 + ), + MODEL_TENSOR.TIME_MIX_FIRST: ( + "rwkv.blocks.{bid}.attention.time_faaaa", # rwkv v6 + ), + MODEL_TENSOR.TIME_MIX_DECAY: ( + "rwkv.blocks.{bid}.attention.time_decay", # rwkv v6 + ), + MODEL_TENSOR.TIME_MIX_DECAY_W1: ( + "rwkv.blocks.{bid}.attention.time_decay_w1", # rwkv v6 + ), + MODEL_TENSOR.TIME_MIX_DECAY_W2: ( + "rwkv.blocks.{bid}.attention.time_decay_w2", # rwkv v6 + ), + MODEL_TENSOR.TIME_MIX_KEY: ("rwkv.blocks.{bid}.attention.key",), # rwkv + MODEL_TENSOR.TIME_MIX_VALUE: ("rwkv.blocks.{bid}.attention.value",), # rwkv + MODEL_TENSOR.TIME_MIX_RECEPTANCE: ( + "rwkv.blocks.{bid}.attention.receptance", # rwkv + ), + MODEL_TENSOR.TIME_MIX_GATE: ("rwkv.blocks.{bid}.attention.gate",), # rwkv + MODEL_TENSOR.TIME_MIX_LN: ("rwkv.blocks.{bid}.attention.ln_x",), # rwkv + MODEL_TENSOR.TIME_MIX_OUTPUT: ("rwkv.blocks.{bid}.attention.output",), # rwkv + MODEL_TENSOR.CHANNEL_MIX_LERP_K: ( + "rwkv.blocks.{bid}.feed_forward.time_maa_k", # rwkv v6 + ), + MODEL_TENSOR.CHANNEL_MIX_LERP_R: ( + "rwkv.blocks.{bid}.feed_forward.time_maa_r", # rwkv v6 + ), + MODEL_TENSOR.CHANNEL_MIX_KEY: ("rwkv.blocks.{bid}.feed_forward.key",), # rwkv MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: ( - "rwkv.blocks.{bid}.feed_forward.receptance", + "rwkv.blocks.{bid}.feed_forward.receptance", # rwkv ), - MODEL_TENSOR.CHANNEL_MIX_VALUE: ("rwkv.blocks.{bid}.feed_forward.value",), - MODEL_TENSOR.ATTN_Q_A: ("model.layers.{bid}.self_attn.q_a_proj",), - MODEL_TENSOR.ATTN_Q_B: ("model.layers.{bid}.self_attn.q_b_proj",), + MODEL_TENSOR.CHANNEL_MIX_VALUE: ( + "rwkv.blocks.{bid}.feed_forward.value", # rwkv + ), + MODEL_TENSOR.ATTN_Q_A: ("model.layers.{bid}.self_attn.q_a_proj",), # deepseek2 + MODEL_TENSOR.ATTN_Q_B: ("model.layers.{bid}.self_attn.q_b_proj",), # deepseek2 MODEL_TENSOR.ATTN_KV_A_MQA: ( - "model.layers.{bid}.self_attn.kv_a_proj_with_mqa", - ), - MODEL_TENSOR.ATTN_KV_B: ("model.layers.{bid}.self_attn.kv_b_proj",), - MODEL_TENSOR.ATTN_Q_A_NORM: ("model.layers.{bid}.self_attn.q_a_layernorm",), - MODEL_TENSOR.ATTN_KV_A_NORM: ("model.layers.{bid}.self_attn.kv_a_layernorm",), - MODEL_TENSOR.ATTN_SUB_NORM: ("model.layers.{bid}.self_attn.inner_attn_ln",), - MODEL_TENSOR.FFN_SUB_NORM: ("model.layers.{bid}.mlp.ffn_layernorm",), - MODEL_TENSOR.DEC_ATTN_NORM: ("decoder.block.{bid}.layer.0.layer_norm",), - MODEL_TENSOR.DEC_ATTN_Q: 
("decoder.block.{bid}.layer.0.SelfAttention.q",), - MODEL_TENSOR.DEC_ATTN_K: ("decoder.block.{bid}.layer.0.SelfAttention.k",), - MODEL_TENSOR.DEC_ATTN_V: ("decoder.block.{bid}.layer.0.SelfAttention.v",), - MODEL_TENSOR.DEC_ATTN_OUT: ("decoder.block.{bid}.layer.0.SelfAttention.o",), + "model.layers.{bid}.self_attn.kv_a_proj_with_mqa", # deepseek2 + ), + MODEL_TENSOR.ATTN_KV_B: ( + "model.layers.{bid}.self_attn.kv_b_proj", # deepseek2 + ), + MODEL_TENSOR.ATTN_Q_A_NORM: ( + "model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2 + ), + MODEL_TENSOR.ATTN_KV_A_NORM: ( + "model.layers.{bid}.self_attn.kv_a_layernorm", # deepseek2 + ), + MODEL_TENSOR.ATTN_SUB_NORM: ( + "model.layers.{bid}.self_attn.inner_attn_ln", # bitnet + ), + MODEL_TENSOR.FFN_SUB_NORM: ("model.layers.{bid}.mlp.ffn_layernorm",), # bitnet + MODEL_TENSOR.DEC_ATTN_NORM: ("decoder.block.{bid}.layer.0.layer_norm",), # t5 + MODEL_TENSOR.DEC_ATTN_Q: ("decoder.block.{bid}.layer.0.SelfAttention.q",), # t5 + MODEL_TENSOR.DEC_ATTN_K: ("decoder.block.{bid}.layer.0.SelfAttention.k",), # t5 + MODEL_TENSOR.DEC_ATTN_V: ("decoder.block.{bid}.layer.0.SelfAttention.v",), # t5 + MODEL_TENSOR.DEC_ATTN_OUT: ( + "decoder.block.{bid}.layer.0.SelfAttention.o", # t5 + ), MODEL_TENSOR.DEC_ATTN_REL_B: ( - "decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", + "decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5 + ), + MODEL_TENSOR.DEC_CROSS_ATTN_NORM: ( + "decoder.block.{bid}.layer.1.layer_norm", # t5 ), - MODEL_TENSOR.DEC_CROSS_ATTN_NORM: ("decoder.block.{bid}.layer.1.layer_norm",), MODEL_TENSOR.DEC_CROSS_ATTN_Q: ( - "decoder.block.{bid}.layer.1.EncDecAttention.q", + "decoder.block.{bid}.layer.1.EncDecAttention.q", # t5 ), MODEL_TENSOR.DEC_CROSS_ATTN_K: ( - "decoder.block.{bid}.layer.1.EncDecAttention.k", + "decoder.block.{bid}.layer.1.EncDecAttention.k", # t5 ), MODEL_TENSOR.DEC_CROSS_ATTN_V: ( - "decoder.block.{bid}.layer.1.EncDecAttention.v", + "decoder.block.{bid}.layer.1.EncDecAttention.v", # t5 ), MODEL_TENSOR.DEC_CROSS_ATTN_OUT: ( - "decoder.block.{bid}.layer.1.EncDecAttention.o", + "decoder.block.{bid}.layer.1.EncDecAttention.o", # t5 ), MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: ( - "decoder.block.{bid}.layer.1.EncDecAttention.relative_attention_bias", + "decoder.block.{bid}.layer.1.EncDecAttention.relative_attention_bias", # t5 + ), + MODEL_TENSOR.DEC_FFN_NORM: ("decoder.block.{bid}.layer.2.layer_norm",), # t5 + MODEL_TENSOR.DEC_FFN_GATE: ( + "decoder.block.{bid}.layer.2.DenseReluDense.wi_0", # flan-t5 ), - MODEL_TENSOR.DEC_FFN_NORM: ("decoder.block.{bid}.layer.2.layer_norm",), - MODEL_TENSOR.DEC_FFN_GATE: ("decoder.block.{bid}.layer.2.DenseReluDense.wi_0",), MODEL_TENSOR.DEC_FFN_UP: ( - "decoder.block.{bid}.layer.2.DenseReluDense.wi", - "decoder.block.{bid}.layer.2.DenseReluDense.wi_1", - ), - MODEL_TENSOR.DEC_FFN_DOWN: ("decoder.block.{bid}.layer.2.DenseReluDense.wo",), - MODEL_TENSOR.DEC_OUTPUT_NORM: ("decoder.final_layer_norm",), - MODEL_TENSOR.ENC_ATTN_NORM: ("encoder.block.{bid}.layer.0.layer_norm",), - MODEL_TENSOR.ENC_ATTN_Q: ("encoder.block.{bid}.layer.0.SelfAttention.q",), - MODEL_TENSOR.ENC_ATTN_K: ("encoder.block.{bid}.layer.0.SelfAttention.k",), - MODEL_TENSOR.ENC_ATTN_V: ("encoder.block.{bid}.layer.0.SelfAttention.v",), - MODEL_TENSOR.ENC_ATTN_OUT: ("encoder.block.{bid}.layer.0.SelfAttention.o",), + "decoder.block.{bid}.layer.2.DenseReluDense.wi", # t5 + "decoder.block.{bid}.layer.2.DenseReluDense.wi_1", # flan-t5 + ), + MODEL_TENSOR.DEC_FFN_DOWN: ( + "decoder.block.{bid}.layer.2.DenseReluDense.wo", 
# t5 + ), + MODEL_TENSOR.DEC_OUTPUT_NORM: ("decoder.final_layer_norm",), # t5 + MODEL_TENSOR.ENC_ATTN_NORM: ("encoder.block.{bid}.layer.0.layer_norm",), # t5 + MODEL_TENSOR.ENC_ATTN_Q: ("encoder.block.{bid}.layer.0.SelfAttention.q",), # t5 + MODEL_TENSOR.ENC_ATTN_K: ("encoder.block.{bid}.layer.0.SelfAttention.k",), # t5 + MODEL_TENSOR.ENC_ATTN_V: ("encoder.block.{bid}.layer.0.SelfAttention.v",), # t5 + MODEL_TENSOR.ENC_ATTN_OUT: ( + "encoder.block.{bid}.layer.0.SelfAttention.o", # t5 + ), MODEL_TENSOR.ENC_ATTN_REL_B: ( - "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", + "encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", # t5 + ), + MODEL_TENSOR.ENC_FFN_NORM: ("encoder.block.{bid}.layer.1.layer_norm",), # t5 + MODEL_TENSOR.ENC_FFN_GATE: ( + "encoder.block.{bid}.layer.1.DenseReluDense.wi_0", # flan-t5 ), - MODEL_TENSOR.ENC_FFN_NORM: ("encoder.block.{bid}.layer.1.layer_norm",), - MODEL_TENSOR.ENC_FFN_GATE: ("encoder.block.{bid}.layer.1.DenseReluDense.wi_0",), MODEL_TENSOR.ENC_FFN_UP: ( - "encoder.block.{bid}.layer.1.DenseReluDense.wi", - "encoder.block.{bid}.layer.1.DenseReluDense.wi_1", + "encoder.block.{bid}.layer.1.DenseReluDense.wi", # t5 + "encoder.block.{bid}.layer.1.DenseReluDense.wi_1", # flan-t5 + ), + MODEL_TENSOR.ENC_FFN_DOWN: ( + "encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5 + ), + ############################################################################ + # TODO: these do not belong to block_mappings_cfg - move them to mappings_cfg + MODEL_TENSOR.ENC_OUTPUT_NORM: ("encoder.final_layer_norm",), # t5 + MODEL_TENSOR.CLS: ( + "classifier", # jina + "classifier.dense", # roberta + ), + MODEL_TENSOR.CLS_OUT: ("classifier.out_proj",), # roberta + ############################################################################# + MODEL_TENSOR.CONVNEXT_DW: ("backbone.convnext.{bid}.dwconv",), # wavtokenizer + MODEL_TENSOR.CONVNEXT_NORM: ("backbone.convnext.{bid}.norm",), # wavtokenizer + MODEL_TENSOR.CONVNEXT_PW1: ("backbone.convnext.{bid}.pwconv1",), # wavtokenizer + MODEL_TENSOR.CONVNEXT_PW2: ("backbone.convnext.{bid}.pwconv2",), # wavtokenizer + MODEL_TENSOR.CONVNEXT_GAMMA: ("backbone.convnext.{bid}.gamma",), # wavtokenizer + MODEL_TENSOR.POSNET_CONV1: ("backbone.posnet.{bid}.conv1",), # wavtokenizer + MODEL_TENSOR.POSNET_CONV2: ("backbone.posnet.{bid}.conv2",), # wavtokenizer + MODEL_TENSOR.POSNET_NORM: ("backbone.posnet.{bid}.norm",), # wavtokenizer + MODEL_TENSOR.POSNET_NORM1: ("backbone.posnet.{bid}.norm1",), # wavtokenizer + MODEL_TENSOR.POSNET_NORM2: ("backbone.posnet.{bid}.norm2",), # wavtokenizer + MODEL_TENSOR.POSNET_ATTN_NORM: ("backbone.posnet.{bid}.norm",), # wavtokenizer + MODEL_TENSOR.POSNET_ATTN_Q: ("backbone.posnet.{bid}.q",), # wavtokenizer + MODEL_TENSOR.POSNET_ATTN_K: ("backbone.posnet.{bid}.k",), # wavtokenizer + MODEL_TENSOR.POSNET_ATTN_V: ("backbone.posnet.{bid}.v",), # wavtokenizer + MODEL_TENSOR.POSNET_ATTN_OUT: ( + "backbone.posnet.{bid}.proj_out", # wavtokenizer ), - MODEL_TENSOR.ENC_FFN_DOWN: ("encoder.block.{bid}.layer.1.DenseReluDense.wo",), - MODEL_TENSOR.ENC_OUTPUT_NORM: ("encoder.final_layer_norm",), } + # architecture-specific block mappings arch_block_mappings_cfg: dict[MODEL_ARCH, dict[MODEL_TENSOR, tuple[str, ...]]] = { MODEL_ARCH.ARCTIC: { MODEL_TENSOR.FFN_NORM: ("model.layers.{bid}.residual_layernorm",), diff --git a/src/gguf/vocab.py b/src/gguf/vocab.py index ea92c4e..3aa13ea 100644 --- a/src/gguf/vocab.py +++ b/src/gguf/vocab.py @@ -157,8 +157,36 @@ def _try_load_from_tokenizer_json(self, path: Path) 
-> bool: tokenizer = json.load(f) if self.load_merges: merges = tokenizer.get("model", {}).get("merges") - if isinstance(merges, list) and merges and isinstance(merges[0], str): - self.merges = merges + if isinstance(merges, list) and merges: + if isinstance(merges[0], str): + self.merges = merges + elif ( + isinstance(merges[0], list) + and len(merges[0]) == 2 + and isinstance(merges[0][0], str) + ): + # New format since transformers 4.45 to support spaces in merges + # ref: https://github.com/ggerganov/llama.cpp/issues/9692 + # TODO: internally store as the new format instead of converting to old + if any(" " in s for pair in merges for s in pair): + logger.warning( + f'Spaces in merges detected, encoding as {chr(ord(" ") + 256)!r}' + ) + self.merges = [ + " ".join( + [ + # ensure the spaces are properly encoded + "".join( + chr(ord(c) + 256) if c == " " else c + for c in part + ) + for part in pair + ] + ) + for pair in merges + ] + else: + raise ValueError("Unknown tokenizer merges format") added_tokens = tokenizer.get("added_tokens", {}) else: added_tokens = {} @@ -225,7 +253,6 @@ class Vocab(BaseVocab, Protocol): fname_tokenizer: Path def __init__(self, base_path: Path): ... - def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ... diff --git a/src/globals.py b/src/globals.py index 68f0d6e..97af270 100644 --- a/src/globals.py +++ b/src/globals.py @@ -80,11 +80,15 @@ def load_dotenv(self=Any) -> None: def show_about(self) -> None: - about_text = ( - "AutoGGUF\n\n" - f"Version: {AUTOGGUF_VERSION}\n\n" - "A tool for managing and converting GGUF models." - ) + about_text = f"""AutoGGUF + +Version: {AUTOGGUF_VERSION} + +A tool for managing and converting GGUF models. +This application is licensed under the Apache License 2.0. +Copyright (c) 2025 leafspark. +It also utilizes llama.cpp, licensed under the MIT License. +Copyright (c) 2023-2024 The ggml authors.""" QMessageBox.about(self, "About AutoGGUF", about_text)
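(Editor's aside, not part of the patch.) The vocab.py hunk above adds support for the merges format introduced in transformers 4.45, where each merge in tokenizer.json is a two-element list instead of a single space-separated string, so a literal space inside a merge part would otherwise be ambiguous. The patch keeps the old single-string representation internally and escapes embedded spaces by shifting them up 256 code points (' ' becomes 'Ġ'). Below is a small self-contained sketch of the same normalization; the function name and sample merges are invented for illustration:

```python
def normalize_merges(merges: list) -> list[str]:
    """Convert tokenizer.json merges to the old single-string format."""
    if not merges:
        return []
    if isinstance(merges[0], str):
        return merges  # old format: already "left right" strings
    if isinstance(merges[0], list) and len(merges[0]) == 2:
        def encode(part: str) -> str:
            # escape literal spaces so the joining space stays unambiguous
            return "".join(chr(ord(c) + 256) if c == " " else c for c in part)
        return [" ".join(encode(part) for part in pair) for pair in merges]
    raise ValueError("Unknown tokenizer merges format")

print(normalize_merges(["h e", "he llo"]))             # ['h e', 'he llo']
print(normalize_merges([["h", "e"], ["he", " llo"]]))  # ['h e', 'he Ġllo']
```

As the TODO in the patch notes, this conversion is a stop-gap until merges are stored internally in the new format.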