From 4d9634b0df58cf8c264417808b69e089d85ee08d Mon Sep 17 00:00:00 2001 From: Max Dawkins Date: Fri, 16 Feb 2024 12:30:22 -0500 Subject: [PATCH] Q4_1 quantization compiling to vmfb megacommit --- python/turbine_llamacpp/compile.py | 201 ++++++++++++++++---- python/turbine_llamacpp/ggml_structs.py | 116 ++++++++++- python/turbine_llamacpp/llamacpp_runner.py | 126 ++++++++++++ python/turbine_llamacpp/model.py | 48 ++--- python/turbine_llamacpp/model_downloader.py | 14 +- python/turbine_llamacpp/params.py | 83 +++++++- python/turbine_llamacpp/ref/openllama.py | 14 +- repack_gguf_params.py | 66 +++++++ 8 files changed, 587 insertions(+), 81 deletions(-) create mode 100644 python/turbine_llamacpp/llamacpp_runner.py create mode 100644 repack_gguf_params.py diff --git a/python/turbine_llamacpp/compile.py b/python/turbine_llamacpp/compile.py index 6a2ca90..a98fdb9 100644 --- a/python/turbine_llamacpp/compile.py +++ b/python/turbine_llamacpp/compile.py @@ -9,6 +9,7 @@ import argparse + parser = argparse.ArgumentParser() parser.add_argument( "--gguf_path", @@ -16,13 +17,85 @@ default="ggml-model-q8_0.gguf", help="path to gguf", ) +parser.add_argument( + "--irpa_path", + type=str, + default=None, + help="path to a .irpa file to generate new repacked parameters.", +) +parser.add_argument( + "--compile_to", default="torch", type=str, help="torch, linalg, vmfb" +) +parser.add_argument( + "--vmfb_path", type=str, default=None, help="Path/name to store compiled vmfb." +) +parser.add_argument("--device", type=str, default="llvm-cpu", help="llvm-cpu") +parser.add_argument( + "--quantization", + type=str, + default="", + help="Comma separated list of quantization types. Supported types are [Q4_1].", +) + -def create_direct_predict_internal_kv_module(model: LlamaCPP) -> CompiledModule: +def create_direct_predict_internal_kv_module( + hp: HParams, + compile_to=None, + device=None, + vmfb_path=None, + quantization=None, + irpa_path=None, +): """This compilation performs direct, non-sampled prediction. - It manages its kv cache and step states internally. + It manages its kv kv_cache and step states internally. """ + quant_types = quantization.split(",") + if irpa_path: + import iree.runtime as rt + + dequantize_types = [ + type + for type in [ + "F32", + "F16", + "Q4_0", + "Q4_1", + "Q5_0", + "Q5_1", + "Q8_0", + "Q8_1", + "Q2_K", + "Q3_K", + "Q4_K", + "Q5_K", + "Q6_K", + "Q8_K", + ] + if type not in quant_types + ] + # We can't match on this param yet for the quantization rewrite. 
+ dequantize_params = [ + "token_embd.weight", + ] + repacked_params = hp.repack_tensor_params( + dequantize_types=dequantize_types, + dequantize_params=dequantize_params, + dtype=torch.float32, + ) + rt.save_archive_file(repacked_params, irpa_path) + print(f"saved repacked parameters to {irpa_path}") + + # Replace tensor params for tracing with dequantized types for any type not + # listed in args.quantization + replaceable_types = [type for type in hp.supported_types if type not in quant_types] + # Replace Q4_1 tensors because of a rewrite trick for Q4_1 parameters + if "Q4_1" in quant_types: + replaceable_types.append("Q4_1") + hp.replace_quantized_tensors(replaceable_types=replaceable_types) + model = LlamaCPP(hp) + class LlamaDpisModule(CompiledModule): params = export_parameters( model.theta.params, @@ -30,69 +103,56 @@ class LlamaDpisModule(CompiledModule): name_mapper=lambda n: n.removeprefix("params."), ) current_seq_index = export_global(AbstractIndex, mutable=True) - cache_k = export_global( - model.cache_k, name="cache_k", uninitialized=True, mutable=True - ) - cache_v = export_global( - model.cache_v, name="cache_v", uninitialized=True, mutable=True - ) + kv_cache = export_global_tree(model.kv_cache, uninitialized=True, mutable=True) def run_initialize( - self, input_ids=AbstractTensor(model.hp.bs, None, dtype=torch.int32) + self, input_ids=AbstractTensor(model.hp.bs, None, dtype=torch.int64) ): - output_token, cache_k, cache_v = self._initialize( + output_token, *kv_cache = self._initialize( input_ids, - cache_k=self.cache_k, - cache_v=self.cache_v, + *self.kv_cache, constraints=[ input_ids.dynamic_dim(1) <= model.max_seqlen, ], ) self.current_seq_index = IREE.tensor_dim(input_ids, 1) - self.cache_k = cache_k - self.cache_v = cache_v + self.kv_cache = kv_cache return output_token - def run_forward(self, token0=AbstractTensor(1, 1, dtype=torch.int32)): + def run_forward(self, token0=AbstractTensor(1, 1, dtype=torch.int64)): seq_index_0 = self.current_seq_index # TODO: Torch currently has poor support for passing symints across # the tracing boundary, so we box it in a tensor and unbox it on the # inside. Once this restriction is relaxes, just pass it straight through. - seq_index_0_tensor = IREE.tensor_splat(value=seq_index_0, dtype=torch.int32) - output_token, cache_k, cache_v = self._decode_step( - token0, seq_index_0_tensor, self.cache_k, self.cache_v + seq_index_0_tensor = IREE.tensor_splat(value=seq_index_0, dtype=torch.int64) + output_token, *kv_cache = self._decode_step( + token0, seq_index_0_tensor, *self.kv_cache ) # TODO: Emit an assertion of some kind of overflowing max_seqlen. 
self.current_seq_index = seq_index_0 + 1 - self.cache_k = cache_k - self.cache_v = cache_v + self.kv_cache = kv_cache return output_token @jittable - def _initialize( - input_ids: torch.Tensor, cache_k: torch.Tensor, cache_v: torch.Tensor - ): + def _initialize(input_ids: torch.Tensor, *kv_cache): return ( model.forward( input_ids, 0, - local_cache_k=cache_k, - local_cache_v=cache_v, + local_kv_cache=kv_cache, ), - cache_k, - cache_v, + *kv_cache, ) @jittable def _decode_step( token0: torch.Tensor, index0: torch.Tensor, - cache_k: torch.Tensor, - cache_v: torch.Tensor, + *kv_cache, ): bs, sl_input = token0.shape - _, _, sl_k, *_ = cache_k.shape - _, _, sl_v, *_ = cache_v.shape + _, sl_k, *_ = kv_cache[0].shape + _, sl_v, *_ = kv_cache[0].shape index0_scalar = index0.item() # Torch is very picky that on the auto-regressive steps it knows # that the index0_scalar value (which is used to slice the caches) @@ -107,23 +167,86 @@ def _decode_step( model.forward( token0, index0_scalar, - local_cache_k=cache_k, - local_cache_v=cache_v, + local_kv_cache=kv_cache, ), - cache_k, - cache_v, + *kv_cache, ) - return LlamaDpisModule(import_to="import") + import_to = "INPUT" if compile_to == "linalg" else "IMPORT" + inst = LlamaDpisModule(import_to=import_to) + + quantized_param_names = get_quantized_param_name_dict(hp, quant_types) + # Only supporting rewrite for Q4_1 params right now. + if "Q4_1" in quantized_param_names and not compile_to == "linalg": + from shark_turbine.transforms.quantization import mm_group_quant + + mm_group_quant.MMGroupQuantRewriterPass( + CompiledModule.get_mlir_module(inst).operation, + group_size=32, + param_names=quantized_param_names["Q4_1"], + ).run() + module_str = str(CompiledModule.get_mlir_module(inst)) + if compile_to != "vmfb": + return module_str + else: + flags = [ + "--iree-input-type=torch", + "--mlir-print-debuginfo", + "--mlir-print-op-on-diagnostic=false", + "--iree-stream-resource-index-bits=64", + "--iree-vm-target-index-bits=64", + ] + if device == "cpu" or device == "llvm-cpu": + flags.extend( + [ + "--iree-llvmcpu-target-cpu-features=host", + "--iree-llvmcpu-target-triple=x86_64-linux-gnu", + "--iree-llvmcpu-enable-ukernels=all", + ] + ) + device = "llvm-cpu" + else: + print("Unknown device kind: ", device) + import iree.compiler as ireec + + flatbuffer_blob = ireec.compile_str( + module_str, + target_backends=[device], + extra_args=flags, + ) + if vmfb_path is None: + vmfb_path = f"output.vmfb" + with open(vmfb_path, "wb+") as f: + f.write(flatbuffer_blob) + print("saved to output.vmfb") + return module_str + + +def get_quantized_param_name_dict(hp: HParams, allowed_quant_types: list[str]): + quantized_param_names = {} + for tensor_name, quant_type in hp.replaced_quantized_tensors: + if quant_type in allowed_quant_types: + if quant_type in quantized_param_names: + quantized_param_names[quant_type].add(tensor_name) + else: + quantized_param_names[quant_type] = set([tensor_name]) + return quantized_param_names def main(): args = parser.parse_args() hp = HParams(args.gguf_path) - model = LlamaCPP(hp) - cm = create_direct_predict_internal_kv_module(model) + module_str = create_direct_predict_internal_kv_module( + hp, + args.compile_to, + args.device, + args.vmfb_path, + args.quantization, + args.irpa_path, + ) with open(f"output.mlir", "w+") as f: - f.write(str(CompiledModule.get_mlir_module(cm))) + f.write(module_str) + print("saved to output.mlir") if __name__ == "__main__": diff --git a/python/turbine_llamacpp/ggml_structs.py 
b/python/turbine_llamacpp/ggml_structs.py index 6e2c515..3f5236b 100644 --- a/python/turbine_llamacpp/ggml_structs.py +++ b/python/turbine_llamacpp/ggml_structs.py @@ -1,11 +1,13 @@ from typing import Generic, Optional, TypeVar from abc import ABC, abstractmethod from dataclasses import dataclass +import warnings import torch __all__ = [ "Q4_0", + "Q4_1", "Q8_0", "QuantizedTensor", ] @@ -53,6 +55,12 @@ def dequant_blocked(self, dtype: Optional[torch.dtype] = None) -> torch.Tensor: scaled = d * qs.to(dtype) return scaled + def repack_for_turbine(self, dtype: Optional[torch.dtype] = None): + warnings.warn( + f"Repacking quantized type Q8_0 not supported. Returning in GGUF format." + ) + return self.dequant(dtype), None, None + def __repr__(self): return f"Q8_0(d[{self.d.shape}]={self.d}, qs[{self.qs.shape}]={self.qs})" @@ -86,6 +94,7 @@ def unpack(self) -> Q8_0Struct: qs = blocks[..., 1:].view(torch.int8) return Q8_0Struct(self.shape, blocks, d, qs) + @dataclass class Q4_0Struct(UnpackedStruct): shape: list[int] @@ -103,16 +112,22 @@ def dequant_blocked(self, dtype: Optional[torch.dtype] = None) -> torch.Tensor: d = d.to(dtype) else: dtype = d.dtype - v1 = (qs & 0xF) - v2 = (qs >> 4) + v1 = qs & 0xF + v2 = qs >> 4 # Set up shape for combined unpacked dequants. target_shape = list(v1.shape) target_shape[-1] = v1.shape[-1] + v2.shape[-1] # Combining unpacked quants. - v3 = torch.cat([v1,v2],dim=-1) + v3 = torch.cat([v1, v2], dim=-1) scaled = d * (v3.to(dtype) - 8.0) return scaled + def repack_for_turbine(self, dtype: Optional[torch.dtype] = None): + warnings.warn( + f"Repacking quantized type Q4_0 not supported. Returning in GGUF format." + ) + return self.dequant(dtype), None, None + def __repr__(self): return f"Q4_0(d[{self.d.shape}]={self.d}, qs[{self.qs.shape}]={self.qs})" @@ -148,3 +163,98 @@ def unpack(self) -> Q4_0Struct: d = blocks[..., 0:1].view(torch.float16) qs = blocks[..., 1:].view(torch.uint8) return Q4_0Struct(self.shape, blocks, d, qs) + + +@dataclass +class Q4_1Struct(UnpackedStruct): + shape: list[int] + blocks: torch.Tensor + d: torch.Tensor + m: torch.Tensor + qs: torch.Tensor + + def dequant(self, dtype: Optional[torch.dtype] = None) -> torch.Tensor: + return self.dequant_blocked(dtype).reshape(self.shape) + + def dequant_blocked(self, dtype: Optional[torch.dtype] = None) -> torch.Tensor: + d = self.d + m = self.m + qs = self.qs + if dtype: + d = d.to(dtype) + m = m.to(dtype) + else: + dtype = d.dtype + v1 = qs & 0xF + v2 = qs >> 4 + # Set up shape for combined unpacked dequants. + target_shape = list(v1.shape) + target_shape[-1] = v1.shape[-1] + v2.shape[-1] + # Combining unpacked quants. + v3 = torch.cat([v1, v2], dim=-1) + scaled = (d * v3.to(dtype)) + m + return scaled + + # GGML packing of Q4 data is in the order: + # [0, 16, 1, 17, 2, 18, ...] + # We need to repack to the [0, 1, 2, ...] order. 
+ def reorder_q4_data(self, q4_tensor: torch.Tensor): + v1 = q4_tensor & 0xF + v2 = q4_tensor >> 4 + block_size = q4_tensor.size(-1) + even_idx = torch.tensor(range(0, block_size, 2)) + odd_idx = torch.tensor(range(1, block_size, 2)) + v1_even = v1.index_select(-1, even_idx) + v1_odd = v1.index_select(-1, odd_idx) + v2_even = v2.index_select(-1, even_idx) + v2_odd = v2.index_select(-1, odd_idx) + v1_packed = torch.bitwise_or(v1_even, v1_odd << 4) + v2_packed = torch.bitwise_or(v2_even, v2_odd << 4) + return torch.cat([v1_packed, v2_packed], dim=-1) + + def repack_for_turbine(self, dtype: Optional[torch.dtype] = None): + if not dtype: + dtype = self.d.dtype + weights = self.reorder_q4_data(self.qs) + scales = self.d + # GGML uses a positive scaled zero point, and turbine uses a negative + # unscaled zero point so we adjust the zero points accordingly. + zps = self.m / -self.d + return weights, scales.to(dtype), zps.to(dtype) + + def __repr__(self): + return f"Q4_1(d[{self.d.shape}]={self.d}, m[{self.m.shape}]={self.m}, qs[{self.qs.shape}]={self.qs})" + + +class Q4_1(QuantizedTensor[Q4_1Struct]): + """ + ``` + #define QK4_1 32 + typedef struct { + ggml_fp16_t d; // delta + ggml_fp16_t m; // min + uint8_t qs[QK4_1 / 2]; // nibbles / quants + } block_q4_1; + ``` + Dequant: + https://github.com/ggerganov/llama.cpp/blob/f026f8120f97090d34a52b3dc023c82e0ede3f7d/ggml-opencl.cpp#L131-L142 + """ + + def __init__(self, linear: torch.Tensor, shape: list[int]): + assert linear.dtype == torch.uint8 + self.linear = linear + self.shape = shape + + def unpack(self) -> Q4_1Struct: + # Blocks are 9 i16s, so start there. + # delta: 1 i16 + # quants: 8 i16s. (32 i4s -> 16 i8s -> 8 i16s) + linear_blocks = self.linear.view(torch.int16).reshape(-1, 10) + # Reblock to the result shape excluding the final dimension, which + # is expanded. + block_shape = self.shape[0:-1] + [-1, 10] + blocks = linear_blocks.reshape(block_shape) + d = blocks[..., 0:1].view(torch.float16) + m = blocks[..., 1:2].view(torch.float16) + qs = blocks[..., 2:].view(torch.uint8) + return Q4_1Struct(self.shape, blocks, d, m, qs) diff --git a/python/turbine_llamacpp/llamacpp_runner.py b/python/turbine_llamacpp/llamacpp_runner.py new file mode 100644 index 0000000..0730b5d --- /dev/null +++ b/python/turbine_llamacpp/llamacpp_runner.py @@ -0,0 +1,126 @@ +import argparse +from turbine_models.model_runner import vmfbRunner +from iree import runtime as ireert +import torch +import time +from turbine_llamacpp.params import * +from transformers import LlamaTokenizer + +parser = argparse.ArgumentParser() + +# TODO move common runner flags to generic flag file +parser.add_argument( + "--vmfb_path", + type=str, + default="output.vmfb", + help="path to vmfb containing compiled module", +) +parser.add_argument( + "--external_weight_path", + type=str, + default="reformatted_parameters.irpa", + help="path to external weight parameters", +) +parser.add_argument( + "--gguf_path", + type=str, + default="", + help="path to gguf file used to generate parameters", +) +parser.add_argument( + "--hf_model_path", + type=str, + default="openlm-research/open_llama_3b", + help="path to the hf model. 
Needed for tokenizer right now", +) +parser.add_argument( + "--device", + type=str, + default="local-task", + help="local-sync, local-task", +) +parser.add_argument( + "--prompt", + type=str, + default=" Q: What is the largest animal?\nA:", + help="prompt for llm model", +) + + +class SharkLLM(object): + def __init__(self, device, vmfb_path, external_weight_path): + self.runner = vmfbRunner( + device=device, + vmfb_path=vmfb_path, + external_weight_path=external_weight_path, + ) + self.model = self.runner.ctx.modules.llama_dpis + self.first_input = True + self.num_tokens = 0 + self.last_prompt = None + self.prev_token_len = 0 + + def format_out(self, results): + return results.to_host()[0][0] + + def generate(self, input_ids, tokenizer): + try: + turbine_results = [] + # Only need not seen token for init cache + # Because we have stored the res in KV-cache. + token_len = input_ids.shape[-1] + inputs = [ireert.asdevicearray(self.runner.config.device, input_ids)] + s = time.time() + results = self.model["run_initialize"](*inputs) # example_input_id + e = time.time() + print( + f"num_tokens: {token_len}, time_taken={e-s}, tok/second:{token_len/(e-s)}" + ) + token_len += 1 + self.first_input = False + s = time.time() + turbine_results.append(self.format_out(results)) + while self.format_out(results) != 2: + results = self.model["run_forward"](results) + # uncomment to see tokens as they are emitted + # print(f"turbine: {tokenizer.decode(self.format_out(results))}") + turbine_results.append(self.format_out(results)) + e = time.time() + decoded_tokens = len(turbine_results) + print( + f"Decode num_tokens: {decoded_tokens}, time_taken={e-s}, tok/second:{decoded_tokens/(e-s)}" + ) + self.prev_token_len = token_len + decoded_tokens + return turbine_results + except KeyboardInterrupt: + return turbine_results + + +def run_llm( + device, + prompt, + vmfb_path, + external_weight_path, + hf_model_path, +): + tokenizer = LlamaTokenizer.from_pretrained(hf_model_path) + input_ids = tokenizer(prompt, return_tensors="pt").input_ids + llm = SharkLLM( + device=device, + vmfb_path=vmfb_path, + external_weight_path=external_weight_path, + ) + print("generating turbine output: ") + return tokenizer.decode(llm.generate(input_ids, tokenizer=tokenizer)) + + +if __name__ == "__main__": + args = parser.parse_args() + turbine_output = run_llm( + args.device, + args.prompt, + args.vmfb_path, + args.external_weight_path, + args.hf_model_path, + ) + print(turbine_output) diff --git a/python/turbine_llamacpp/model.py b/python/turbine_llamacpp/model.py index e58898e..6d94168 100644 --- a/python/turbine_llamacpp/model.py +++ b/python/turbine_llamacpp/model.py @@ -16,6 +16,7 @@ ENABLE_DEBUG = False import argparse + parser = argparse.ArgumentParser() parser.add_argument( "--gguf_path", @@ -24,6 +25,7 @@ help="path to gguf", ) + def debug(*args): if ENABLE_DEBUG: print(*args) @@ -66,26 +68,18 @@ def __init__(self, hp: HParams): raise ValueError("Unsupported rotary embedding") # Initialize the KV cache. 
- self.cache_k = torch.empty( - ( - self.transformer_block_count, - self.hp.bs, - self.max_seqlen, - self.attention_head_count, - self.attention_head_dim, - ), - dtype=self.hp.dtype, - ) - self.cache_v = torch.empty( - ( - self.transformer_block_count, - self.hp.bs, - self.max_seqlen, - self.attention_head_count, - self.attention_head_dim, - ), - dtype=self.hp.dtype, - ) + self.kv_cache = [ + torch.empty( + ( + self.hp.bs, + self.max_seqlen, + self.attention_head_count, + self.attention_head_dim, + ), + dtype=self.hp.dtype, + ) + for i in range(self.transformer_block_count * 2) + ] def forward( self, @@ -93,8 +87,7 @@ def forward( start_index: int, *, return_logits: bool = False, - local_cache_k: Optional[torch.Tensor] = None, - local_cache_v: Optional[torch.Tensor] = None, + local_kv_cache: list[torch.Tensor] = None, ): bs, sl = tokens.shape assert bs == self.hp.bs, "Batch size mismatch vs params" @@ -113,17 +106,15 @@ def forward( ).type_as(h) # Allow either the global cache or a local set passed in parameters. - if local_cache_k is None: - local_cache_k = self.cache_k - if local_cache_v is None: - local_cache_v = self.cache_v + if local_kv_cache is None: + local_kv_cache = self.kv_cache # Transformer blocks. for block_idx in range(self.transformer_block_count): transformer_theta = self.theta("blk", block_idx) # Attention. - block_cache_k = local_cache_k[block_idx, ...] - block_cache_v = local_cache_v[block_idx, ...] + block_cache_k = local_kv_cache[block_idx] + block_cache_v = local_kv_cache[self.transformer_block_count + block_idx] attention_output = self.attention( transformer_theta, h, @@ -310,7 +301,6 @@ def create_rotary_embed_table(max_seqlen: int, dim: int, theta: float = 10000.0) args = parser.parse_args() torch.no_grad().__enter__() hp = HParams(args.gguf_path) - # print(hp) detokenizer = Detokenizer(hp) model = LlamaCPP(hp) start_index = 0 diff --git a/python/turbine_llamacpp/model_downloader.py b/python/turbine_llamacpp/model_downloader.py index ee6a7cc..48288e5 100644 --- a/python/turbine_llamacpp/model_downloader.py +++ b/python/turbine_llamacpp/model_downloader.py @@ -15,13 +15,19 @@ default="openlm-research/open_llama_3b", ) + def donwload_hf_model(hf_auth_token, hf_model_name): auth_token = hf_auth_token if len(hf_auth_token) != 0 else None model_name = hf_model_name.split("/")[-1] - snapshot_download(repo_id=hf_model_name, local_dir="downloaded_" + model_name, - local_dir_use_symlinks=False, revision="main", - token=auth_token) + snapshot_download( + repo_id=hf_model_name, + local_dir="downloaded_" + model_name, + local_dir_use_symlinks=False, + revision="main", + token=auth_token, + ) + if __name__ == "__main__": args = parser.parse_args() - donwload_hf_model(args.hf_auth_token, args.hf_model_name) \ No newline at end of file + donwload_hf_model(args.hf_auth_token, args.hf_model_name) diff --git a/python/turbine_llamacpp/params.py b/python/turbine_llamacpp/params.py index 01c8ff9..4704892 100644 --- a/python/turbine_llamacpp/params.py +++ b/python/turbine_llamacpp/params.py @@ -33,6 +33,8 @@ def as_qtensor(self) -> QuantizedTensor: tn = self.type_name if tn == "Q4_0": return self.as_q4_0() + if tn == "Q4_1": + return self.as_q4_1() if tn == "Q8_0": return self.as_q8_0() raise ValueError(f"Quantized type {tn} not supported") @@ -44,9 +46,11 @@ def as_tensor(self) -> torch.Tensor: raise ValueError(f"Tensor type {tn} not supported") def as_q4_0(self) -> Q4_0: - # import pdb; pdb.set_trace() return Q4_0(torch.tensor(self.data), self.shape) + def as_q4_1(self) -> Q4_1: + 
return Q4_1(torch.tensor(self.data), self.shape) + def as_q8_0(self) -> Q8_0: return Q8_0(torch.tensor(self.data), self.shape) @@ -72,6 +76,10 @@ def __init__( self.dtype = dtype self.rotary_emb_dtype = dtype + # Quantized tensor replacement + self.replaced_quantized_tensors = [] + self.supported_types = ["Q4_0", "Q4_1", "Q8_0"] + def _load_gguf(self, reader: GGUFReader): # Extract hyper-parameters. Adapted from gguf-dump.py for field in reader.fields.values(): @@ -86,7 +94,7 @@ def _load_gguf(self, reader: GGUFReader): else: self.tables[field.name] = field.parts # from IPython import embed - # embed() + # embed() # Extract tensors. for tensor in reader.tensors: @@ -111,6 +119,35 @@ def __contains__(self, k: str): def __iter__(self): return self.raw_params.__iter__() + def replace_quantized_tensors(self, replaceable_types: Optional[list[str]] = None): + if not replaceable_types: + replaceable_types = self.supported_types + else: + for type in replaceable_types: + if type not in self.supported_types: + raise ValueError(f"Replacement of type {type} not supported") + if self.dtype == torch.float32: + replacement_type_name = "F32" + elif self.dtype == torch.float16: + replacement_type_name = "F16" + else: + raise ValueError(f"Replacement into tensors of {self.dtype} not supported") + for tensor_name, model_tensor in self.tensors.items(): + if model_tensor.type_name in replaceable_types: + self.replaced_quantized_tensors.append( + (tensor_name, model_tensor.type_name) + ) + replacement_data = torch.zeros( + size=model_tensor.shape, dtype=self.dtype + ) + new_model_tensor = ModelTensor( + name=model_tensor.name, + shape=model_tensor.shape, + type_name=replacement_type_name, + data=replacement_data, + ) + self.tensors[tensor_name] = new_model_tensor + @property def tensor_params( self, @@ -164,6 +201,48 @@ def add_to_dict( add_to_dict(False, hp_tensor.name, hp_tensor.as_tensor()) return params_dict, qparams_dict + def repack_tensor_params( + self, + dequantize_types: list[str] = [], + dequantize_params: list[str] = [], + dtype: Optional[torch.dtype] = None, + dequantize_all: bool = False, + ) -> dict[str, torch.Tensor]: + if dtype is None: + dtype = self.dtype + reformatted_tensors = {} + for tensor_name, tensor in self.tensors.items(): + if not tensor.is_quantized or tensor.type_name not in self.supported_types: + reformatted_tensors[tensor_name] = np.ascontiguousarray( + tensor.as_tensor().detach().numpy() + ) + continue + if ( + dequantize_all + or tensor.type_name in dequantize_types + or tensor_name in dequantize_params + ): + reformatted_tensor = tensor.as_qtensor().unpack().dequant(dtype) + reformatted_tensors[tensor_name] = np.ascontiguousarray( + reformatted_tensor.detach().numpy() + ) + else: + reformatted_tensor, scales, zps = ( + tensor.as_qtensor().unpack().repack_for_turbine(dtype) + ) + reformatted_tensors[tensor_name] = np.ascontiguousarray( + reformatted_tensor.detach().numpy() + ) + if scales is not None: + reformatted_tensors[f"{tensor_name}_scale"] = np.ascontiguousarray( + scales.detach().numpy() + ) + if zps is not None: + reformatted_tensors[f"{tensor_name}_zp"] = np.ascontiguousarray( + zps.detach().numpy() + ) + return reformatted_tensors + def __repr__(self): parts = ["HParams(", " raw_params=["] diff --git a/python/turbine_llamacpp/ref/openllama.py b/python/turbine_llamacpp/ref/openllama.py index e59772f..0dca9fd 100644 --- a/python/turbine_llamacpp/ref/openllama.py +++ b/python/turbine_llamacpp/ref/openllama.py @@ -1,21 +1,25 @@ import torch from transformers import 
LlamaTokenizer, LlamaForCausalLM -model_path = '/home/stella/tmp/hf/open_llama_3b' +model_path = "/home/stella/tmp/hf/open_llama_3b" # model_path = 'openlm-research/open_llama_7b' tokenizer = LlamaTokenizer.from_pretrained(model_path) model = LlamaForCausalLM.from_pretrained( - model_path, torch_dtype=torch.float32, device_map='auto', + model_path, + torch_dtype=torch.float32, + device_map="auto", ) -prompt = 'Q: What is the largest animal?\nA:' +prompt = "Q: What is the largest animal?\nA:" input_ids = tokenizer(prompt, return_tensors="pt").input_ids print("INPUT IDS:", input_ids) + def get_token_from_logits(logits): return int(torch.argmax(logits[:, -1, :], dim=1)[0]) + all_tokens = [] outputs = model.forward(input_ids) token = get_token_from_logits(outputs.logits) @@ -26,7 +30,9 @@ def get_token_from_logits(logits): while token != 2: print(f"*** STEP {step} ***") step += 1 - outputs = model.forward(torch.tensor([[token]]), past_key_values=outputs.past_key_values) + outputs = model.forward( + torch.tensor([[token]]), past_key_values=outputs.past_key_values + ) token = get_token_from_logits(outputs.logits) all_tokens.append(token) print(" :OUTPUT TOKEN:", token, tokenizer.decode(token)) diff --git a/repack_gguf_params.py b/repack_gguf_params.py new file mode 100644 index 0000000..c2317cb --- /dev/null +++ b/repack_gguf_params.py @@ -0,0 +1,66 @@ +from turbine_llamacpp.params import HParams +import iree.runtime as rt +import torch +import numpy as np + +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument( + "--gguf_path", + type=str, + default="ggml-model-q4_1.gguf", + help="path to gguf.", +) +parser.add_argument( + "--irpa_path", + type=str, + default="reformatted_parameters.irpa", + help="path to irpa file to save reformatted parameters.", +) +parser.add_argument( + "--dequantize_params", + type=str, + default="token_embd.weight", + help="Comma separated list of parameter names to dequantize instead of repacking.", +) +parser.add_argument( + "--dequantize_all", + type=bool, + default=False, + help="dequantize all parameters instead of repacking them", +) + + +def main(): + args = parser.parse_args() + dequantize_params = args.dequantize_params.split(",") + # Only Q4_1 has repacking support right now. Dequantize all other types. + dequantize_types = [ + "F32", + "F16", + "Q4_0", + "Q5_0", + "Q5_1", + "Q8_0", + "Q8_1", + "Q2_K", + "Q3_K", + "Q4_K", + "Q5_K", + "Q6_K", + "Q8_K", + ] + hp = HParams(args.gguf_path) + formatted_params = hp.repack_tensor_params( + dequantize_types=dequantize_types, + dequantize_params=dequantize_params, + dtype=torch.float32, + dequantize_all=args.dequantize_all, + ) + rt.save_archive_file(formatted_params, args.irpa_path) + print(f"saved to {args.irpa_path}") + + +if __name__ == "__main__": + main()
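
Not part of the patch itself: below is a minimal end-to-end sketch of how the pieces added above are intended to fit together (repack a Q4_1 GGUF into an .irpa parameter archive, trace and compile to a vmfb, then run it). The file paths and the Hugging Face model name are assumptions for illustration; only functions and flags introduced or exercised by this patch are used.

from turbine_llamacpp.params import HParams
from turbine_llamacpp.compile import create_direct_predict_internal_kv_module
from turbine_llamacpp.llamacpp_runner import run_llm

# Assumed local GGUF file; any Q4_1-quantized llama.cpp model should work.
hp = HParams("ggml-model-q4_1.gguf")

# Keep Q4_1 tensors quantized so MMGroupQuantRewriterPass can rewrite them;
# all other tensor types are dequantized into the repacked .irpa archive
# written to irpa_path, and the compiled module is written to vmfb_path.
create_direct_predict_internal_kv_module(
    hp,
    compile_to="vmfb",
    device="llvm-cpu",
    vmfb_path="output.vmfb",
    quantization="Q4_1",
    irpa_path="reformatted_parameters.irpa",
)

# Run the compiled module with the repacked external parameters.
print(
    run_llm(
        device="local-task",
        prompt=" Q: What is the largest animal?\nA:",
        vmfb_path="output.vmfb",
        external_weight_path="reformatted_parameters.irpa",
        hf_model_path="openlm-research/open_llama_3b",  # used for the tokenizer only
    )
)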