pytorch · HDCharles · Jun 18, 2024 · Jun 15, 2024 · Jun 15, 2024 · Jun 18, 2024
diff --git a/.gitignore b/.gitignore
@@ -368,3 +368,6 @@ venv/
 # Log files
 *.log
 sweep/
+
+# Model checkpoints
+checkpoints/
diff --git a/scripts/convert_hf_checkpoint.py b/scripts/convert_hf_checkpoint.py
@@ -0,0 +1,149 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# copied from https://github.com/pytorch-labs/gpt-fast/blob/main/scripts/convert_hf_checkpoint.py
+import json
+import re
+import shutil
+import sys
+from pathlib import Path
+from typing import Optional
+
+import torch
+
+# support running without installing as a package
+wd = Path(__file__).parent.parent.resolve()
+sys.path.append(str(wd))
+
+from model import ModelArgs
+
+
+@torch.inference_mode()
+def convert_hf_checkpoint(
+    *,
+    checkpoint_dir: Path = Path("checkpoints/meta-Transformer/Transformer-2-7b-chat-hf"),
+    model_name: Optional[str] = None,
+) -> None:
+    if model_name is None:
+        model_name = checkpoint_dir.name
+
+    # Llama 3 8B doesn't need conversion; instead, the original/consolidated.NN.pth files
+    # need to be copied into model.pth.
+    # Llama 3 70B can't be easily merged into one model.pth file, though, since names of the
+    # weights is state dict are the same in each consolidated.NN.pth file. Thus, it is not
+    # currently supported.
+    # Along this, we need to copy the original/tokenizer.model file to tokenizer.model.tiktoken
+    is_llama3 = "Llama-3" in model_name
+    if is_llama3:
+        # Check if we have multiple original/consolidated.NN.pth files and report error
+        # if we do for Llama 3.
+        original_dir = checkpoint_dir / "original"
+        pattern = re.compile(r"^consolidated\.\d{2}\.pth$")
+        bin_files = [bin for bin in original_dir.iterdir() if pattern.match(bin.name)]
+        if len(bin_files) > 1:
+            raise ValueError(
+                f"Multiple consolidated.NN.pth files found in {original_dir}. "
+                "Merging them into one model.pth file is not supported for Llama 3.")
+
+
+    config = ModelArgs.from_name(model_name)
+    print(f"Model config {config.__dict__}")
+
+    # Load the json file containing weight mapping
+    if not is_llama3:
+        model_map_json = checkpoint_dir / "pytorch_model.bin.index.json"
+
+        assert model_map_json.is_file()
+
+        with open(model_map_json) as json_map:
+            bin_index = json.load(json_map)
+
+        weight_map = {
+            "model.embed_tokens.weight": "tok_embeddings.weight",
+            "model.layers.{}.self_attn.q_proj.weight": "layers.{}.attention.wq.weight",
+            "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attention.wk.weight",
+            "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attention.wv.weight",
+            "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attention.wo.weight",
+            'model.layers.{}.self_attn.rotary_emb.inv_freq': None,
+            'model.layers.{}.mlp.gate_proj.weight': 'layers.{}.feed_forward.w1.weight',
+            "model.layers.{}.mlp.up_proj.weight": "layers.{}.feed_forward.w3.weight",
+            "model.layers.{}.mlp.down_proj.weight": "layers.{}.feed_forward.w2.weight",
+            "model.layers.{}.input_layernorm.weight": "layers.{}.attention_norm.weight",
+            "model.layers.{}.post_attention_layernorm.weight": "layers.{}.ffn_norm.weight",
+            "model.norm.weight": "norm.weight",
+            "lm_head.weight": "output.weight",
+        }
+        bin_files = {checkpoint_dir / bin for bin in bin_index["weight_map"].values()}
+    else:
+        # There is no separate pytorch_model.bin.index.json file for llama3.
+        # Instead, we will just use all original/consolidated.NN.pth files.
+        # so, we use model.safetensors.index.json
+        weight_map = None
+        original_dir = checkpoint_dir / "original"
+        pattern = re.compile(r"^consolidated\.\d{2}\.pth$")
+        bin_files = {bin for bin in original_dir.iterdir() if pattern.match(bin.name)}
+
+
+    def permute(w, n_head):
+        dim = config.dim
+        return (
+            w.view(n_head, 2, config.head_dim // 2, dim)
+            .transpose(1, 2)
+            .reshape(config.head_dim * n_head, dim)
+        )
+
+    merged_result = {}
+    for file in sorted(bin_files):
+        state_dict = torch.load(str(file), map_location="cpu", mmap=True, weights_only=True)
+        merged_result.update(state_dict)
+    final_result = {}
+    if weight_map is not None:
+        for key, value in merged_result.items():
+            if "layers" in key:
+                abstract_key = re.sub(r'(\d+)', '{}', key)
+                layer_num = re.search(r'\d+', key).group(0)
+                new_key = weight_map[abstract_key]
+                if new_key is None:
+                    continue
+                new_key = new_key.format(layer_num)
+            else:
+                new_key = weight_map[key]
+
+            final_result[new_key] = value
+
+        for key in tuple(final_result.keys()):
+            if "wq" in key:
+                q = final_result[key]
+                k = final_result[key.replace("wq", "wk")]
+                v = final_result[key.replace("wq", "wv")]
+                q = permute(q, config.n_head)
+                k = permute(k, config.n_local_heads)
+                final_result[key.replace("wq", "wqkv")] = torch.cat([q, k, v])
+                del final_result[key]
+                del final_result[key.replace("wq", "wk")]
+                del final_result[key.replace("wq", "wv")]
+    else:
+        final_result = merged_result
+    print(f"Saving checkpoint to {checkpoint_dir / 'model.pth'}")
+    torch.save(final_result, checkpoint_dir / "model.pth")
+    if is_llama3:
+        original_dir = checkpoint_dir / "original"
+        tokenizer_model = original_dir / "tokenizer.model"
+        tokenizer_model_tiktoken = checkpoint_dir / "tokenizer.model"
+        print(f"Copying {tokenizer_model} to {tokenizer_model_tiktoken}")
+        shutil.copy(tokenizer_model, tokenizer_model_tiktoken)
+
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser(description='Convert HuggingFace checkpoint.')
+    parser.add_argument('--checkpoint_dir', type=Path, default=Path("checkpoints/meta-llama/llama-2-7b-chat-hf"))
+    parser.add_argument('--model_name', type=str, default=None)
+
+    args = parser.parse_args()
+    convert_hf_checkpoint(
+        checkpoint_dir=args.checkpoint_dir,
+        model_name=args.model_name,
+    )
diff --git a/scripts/download.py b/scripts/download.py
@@ -0,0 +1,32 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# copied from https://github.com/pytorch-labs/gpt-fast/blob/main/scripts/download.py
+import os
+from typing import Optional
+
+from requests.exceptions import HTTPError
+
+
+def hf_download(repo_id: Optional[str] = None, hf_token: Optional[str] = None) -> None:
+    from huggingface_hub import snapshot_download
+    os.makedirs(f"checkpoints/{repo_id}", exist_ok=True)
+    try:
+        snapshot_download(repo_id, local_dir=f"checkpoints/{repo_id}", local_dir_use_symlinks=False, token=hf_token)
+    except HTTPError as e:
+        if e.response.status_code == 401:
+            print("You need to pass a valid `--hf_token=...` to download private checkpoints.")
+        else:
+            raise e
+
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser(description='Download data from HuggingFace Hub.')
+    parser.add_argument('--repo_id', type=str, default="checkpoints/meta-llama/llama-2-7b-chat-hf", help='Repository ID to download from.')
+    parser.add_argument('--hf_token', type=str, default=None, help='HuggingFace API token.')
+
+    args = parser.parse_args()
+    hf_download(args.repo_id, args.hf_token)
diff --git a/scripts/prepare.sh b/scripts/prepare.sh
@@ -0,0 +1,4 @@
+python scripts/download.py --repo_id meta-llama/Llama-2-7b-chat-hf
+python scripts/download.py --repo_id meta-llama/Meta-Llama-3-8B
+python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/meta-llama/Llama-2-7b-chat-hf
+python scripts/convert_hf_checkpoint.py --checkpoint_dir checkpoints/meta-llama/Meta-Llama-3-8B
diff --git a/test/quantization/test_quant_api.py b/test/quantization/test_quant_api.py
@@ -242,7 +242,7 @@ def test_8da4w_gptq_quantizer(self):
         # should be similar to TorchCompileDynamicQuantizer
         precision = torch.bfloat16
         device = "cpu"
-        checkpoint_path = Path("../gpt-fast/checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
+        checkpoint_path = Path("../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
         model = Transformer.from_name(checkpoint_path.parent.name)
         checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True)
         model.load_state_dict(checkpoint, assign=True)
@@ -305,7 +305,7 @@ def test_8da4w_quantizer_eval(self):
 
         precision = torch.bfloat16
         device = "cpu"
-        checkpoint_path = Path("../gpt-fast/checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
+        checkpoint_path = Path("../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
         model = Transformer.from_name(checkpoint_path.parent.name)
         checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True)
         model.load_state_dict(checkpoint, assign=True)
@@ -341,7 +341,7 @@ def test_gptq_quantizer_int4wo(self):
         torchao._models.llama.model.use_index_put_for_kv_cache = True
         precision = torch.bfloat16
         device = "cuda"
-        checkpoint_path = Path("../gpt-fast/checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
+        checkpoint_path = Path("../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
         model = Transformer.from_name(checkpoint_path.parent.name)
         checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True)
         model.load_state_dict(checkpoint, assign=True)
@@ -402,7 +402,7 @@ def test_quantizer_int4wo(self):
         from torchao._models._eval import TransformerEvalWrapper
         precision = torch.bfloat16
         device = "cuda"
-        checkpoint_path = Path("../gpt-fast/checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
+        checkpoint_path = Path("../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
         model = Transformer.from_name(checkpoint_path.parent.name)
         checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True)
         model.load_state_dict(checkpoint, assign=True)
@@ -438,7 +438,7 @@ def test_eval_wrapper(self):
         from torchao._models._eval import TransformerEvalWrapper
         precision = torch.bfloat16
         device = "cuda"
-        checkpoint_path = Path("../gpt-fast/checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
+        checkpoint_path = Path("../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
         model = Transformer.from_name(checkpoint_path.parent.name)
         checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True)
         model.load_state_dict(checkpoint, assign=True)

diff --git a/torchao/_models/llama/README.md b/torchao/_models/llama/README.md
@@ -0,0 +1,12 @@
+# Llama Benchmarks
+
+The llama folder contains code/scripts for stable benchmarking of llama models.
+
+To get model weights, go to https://huggingface.co/meta-llama/Llama-2-7b and/or https://huggingface.co/meta-llama/Meta-Llama-3-8B
+and follow the steps to gain access.
+
+Then, from the torchao root directory, run `huggingface-cli login` and follow the steps to log in, then run `sh ./scripts/prepare.sh` to
+download and convert the model weights.
+
+Once done, you can execute benchmarks from the torchao/_models/llama dir with `sh benchmarks.sh`. You can also perform benchmarking
+directly using `generate.py`.
diff --git a/torchao/_models/llama/benchmarks.sh b/torchao/_models/llama/benchmarks.sh
@@ -1,4 +1,4 @@
-export CHECKPOINT_PATH=../../../../gpt-fast/checkpoints # path to checkpoints folder
+export CHECKPOINT_PATH=../../../checkpoints # path to checkpoints folder
 
 export MODEL_REPO=meta-llama/Meta-Llama-3-8B
 python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision torch.float32 --write_result benchmark_results.txt