AI-Hypercomputer · wang2yn84 · Jun 11, 2024 · May 31, 2024 · May 31, 2024 · May 31, 2024
diff --git a/README.md b/README.md
@@ -64,12 +64,21 @@ huggingface-cli download google/gemma-7b-pytorch --local-dir $input_ckpt_dir
 
 Need to manually modify the `config.json` in the checkpoint folder to make it a valid JSON file. (Replace `'` with `"`, remove the excessive `,` after the last item in the JSON object)
 
+## Mixtral
+### Get Mixtral Checkpoint from HuggingFace
+
+Please sign agreement on Huggingface website to access Mixtral checkpoints. Download Mixtral PyTorch checkpoint using huggingface-cli. Mixtral Tokenizer is included in the checkpoint.
+
+```bash
+huggingface-cli download mistralai/Mixtral-8x7B-v0.1 --local-dir $input_ckpt_dir
+```
+
 ## Run weight safetensor convert
 
 ```bash
 export input_ckpt_dir=Original llama weights directory
 export output_ckpt_dir=The output directory
-export model_name="llama-3" # or "llama-2", "gemma"
+export model_name="llama-3" # or "llama-2", "gemma", "mixtral"
 export quantize_weights=True # Whether to quantize weights
 export quantize_type="int8_per_channel" # "quantize_weights" needs to be turned on. Availabe quantize type: {"int8", "int4"} x {"per_channel", "blockwise"}, "int8_per_channel" is the default option if not specified.
 python -m convert_checkpoints --model_name=$model_name --input_checkpoint_dir=$input_ckpt_dir --output_checkpoint_dir=$output_ckpt_dir --quantize_type=$quantize_type
@@ -108,6 +117,11 @@ python run_interactive.py --size=70b --model_name=$model_name --batch_size=8 --m
 python run_interactive.py --model_name=$model_name --size=7b --batch_size=64 --max_cache_length=2048 --quantize_weights=$quantize --quantize_type=$quantize_type --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --sharding_config=default_shardings/$model_name.yaml
 ```
 
+## Mixtral 8x7b
+```bash
+python run_interactive.py --model_name=$model_name --batch_size=128 --max_cache_length=2048 --quantize_weights=$quantize --quantize_type=$quantize_type --quantize_kv_cache=$quantize --checkpoint_path=$output_ckpt_dir --tokenizer_path=$tokenizer_path --sharding_config=default_shardings/$model_name.yaml
+```
+
 
 # Run the server
 Here is an example to run the server with llama2 7B config.

diff --git a/convert_checkpoints.py b/convert_checkpoints.py
@@ -26,6 +26,7 @@
 import hashlib
 import json
 import os
+import re
 import time
 
 import torch
@@ -37,6 +38,8 @@
 from jetstream_pt.config import FLAGS
 from jetstream_pt.third_party.gemma import model as gemma_model
 from jetstream_pt.third_party.llama import model_exportable as llama_model
+from jetstream_pt.third_party.mixtral import model as mixtral_model
+
 from safetensors import safe_open
 from safetensors.torch import save_file
 
@@ -123,6 +126,12 @@ def _quantize_state_dict(
         block_size = orig_block_size
         n_bit = orig_n_bit
   state_dict.update(updated_weights)
+  for k, v in state_dict.items():
+    if "layers" in k and "layers.0" not in k:
+      continue
+    print(
+        f"After quantization the converted key: {k} and value: {v.shape} {v.dtype}"
+    )
   return state_dict
 
 
@@ -462,6 +471,89 @@ def _get_gemma_state_dict(input_ckpt_dir):
   return state_dict, model_config
 
 
+def _get_mixtral_state_dict(input_ckpt_dir):
+  ckpt_files = list(input_ckpt_dir.glob("*.pt"))
+  assert len(ckpt_files) == 8, "only expect 8 ckpt file for Mistral model."
+
+  start = time.perf_counter()
+  state_dict = {}
+  for file in sorted(ckpt_files):
+    ckpt = torch.load(
+        str(file), map_location="cpu", mmap=True, weights_only=True
+    )
+    state_dict.update(ckpt)
+  end = time.perf_counter()
+  print(f"Loading checkpoints takes {end - start} seconds")
+
+  for k, v in state_dict.items():
+    if "layers" in k and "layers.0" not in k:
+      continue
+    print(f"The loaded key: {k} and value: {v.shape} {v.dtype}")
+
+  config = json.loads((input_ckpt_dir / "config.json").read_text())
+  print(f"Loaded config: {config}")
+  weight_map = {
+      "layers.{}.block_sparse_moe.w1": "layers.{}.block_sparse_moe.cond_ffn.w1",
+      "layers.{}.block_sparse_moe.w2": "layers.{}.block_sparse_moe.cond_ffn.w2",
+      "layers.{}.block_sparse_moe.w3": "layers.{}.block_sparse_moe.cond_ffn.w3",
+  }
+  for key in list(state_dict.keys()):
+    if state_dict[key].dtype.is_complex and _OUTPUT_SAFETENSORS.value:
+      assert (
+          key == "freqs_cis"
+      ), "Only expect key 'freqs_cis' in the state_dict has complex dtype."
+      # Remove "freqs_cis" since it has complex dtype, and safetensor doesn't support it.
+      # The "freqs_cis" will be reconstructed when it's loaded by inference engine.
+      state_dict.pop(key)
+      continue
+    prefix_to_remove = "model."
+    new_key = key
+    if key.startswith(prefix_to_remove):
+      new_key = new_key.removeprefix(prefix_to_remove)
+
+    if "layers" in key:
+      abstract_key = re.sub(r".(\d+).", ".{}.", key)
+      layer_num = re.search(r"\d+", key).group(0)
+      new_key = weight_map.get(abstract_key)
+      if new_key is None:
+        continue
+      new_key = new_key.format(layer_num)
+
+    if new_key == key:
+      continue
+
+    if "w1" in key or "w3" in key:
+      state_dict[new_key] = (
+          state_dict.pop(key)
+          .reshape(
+              config["num_local_experts"],
+              config["intermediate_size"],
+              config["hidden_size"],
+          )
+          .contiguous()
+      )
+    elif "w2" in key:
+      state_dict[new_key] = (
+          state_dict.pop(key)
+          .reshape(
+              config["num_local_experts"],
+              config["intermediate_size"],
+              config["hidden_size"],
+          )
+          .permute(0, 2, 1)
+          .contiguous()
+      )
+    elif "gate" in key:
+      state_dict[new_key] = state_dict.pop(key).contiguous()
+    else:
+      state_dict[new_key] = state_dict.pop(key)
+  for k, v in state_dict.items():
+    if "layers" in k and "layers.0" not in k:
+      continue
+    print(f"The converted key: {k} and value: {v.shape} {v.dtype}")
+  return state_dict, config
+
+
 def main(argv) -> None:
   """merge weights"""
 
@@ -473,6 +565,14 @@ def main(argv) -> None:
     quantize_embedding_weight_map = (
         gemma_model.GemmaModel.get_quantized_embedding_weight_to_scaler_map()
     )
+  elif FLAGS.model_name == "mixtral":
+    state_dict, params = _get_mixtral_state_dict(_INPUT_CHECKPOINT_DIR.value)
+    quantize_linear_weight_map = (
+        mixtral_model.Transformer.get_quantized_linear_weight_to_scaler_map()
+    )
+    quantize_embedding_weight_map = (
+        mixtral_model.Transformer.get_quantized_embedding_weight_to_scaler_map()
+    )
   else:
     state_dict, params = _get_llama_state_dict(_INPUT_CHECKPOINT_DIR.value)
     quantize_linear_weight_map = (

diff --git a/default_shardings/mixtral.yaml b/default_shardings/mixtral.yaml
@@ -0,0 +1,32 @@
+
+# Sharding config for mixtral
+# Sharding should either be an int between 0 and rank - 1
+# signifying the axis to shard or -1 / null signifying replicated
+
+
+freqs_cis : -1 #  torch.complex64 (2048, 64)
+tok_embeddings.weight : 1 #  torch.float32 (vocab_size, 4096)
+tok_embeddings.weight_scaler : 0 #  torch.bfloat16 (4096,)
+layers.*.attention.wo.weight : 1 #  torch.int8 (4096, 4096)
+layers.*.attention.wo.weight_scaler : -1 #  torch.bfloat16 (4096,)
+layers.*.attention.wq.weight : 0 #  torch.int8 (4096, 4096)
+layers.*.attention.wq.weight_scaler : 0 #  torch.bfloat16 (4096,)
+layers.*.attention.wk.weight : 0 #  torch.int8 (4096, 4096)
+layers.*.attention.wk.weight_scaler : 0 #  torch.bfloat16 (4096,)
+layers.*.attention.wv.weight : 0 #  torch.int8 (4096, 4096)
+layers.*.attention.wv.weight_scaler : 0 #  torch.bfloat16 (4096,)
+layers.*.attention.wqkv.weight : 0 #  torch.int8 (4096, 4096)
+layers.*.attention.wqkv.weight_scaler : 0 #  torch.bfloat16 (4096,)
+layers.*.block_sparse_moe.gate.weight: -1
+layers.*.block_sparse_moe.gate.weight_scaler: -1
+layers.*.block_sparse_moe.cond_ffn.w1: 1
+layers.*.block_sparse_moe.cond_ffn.w1_scaler: 1
+layers.*.block_sparse_moe.cond_ffn.w2: 2
+layers.*.block_sparse_moe.cond_ffn.w2_scaler: -1
+layers.*.block_sparse_moe.cond_ffn.w3: 1
+layers.*.block_sparse_moe.cond_ffn.w3_scaler: 1
+layers.*.ffn_norm.weight : -1 #  torch.float32 (4096,)
+layers.*.attention_norm.weight : -1 #  torch.float32 (4096,)
+norm.weight : -1 #  torch.float32 (4096,)
+output.weight : 0 #  torch.float32 (vocab_size, 4096)
+output.weight_scaler : 0 #  torch.float32 (4096,)
diff --git a/jetstream_pt/config.py b/jetstream_pt/config.py
@@ -108,7 +108,13 @@ def create_engine_from_config_flags():
   sharding_file_name = FLAGS.sharding_config
   if not sharding_file_name:
     sharding_file_name = (
-        "llama" if FLAGS.model_name.startswith("llama") else "gemma"
+        "llama"
+        if FLAGS.model_name.startswith("llama")
+        else "gemma"
+        if FLAGS.model_name.startswith("gemma")
+        else "mixtral"
+        if FLAGS.model_name.startswith("mixtral")
+        else None
     )
     if (
         quant_config.enable_weight_quantization

diff --git a/jetstream_pt/engine.py b/jetstream_pt/engine.py
@@ -37,6 +37,7 @@
 from jetstream_pt.environment import JetEngineEnvironment, JetEngineEnvironmentData, QuantizationConfig
 from jetstream_pt.third_party.llama import model_exportable as llama_model, model_args
 from jetstream_pt.third_party.gemma import config as gemma_config, model as gemma_model
+from jetstream_pt.third_party.mixtral import config as mixtral_config, model as mixtral_model
 
 
 Mesh = jax.sharding.Mesh
@@ -359,7 +360,6 @@ def _insert_wrap(
 
     start_insert = decode_state.current_position - prefix.seq_len
     tokens = decode_state.tokens.at[slot].set(prefix.token)
-
     start_insert = start_insert % self.env.cache_sequence_length
     # pos < 0
     update_indexes = (
@@ -641,12 +641,17 @@ def _load_from_safetensors(self, path):
   def _load_from_state_dict(self, path):
     state_dict = torch.load(path, map_location=torch.device("cpu"))
     weights = {}
+    print(f"Loaded keys are : {state_dict.keys()}")
     for key, model_weights in self.pt_model.state_dict().items():
+      if key == "freqs_cis":
+        continue
       assert key in state_dict, f"key: {key} not found"
-      weights[key] = torchjax.from_torch(state_dict[key])
+      weights[key] = torch_xla2.tensor.t2j(state_dict[key])
       assert tuple(model_weights.shape) == tuple(
           weights[key].shape
       ), f"key: {key} error: {model_weights.shape} != {weights[key].shape}"
+
+    weights["freqs_cis"] = torch_xla2.tensor.t2j(self.pt_model.freqs_cis)
     return weights
 
   # pylint: disable-next=all
@@ -760,7 +765,7 @@ def create_pytorch_engine(
 ) -> PyTorchEngine:
   """Returns: The pytorch engine."""
 
-  supported_models = ["llama-2", "llama-3", "gemma"]
+  supported_models = ["llama-2", "llama-3", "gemma", "mixtral"]
   if model_name not in supported_models:
     raise NotImplementedError(
         f"Model name should be one of{','.join(supported_models)}"
@@ -772,7 +777,6 @@ def create_pytorch_engine(
   jax.config.update("jax_traceback_filtering", "off")
   torch_dtype = torch.bfloat16 if bf16_enable else torch.float32
   torch.set_default_dtype(torch_dtype)
-
   checkpoint_format = ""
   checkpoint_path = ""
 
@@ -797,8 +801,14 @@ def create_pytorch_engine(
 
   pt_model = None
 
+  sharding_file_name = ""
   if not sharding_config:
-    sharding_file_name = "llama" if model_name.startswith("llama") else "gemma"
+    if model_name.startswith("llama"):
+      sharding_file_name = "llama"
+    elif model_name.startswith("gemma"):
+      sharding_file_name = "gemma"
+    elif model_name.startswith("mixtral"):
+      sharding_file_name = "mixtral"
     sharding_config = os.path.join(
         "default_shardings", sharding_file_name + ".yaml"
     )
@@ -851,6 +861,18 @@ def create_pytorch_engine(
     env = JetEngineEnvironment(env_data)
     print(f"Enviroment variables: {vars(env)}")
     pt_model = gemma_model.GemmaModel(args, env)
+  elif model_name == "mixtral":
+    args = mixtral_config.ModelArgs.from_name("Mixtral-8x7B-v0.1")
+    args.device = "meta"
+    env_data.cache_shape = (
+        batch_size,
+        args.n_local_heads,
+        max_cache_length,
+        args.dim // args.n_head,
+    )
+    env_data.num_layers = args.n_layer
+    env = JetEngineEnvironment(env_data)
+    pt_model = mixtral_model.Transformer(args, env)
   else:
     raise RuntimeError(f"Model with name {model_name} not found")
 

diff --git a/jetstream_pt/third_party/llama/model_exportable.py b/jetstream_pt/third_party/llama/model_exportable.py
@@ -200,10 +200,10 @@ def forward(
   ):
     """
     tokens: the input token for decoding
+    input_pos: the decoding position relative to the start, which is the length of the decoding results
     caches: kv caches
     mask: causal mask to filter the attention results
     start: the starting position for each slot
-    input_pos: the decoding position relative to the start, which is the length of the decoding results
     ragged_batch_index: precomputed batch index for ragged attention
     ragged_block_index: precomputed block index for ragged attention
     """

diff --git a/jetstream_pt/third_party/mixtral/__init__.py b/jetstream_pt/third_party/mixtral/__init__.py
diff --git a/jetstream_pt/third_party/mixtral/config.py b/jetstream_pt/third_party/mixtral/config.py
@@ -0,0 +1,78 @@
+# pylint: disable-all
+# # Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Mixtral model config
+import dataclasses
+from dataclasses import dataclass
+
+
+def find_multiple(n: int, k: int) -> int:
+  if n % k == 0:
+    return n
+  return n + k - (n % k)
+
+
+@dataclass
+class ModelArgs:
+  block_size: int = 2048
+  vocab_size: int = 32000
+  n_layer: int = 32
+  n_head: int = 32
+  dim: int = 4096
+  intermediate_size: int = None
+  n_local_heads: int = -1
+  head_dim: int = 64
+  rope_base: float = 10000
+  norm_eps: float = 1e-5
+  num_experts: int = 8
+  num_activated_experts: int = 2
+  device: str = "meta"
+
+  def __post_init__(self):
+    if self.n_local_heads == -1:
+      self.n_local_heads = self.n_head
+    if self.intermediate_size is None:
+      hidden_dim = 4 * self.dim
+      n_hidden = int(2 * hidden_dim / 3)
+      self.intermediate_size = find_multiple(n_hidden, 256)
+    self.head_dim = self.dim // self.n_head
+
+  @classmethod
+  def from_name(cls, name: str):
+    if name in transformer_configs:
+      return cls(**transformer_configs[name])
+    # fuzzy search
+    config = [
+        config
+        for config in transformer_configs
+        if config in str(name).upper() or config in str(name)
+    ]
+    assert len(config) == 1, name
+    return cls(**transformer_configs[config[0]])
+
+
+transformer_configs = {
+    "Mixtral-8x7B-v0.1": dict(
+        block_size=32768,
+        n_layer=32,
+        n_head=32,
+        n_local_heads=8,
+        dim=4096,
+        intermediate_size=14336,
+        rope_base=1000000.0,
+        num_experts=8,
+        num_activated_experts=2,
+    ),
+}