From dea4157ee2639b892a2a996fbdef091df3314385 Mon Sep 17 00:00:00 2001
From: Heiner
Date: Fri, 3 May 2024 22:38:05 +0200
Subject: [PATCH] Don't split MoE weights.

As per https://github.com/ggerganov/llama.cpp/pull/7058#issuecomment-2092967508.

This helps avoid a memcpy when running.
---
 convert_grok.py | 35 +++++++++++++----------------------
 1 file changed, 13 insertions(+), 22 deletions(-)

diff --git a/convert_grok.py b/convert_grok.py
index 8096931f5f123..d196f7a7b6c2a 100644
--- a/convert_grok.py
+++ b/convert_grok.py
@@ -185,14 +185,14 @@ def maybe_quantize_tensor(tensor, ggml_type):
 
 
 def get_dtype_and_ggml_type(tensor, ggml_type):
-    if tensor.ndim == 2:
+    if tensor.ndim in (2, 3):
         if tensor.shape[1] % GGML_QK8_0 == 0:
             return np.int8, ggml_type
         else:
             return np.float16, gguf.GGMLQuantizationType.F16
     else:
         # 1d weight: convert it to float32
-        assert tensor.ndim == 1
+        assert tensor.ndim == 1, tensor
         return np.float32, gguf.GGMLQuantizationType.F32
 
 
@@ -236,15 +236,15 @@ def dump_state_dict(f, weight_names, model_files, ggml_type, config):
             cache.update(state_dict)
         tensor = cache.pop(key)
         _, tensor_ggml_type = get_dtype_and_ggml_type(tensor, ggml_type)
-        tensor = maybe_quantize_tensor(tensor, tensor_ggml_type)
+        array = maybe_quantize_tensor(tensor, tensor_ggml_type).numpy()
 
-        array = tensor.numpy()
         print(
-            f"dumping {key}: {tensor_ggml_type.name}/{array.dtype}, {array.shape}, {array.nbytes} bytes"
+            f"dumping {key}:",
+            f"{tensor_ggml_type.name}/{array.dtype}, {list(tensor.shape)}, {array.nbytes} bytes",
         )
         f.write_tensor_data(array)
 
-        tensor_info.append((key, tensor.shape, tensor_ggml_type.name))
+        tensor_info.append((key, list(tensor.shape), tensor_ggml_type.name))
 
     try:
         print(tabulate(tensor_info, headers=["name", "shape", "dtype"], tablefmt="psql"))
@@ -282,15 +282,10 @@ def convert_weight(tensor_name, weight, scales, experts, dtype=torch.float32, de
     if len(weight.shape) >= 2 and "token_embd" not in tensor_name:
         weight = weight.transpose(-1, -2)
 
-    if tensor_name.endswith("ffn_gate_inp.weight"):
+    if tensor_name.endswith("ffn_gate_inp.weight") or tensor_name.endswith("_exps.weight"):
         result[tensor_name] = weight[experts]  # gather.
     elif "experts" not in tensor_name:
         result[tensor_name] = weight
-    else:
-        # split moe
-        for i, expert in enumerate(experts):
-            key = tensor_name.replace("experts", str(i))
-            result[key] = weight[expert]
 
     return result
 
@@ -328,14 +323,10 @@ def extract_vocabulary_from_model(vocab):
 def get_weight_names(config):
     weight_names = ["token_embd.weight"]
     for i in range(config.num_hidden_layers):
-        for j in range(config.num_experts):
-            weight_names += [
-                f"blk.{i}.ffn_gate.{j}.weight",
-                f"blk.{i}.ffn_down.{j}.weight",
-                f"blk.{i}.ffn_up.{j}.weight",
-            ]
-
         weight_names += [
+            f"blk.{i}.ffn_gate_exps.weight",
+            f"blk.{i}.ffn_down_exps.weight",
+            f"blk.{i}.ffn_up_exps.weight",
             f"blk.{i}.attn_k.weight",
             f"blk.{i}.attn_output.weight",
             f"blk.{i}.attn_q.weight",
@@ -399,9 +390,9 @@ def ffn_size(emb_size, widening_factor):
     ]
     for i in range(config.num_hidden_layers):
         tensor_names += [
-            f"blk.{i}.ffn_gate.experts.weight",
-            f"blk.{i}.ffn_down.experts.weight",
-            f"blk.{i}.ffn_up.experts.weight",
+            f"blk.{i}.ffn_gate_exps.weight",
+            f"blk.{i}.ffn_down_exps.weight",
+            f"blk.{i}.ffn_up_exps.weight",
             f"blk.{i}.attn_k.weight",
            f"blk.{i}.attn_output.weight",
             f"blk.{i}.attn_q.weight",
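
Note (illustration only, not part of the patch): a minimal sketch of the layout
change, using toy dimensions; the names "stacked" and the shapes below are made
up for the example, while "experts" mirrors the converter's gather index. The
old code split each layer's stacked expert weight into num_experts separate 2D
tensors ("blk.N.ffn_up.J.weight"); keeping the gathered 3D tensor as a single
"blk.N.ffn_up_exps.weight" writes all experts as one contiguous block, so the
runtime does not have to copy the per-expert slices back together.

    import torch

    num_experts = 4
    # Toy stand-in for one layer's stacked MoE weight; Grok-1 has 8 experts
    # and much larger matrices.
    stacked = torch.randn(num_experts, 6, 8)
    experts = list(range(num_experts))  # expert order used by the converter

    # Old behavior: one 2D tensor per expert.
    split = {f"blk.0.ffn_up.{j}.weight": stacked[e] for j, e in enumerate(experts)}

    # New behavior: a single 3D tensor holding all experts ("gather").
    gathered = {"blk.0.ffn_up_exps.weight": stacked[experts]}

    assert gathered["blk.0.ffn_up_exps.weight"].shape == (num_experts, 6, 8)
    # Same data either way; only the on-disk layout differs.
    for j in range(num_experts):
        assert torch.equal(
            gathered["blk.0.ffn_up_exps.weight"][j],
            split[f"blk.0.ffn_up.{j}.weight"],
        )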