Clean up QK and file and tensor types #678

Closed · wants to merge 2 commits

convert-ggml-to-pth.py: 7 additions, 9 deletions

@@ -7,7 +7,7 @@
import torch
from numba import njit
from tqdm.auto import tqdm

+ from ggml import *

def read_header(fin):
values = struct.unpack("i" * 9, fin.read(4 * 9))
@@ -37,9 +37,8 @@ def read_tokens(fin, vocab_size):

@njit
def dequantize_weights_numba(fin_data, n_rows, n_cols):
- qk = 32
+ qk = GGML_BLCK_SIZE[GGML_TYPE.Q4_0]
nb = n_cols // qk
- bs = 4 + (qk // 2)

weights = np.zeros((n_rows, n_cols), dtype=np.float32)
data_pos = 0
@@ -63,9 +62,7 @@ def dequantize_weights_numba(fin_data, n_rows, n_cols):


def dequantize_weights(fin, n_rows, n_cols):
- qk = 32
- nb = n_cols // qk
- data_size = n_rows * n_cols // 2 + n_rows * nb * 4
+ data_size = n_rows * n_cols // GGML_BLCK_SIZE[GGML_TYPE.Q4_0] * GGML_TYPE_SIZE[GGML_TYPE.Q4_0]
fin_data = fin.read(data_size)
return dequantize_weights_numba(fin_data, n_rows, n_cols)
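
The new one-line size expression is equivalent to the old two-term one: for Q4_0 each block of 32 elements is stored as a 4-byte float scale plus 32 packed 4-bit values, i.e. 4 + 32 // 2 = 20 bytes per block. A quick sanity check of the equivalence (assuming n_cols is a multiple of the block size; the shape below is only illustrative):

```python
# Old vs. new Q4_0 data-size formulas; both come to 20 bytes per 32-element block.
n_rows, n_cols = 4096, 4096          # illustrative shape, assumed divisible by 32
qk = 32                              # GGML_BLCK_SIZE for Q4_0
type_size = 4 + qk // 2              # GGML_TYPE_SIZE for Q4_0: fp32 scale + 16 bytes of nibbles
old = n_rows * n_cols // 2 + n_rows * (n_cols // qk) * 4
new = n_rows * n_cols // qk * type_size
assert old == new
```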

@@ -89,16 +86,16 @@ def read_variables(fin):
tensor_data_offset = (tensor_data_offset + 31) & -32
fin.seek(tensor_data_offset)

- if ftype_cur == 2:
+ if ftype_cur == GGML_FILE.Q4_0:
# 4-bit quantized weights
dtype = np.uint8
data = dequantize_weights(fin, shape[0], shape[1])
data = data.reshape(shape)
- elif ftype_cur == 0:
+ elif ftype_cur == GGML_FILE.F32:
dtype = np.float32
data_size = np.prod(shape)
data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
- elif ftype_cur == 1:
+ elif ftype_cur == GGML_FILE.F16:
dtype = np.float16
data_size = np.prod(shape)
data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
@@ -269,6 +266,7 @@ def main():

fin = open(ggml_files[0], "rb")
hparams, ftype = read_header(fin)
+ GGML_FILE(ftype) # raise ValueError on invalid file type
tokens = read_tokens(fin, hparams["vocab_size"])
model = read_variables(fin)


convert-gpt4all-to-ggml.py: 2 additions, 0 deletions

@@ -12,6 +12,7 @@
import struct
import sys
from sentencepiece import SentencePieceProcessor
+ from ggml import *

HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]

@@ -32,6 +33,7 @@ def write_header(f_out, header):

if magic != 0x67676d6c:

[Inline review thread]
Collaborator: Can you make the magic a constant too?
@sw (author, Apr 1, 2023): Good idea, though that's the old one before mmap. Someone ought to migrate the *-to-ggml.py scripts. (edit: #704 #545)
Collaborator: We need old magic constants anyway to detect older models.

raise Exception('Invalid file magic. Must be an old style ggml file.')
+ GGML_FILE(ftype) # raise ValueError on invalid file type

values = [
0x67676d66, # magic: ggml in hex
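
Following up on the review thread above, named magic constants might look like the sketch below; the names are illustrative, not part of this PR. Note that 0x67676d6c spells "ggml" in ASCII (the old-style input this script accepts), while 0x67676d66, the magic this script writes, spells "ggmf".

```python
# Hypothetical named magics (not part of this PR); the hex values are ASCII bytes.
GGML_MAGIC_GGML = 0x67676d6c  # b"ggml": old unversioned files accepted by this script
GGML_MAGIC_GGMF = 0x67676d66  # b"ggmf": magic written into the converted file

assert bytes.fromhex("67676d6c") == b"ggml"
assert bytes.fromhex("67676d66") == b"ggmf"
```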

convert-gptq-to-ggml.py: 2 additions, 1 deletion

@@ -9,6 +9,7 @@
import numpy as np
import torch
from sentencepiece import SentencePieceProcessor
+ from ggml import *

if len(sys.argv) != 4:
print("Usage: convert-gptq-to-ggml.py llamaXXb-4bit.pt tokenizer.model out.bin\n")
@@ -143,7 +144,7 @@ def convert_q4(src_name, dst_name, permute=False):
.reshape(blob.shape))

# header
- write_header(shape, dst_name, 3) # ftype = Q4_1
+ write_header(shape, dst_name, GGML_FILE.Q4_1)

# data
blob.tofile(fout)

convert-pth-to-ggml.py: 7 additions, 44 deletions

@@ -23,43 +23,7 @@
import torch

from sentencepiece import SentencePieceProcessor

- QK = 32
-
- GGML_TYPE_Q4_0 = 0
- GGML_TYPE_Q4_1 = 1
- GGML_TYPE_I8 = 2
- GGML_TYPE_I16 = 3
- GGML_TYPE_I32 = 4
- GGML_TYPE_F16 = 5
- GGML_TYPE_F32 = 6
-
- WTYPES = {
- 0: GGML_TYPE_F32,
- 1: GGML_TYPE_F16,
- 2: GGML_TYPE_Q4_0,
- 3: GGML_TYPE_Q4_1,
- }
-
- GGML_BLCK_SIZE = {
- GGML_TYPE_Q4_0: QK,
- GGML_TYPE_Q4_1: QK,
- GGML_TYPE_I8: 1,
- GGML_TYPE_I16: 1,
- GGML_TYPE_I32: 1,
- GGML_TYPE_F16: 1,
- GGML_TYPE_F32: 1,
- }
-
- GGML_TYPE_SIZE = {
- GGML_TYPE_Q4_0: 4 + QK//2,
- GGML_TYPE_Q4_1: 4*2 + QK//2,
- GGML_TYPE_I8: 1,
- GGML_TYPE_I16: 2,
- GGML_TYPE_I32: 4,
- GGML_TYPE_F16: 2,
- GGML_TYPE_F32: 4,
- }
+ from ggml import *

def ggml_nelements(shape):
r = 1
@@ -69,7 +33,7 @@ def ggml_nelements(shape):

def ggml_nbytes(shape, ftype):
x = ggml_nelements(shape)
- t = WTYPES[ftype]
+ t = ggml_type_from_ftype[ftype]
x *= GGML_TYPE_SIZE[t]
x //= GGML_BLCK_SIZE[t]
return x
@@ -155,8 +119,8 @@ def process_and_write_variables(fout, model, ftype, part_id, n_parts):
print(" Converting to float32")
data = data.astype(np.float32)
ftype_cur = 0
- blck_size = GGML_BLCK_SIZE[WTYPES[ftype_cur]]
- type_size = GGML_TYPE_SIZE[WTYPES[ftype_cur]]
+ blck_size = GGML_BLCK_SIZE[ggml_type_from_ftype[ftype_cur]]
+ type_size = GGML_TYPE_SIZE[ggml_type_from_ftype[ftype_cur]]

# determine dimension along which multipart tensor is sharded
#
@@ -199,7 +163,7 @@ def process_and_write_variables(fout, model, ftype, part_id, n_parts):

# ensure tensor data is aligned
tensor_data_offset = fout.tell()
- while tensor_data_offset % QK != 0:
+ while tensor_data_offset % 32 != 0:
fout.write(struct.pack("B", 0))
tensor_data_offset += 1
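
This loop pads the tensor data offset up to the next multiple of 32 bytes; with QK no longer defined in this script, the literal 32 is used instead. A small check (with a hypothetical align_up helper) that the loop agrees with the (offset + 31) & -32 idiom used in convert-ggml-to-pth.py:

```python
# Round an offset up to a 32-byte boundary and compare with the padding loop.
def align_up(offset: int, alignment: int = 32) -> int:
    return (offset + alignment - 1) & -alignment

for offset in range(256):
    padded = offset
    while padded % 32 != 0:   # same padding loop as in the script
        padded += 1
    assert padded == align_up(offset)
```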

@@ -234,8 +198,7 @@ def process_and_write_variables(fout, model, ftype, part_id, n_parts):
def main():
args = parse_args()
dir_model = args.dir_model
- ftype = args.ftype
- ftype_str = ["f32", "f16"]
+ ftype = GGML_FILE(args.ftype)
hparams, tokenizer = load_hparams_and_tokenizer(dir_model)

print(args)
@@ -252,7 +215,7 @@ def main():
return

n_parts = get_n_parts(hparams["dim"])
- fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin"
+ fname_out = f"{dir_model}/ggml-model-{ftype.name.lower()}.bin"

# we output a single file for ggml
with open(fname_out, "wb") as fout:
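
For reference, here is a minimal sketch of what the shared ggml module imported by these scripts presumably provides, reconstructed from the constant tables deleted from convert-pth-to-ggml.py and the names used in the new code (GGML_TYPE, GGML_FILE, GGML_BLCK_SIZE, GGML_TYPE_SIZE, ggml_type_from_ftype); the actual ggml.py in this PR may differ in detail.

```python
# ggml.py (sketch): shared constants for the conversion scripts.
# Reconstructed from the tables this PR removes from convert-pth-to-ggml.py;
# the real module may differ in names and layout.
from enum import IntEnum

QK = 32  # elements per quantization block


class GGML_TYPE(IntEnum):
    Q4_0 = 0
    Q4_1 = 1
    I8 = 2
    I16 = 3
    I32 = 4
    F16 = 5
    F32 = 6


class GGML_FILE(IntEnum):
    # ftype values stored in the model file header; constructing
    # GGML_FILE(ftype) raises ValueError for unknown values.
    F32 = 0
    F16 = 1
    Q4_0 = 2
    Q4_1 = 3


# file ftype -> tensor type (replaces the old WTYPES dict); IntEnum keys
# hash like plain ints, so ggml_type_from_ftype[0] still works.
ggml_type_from_ftype = {
    GGML_FILE.F32: GGML_TYPE.F32,
    GGML_FILE.F16: GGML_TYPE.F16,
    GGML_FILE.Q4_0: GGML_TYPE.Q4_0,
    GGML_FILE.Q4_1: GGML_TYPE.Q4_1,
}

# elements per block
GGML_BLCK_SIZE = {
    GGML_TYPE.Q4_0: QK,
    GGML_TYPE.Q4_1: QK,
    GGML_TYPE.I8: 1,
    GGML_TYPE.I16: 1,
    GGML_TYPE.I32: 1,
    GGML_TYPE.F16: 1,
    GGML_TYPE.F32: 1,
}

# bytes per block
GGML_TYPE_SIZE = {
    GGML_TYPE.Q4_0: 4 + QK // 2,      # fp32 scale + QK packed 4-bit values
    GGML_TYPE.Q4_1: 4 * 2 + QK // 2,  # two fp32 params + QK packed 4-bit values
    GGML_TYPE.I8: 1,
    GGML_TYPE.I16: 2,
    GGML_TYPE.I32: 4,
    GGML_TYPE.F16: 2,
    GGML_TYPE.F32: 4,
}
```

With GGML_FILE as an IntEnum, ftype.name.lower() yields the "f32"/"f16" filename suffix directly, which is what lets the new code drop the old ftype_str list.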