Clean up QK and file and tensor types #678

Closed · wants to merge 2 commits

convert-ggml-to-pth.py: 7 additions, 9 deletions

@@ -7,7 +7,7 @@
import torch
from numba import njit
from tqdm.auto import tqdm

+ from ggml import *

def read_header(fin):
values = struct.unpack("i" * 9, fin.read(4 * 9))
@@ -37,9 +37,8 @@ def read_tokens(fin, vocab_size):

@njit
def dequantize_weights_numba(fin_data, n_rows, n_cols):
- qk = 32
+ qk = GGML_BLCK_SIZE[GGML_TYPE.Q4_0]
nb = n_cols // qk
- bs = 4 + (qk // 2)

weights = np.zeros((n_rows, n_cols), dtype=np.float32)
data_pos = 0
@@ -63,9 +62,7 @@ def dequantize_weights_numba(fin_data, n_rows, n_cols):


def dequantize_weights(fin, n_rows, n_cols):
- qk = 32
- nb = n_cols // qk
- data_size = n_rows * n_cols // 2 + n_rows * nb * 4
+ data_size = n_rows * n_cols // GGML_BLCK_SIZE[GGML_TYPE.Q4_0] * GGML_TYPE_SIZE[GGML_TYPE.Q4_0]
fin_data = fin.read(data_size)
return dequantize_weights_numba(fin_data, n_rows, n_cols)
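
The new one-line size expression is equivalent to the old two-term one: for Q4_0 each block of 32 elements is stored as a 4-byte float scale plus 32 packed 4-bit values, i.e. 4 + 32 // 2 = 20 bytes per block. A quick sanity check of the equivalence (assuming n_cols is a multiple of the block size; the shape below is only illustrative):

```python
# Old vs. new Q4_0 data-size formulas; both come to 20 bytes per 32-element block.
n_rows, n_cols = 4096, 4096          # illustrative shape, assumed divisible by 32
qk = 32                              # GGML_BLCK_SIZE for Q4_0
type_size = 4 + qk // 2              # GGML_TYPE_SIZE for Q4_0: fp32 scale + 16 bytes of nibbles
old = n_rows * n_cols // 2 + n_rows * (n_cols // qk) * 4
new = n_rows * n_cols // qk * type_size
assert old == new
```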

@@ -89,16 +86,16 @@ def read_variables(fin):
tensor_data_offset = (tensor_data_offset + 31) & -32
fin.seek(tensor_data_offset)

- if ftype_cur == 2:
+ if ftype_cur == GGML_FILE.Q4_0:
# 4-bit quantized weights
dtype = np.uint8
data = dequantize_weights(fin, shape[0], shape[1])
data = data.reshape(shape)
- elif ftype_cur == 0:
+ elif ftype_cur == GGML_FILE.F32:
dtype = np.float32
data_size = np.prod(shape)
data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
- elif ftype_cur == 1:
+ elif ftype_cur == GGML_FILE.F16:
dtype = np.float16
data_size = np.prod(shape)
data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
@@ -269,6 +266,7 @@ def main():

fin = open(ggml_files[0], "rb")
hparams, ftype = read_header(fin)
+ GGML_FILE(ftype) # raise ValueError on invalid file type
tokens = read_tokens(fin, hparams["vocab_size"])
model = read_variables(fin)


convert-gpt4all-to-ggml.py: 2 additions, 0 deletions

@@ -12,6 +12,7 @@
import struct
import sys
from sentencepiece import SentencePieceProcessor
+ from ggml import *

HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]

@@ -32,6 +33,7 @@ def write_header(f_out, header):

if magic != 0x67676d6c:

[Inline review thread]
Collaborator: Can you make the magic a constant too?
@sw (author, Apr 1, 2023): Good idea, though that's the old one before mmap. Someone ought to migrate the *-to-ggml.py scripts. (edit: #704 #545)
Collaborator: We need old magic constants anyway to detect older models.

raise Exception('Invalid file magic. Must be an old style ggml file.')
+ GGML_FILE(ftype) # raise ValueError on invalid file type

values = [
0x67676d66, # magic: ggml in hex
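
Following up on the review thread above, named magic constants might look like the sketch below; the names are illustrative, not part of this PR. Note that 0x67676d6c spells "ggml" in ASCII (the old-style input this script accepts), while 0x67676d66, the magic this script writes, spells "ggmf".

```python
# Hypothetical named magics (not part of this PR); the hex values are ASCII bytes.
GGML_MAGIC_GGML = 0x67676d6c  # b"ggml": old unversioned files accepted by this script
GGML_MAGIC_GGMF = 0x67676d66  # b"ggmf": magic written into the converted file

assert bytes.fromhex("67676d6c") == b"ggml"
assert bytes.fromhex("67676d66") == b"ggmf"
```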

convert-gptq-to-ggml.py: 2 additions, 1 deletion

@@ -9,6 +9,7 @@
import numpy as np
import torch
from sentencepiece import SentencePieceProcessor
+ from ggml import *

if len(sys.argv) != 4:
print("Usage: convert-gptq-to-ggml.py llamaXXb-4bit.pt tokenizer.model out.bin\n")
@@ -143,7 +144,7 @@ def convert_q4(src_name, dst_name, permute=False):
.reshape(blob.shape))

# header
- write_header(shape, dst_name, 3) # ftype = Q4_1
+ write_header(shape, dst_name, GGML_FILE.Q4_1)

# data
blob.tofile(fout)

convert-pth-to-ggml.py: 7 additions, 44 deletions

@@ -23,43 +23,7 @@
import torch

from sentencepiece import SentencePieceProcessor

- QK = 32
-
- GGML_TYPE_Q4_0 = 0
- GGML_TYPE_Q4_1 = 1
- GGML_TYPE_I8 = 2
- GGML_TYPE_I16 = 3
- GGML_TYPE_I32 = 4
- GGML_TYPE_F16 = 5
- GGML_TYPE_F32 = 6
-
- WTYPES = {
- 0: GGML_TYPE_F32,
- 1: GGML_TYPE_F16,
- 2: GGML_TYPE_Q4_0,
- 3: GGML_TYPE_Q4_1,
- }
-
- GGML_BLCK_SIZE = {
- GGML_TYPE_Q4_0: QK,
- GGML_TYPE_Q4_1: QK,
- GGML_TYPE_I8: 1,
- GGML_TYPE_I16: 1,
- GGML_TYPE_I32: 1,
- GGML_TYPE_F16: 1,
- GGML_TYPE_F32: 1,
- }
-
- GGML_TYPE_SIZE = {
- GGML_TYPE_Q4_0: 4 + QK//2,
- GGML_TYPE_Q4_1: 4*2 + QK//2,
- GGML_TYPE_I8: 1,
- GGML_TYPE_I16: 2,
- GGML_TYPE_I32: 4,
- GGML_TYPE_F16: 2,
- GGML_TYPE_F32: 4,
- }
+ from ggml import *

def ggml_nelements(shape):
r = 1
@@ -69,7 +33,7 @@ def ggml_nelements(shape):

def ggml_nbytes(shape, ftype):
x = ggml_nelements(shape)
- t = WTYPES[ftype]
+ t = ggml_type_from_ftype[ftype]
x *= GGML_TYPE_SIZE[t]
x //= GGML_BLCK_SIZE[t]
return x
@@ -155,8 +119,8 @@ def process_and_write_variables(fout, model, ftype, part_id, n_parts):
print(" Converting to float32")
data = data.astype(np.float32)
ftype_cur = 0
- blck_size = GGML_BLCK_SIZE[WTYPES[ftype_cur]]
- type_size = GGML_TYPE_SIZE[WTYPES[ftype_cur]]
+ blck_size = GGML_BLCK_SIZE[ggml_type_from_ftype[ftype_cur]]
+ type_size = GGML_TYPE_SIZE[ggml_type_from_ftype[ftype_cur]]

# determine dimension along which multipart tensor is sharded
#
@@ -199,7 +163,7 @@ def process_and_write_variables(fout, model, ftype, part_id, n_parts):

# ensure tensor data is aligned
tensor_data_offset = fout.tell()
- while tensor_data_offset % QK != 0:
+ while tensor_data_offset % 32 != 0:
fout.write(struct.pack("B", 0))
tensor_data_offset += 1
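
This loop pads the tensor data offset up to the next multiple of 32 bytes; with QK no longer defined in this script, the literal 32 is used instead. A small check (with a hypothetical align_up helper) that the loop agrees with the (offset + 31) & -32 idiom used in convert-ggml-to-pth.py:

```python
# Round an offset up to a 32-byte boundary and compare with the padding loop.
def align_up(offset: int, alignment: int = 32) -> int:
    return (offset + alignment - 1) & -alignment

for offset in range(256):
    padded = offset
    while padded % 32 != 0:   # same padding loop as in the script
        padded += 1
    assert padded == align_up(offset)
```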

@@ -234,8 +198,7 @@ def process_and_write_variables(fout, model, ftype, part_id, n_parts):
def main():
args = parse_args()
dir_model = args.dir_model
- ftype = args.ftype
- ftype_str = ["f32", "f16"]
+ ftype = GGML_FILE(args.ftype)
hparams, tokenizer = load_hparams_and_tokenizer(dir_model)

print(args)
@@ -252,7 +215,7 @@ def main():
return

n_parts = get_n_parts(hparams["dim"])
- fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin"
+ fname_out = f"{dir_model}/ggml-model-{ftype.name.lower()}.bin"

# we output a single file for ggml
with open(fname_out, "wb") as fout:
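
For reference, here is a minimal sketch of what the shared ggml module imported by these scripts presumably provides, reconstructed from the constant tables deleted from convert-pth-to-ggml.py and the names used in the new code (GGML_TYPE, GGML_FILE, GGML_BLCK_SIZE, GGML_TYPE_SIZE, ggml_type_from_ftype); the actual ggml.py in this PR may differ in detail.

```python
# ggml.py (sketch): shared constants for the conversion scripts.
# Reconstructed from the tables this PR removes from convert-pth-to-ggml.py;
# the real module may differ in names and layout.
from enum import IntEnum

QK = 32  # elements per quantization block


class GGML_TYPE(IntEnum):
    Q4_0 = 0
    Q4_1 = 1
    I8 = 2
    I16 = 3
    I32 = 4
    F16 = 5
    F32 = 6


class GGML_FILE(IntEnum):
    # ftype values stored in the model file header; constructing
    # GGML_FILE(ftype) raises ValueError for unknown values.
    F32 = 0
    F16 = 1
    Q4_0 = 2
    Q4_1 = 3


# file ftype -> tensor type (replaces the old WTYPES dict); IntEnum keys
# hash like plain ints, so ggml_type_from_ftype[0] still works.
ggml_type_from_ftype = {
    GGML_FILE.F32: GGML_TYPE.F32,
    GGML_FILE.F16: GGML_TYPE.F16,
    GGML_FILE.Q4_0: GGML_TYPE.Q4_0,
    GGML_FILE.Q4_1: GGML_TYPE.Q4_1,
}

# elements per block
GGML_BLCK_SIZE = {
    GGML_TYPE.Q4_0: QK,
    GGML_TYPE.Q4_1: QK,
    GGML_TYPE.I8: 1,
    GGML_TYPE.I16: 1,
    GGML_TYPE.I32: 1,
    GGML_TYPE.F16: 1,
    GGML_TYPE.F32: 1,
}

# bytes per block
GGML_TYPE_SIZE = {
    GGML_TYPE.Q4_0: 4 + QK // 2,      # fp32 scale + QK packed 4-bit values
    GGML_TYPE.Q4_1: 4 * 2 + QK // 2,  # two fp32 params + QK packed 4-bit values
    GGML_TYPE.I8: 1,
    GGML_TYPE.I16: 2,
    GGML_TYPE.I32: 4,
    GGML_TYPE.F16: 2,
    GGML_TYPE.F32: 4,
}
```

With GGML_FILE as an IntEnum, ftype.name.lower() yields the "f32"/"f16" filename suffix directly, which is what lets the new code drop the old ftype_str list.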