diff --git a/convert-ggml-to-pth.py b/convert-ggml-to-pth.py
index 7ddfe3a1b20a6..b2b2005c4d0a8 100644
--- a/convert-ggml-to-pth.py
+++ b/convert-ggml-to-pth.py
@@ -7,7 +7,7 @@
 import torch
 from numba import njit
 from tqdm.auto import tqdm
-
+from ggml import *

 def read_header(fin):
     values = struct.unpack("i" * 9, fin.read(4 * 9))
@@ -37,9 +37,8 @@ def read_tokens(fin, vocab_size):

 @njit
 def dequantize_weights_numba(fin_data, n_rows, n_cols):
-    qk = 32
+    qk = GGML_BLCK_SIZE[GGML_TYPE.Q4_0]
     nb = n_cols // qk
-    bs = 4 + (qk // 2)

     weights = np.zeros((n_rows, n_cols), dtype=np.float32)
     data_pos = 0
@@ -63,9 +62,7 @@ def dequantize_weights_numba(fin_data, n_rows, n_cols):


 def dequantize_weights(fin, n_rows, n_cols):
-    qk = 32
-    nb = n_cols // qk
-    data_size = n_rows * n_cols // 2 + n_rows * nb * 4
+    data_size = n_rows * n_cols // GGML_BLCK_SIZE[GGML_TYPE.Q4_0] * GGML_TYPE_SIZE[GGML_TYPE.Q4_0]
     fin_data = fin.read(data_size)
     return dequantize_weights_numba(fin_data, n_rows, n_cols)

@@ -89,16 +86,16 @@ def read_variables(fin):
         tensor_data_offset = (tensor_data_offset + 31) & -32
         fin.seek(tensor_data_offset)

-        if ftype_cur == 2:
+        if ftype_cur == GGML_FILE.Q4_0:
             # 4-bit quantized weights
             dtype = np.uint8
             data = dequantize_weights(fin, shape[0], shape[1])
             data = data.reshape(shape)
-        elif ftype_cur == 0:
+        elif ftype_cur == GGML_FILE.F32:
             dtype = np.float32
             data_size = np.prod(shape)
             data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
-        elif ftype_cur == 1:
+        elif ftype_cur == GGML_FILE.F16:
             dtype = np.float16
             data_size = np.prod(shape)
             data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
@@ -269,6 +266,7 @@ def main():

     fin = open(ggml_files[0], "rb")
     hparams, ftype = read_header(fin)
+    GGML_FILE(ftype)  # raise ValueError on invalid file type
     tokens = read_tokens(fin, hparams["vocab_size"])
     model = read_variables(fin)

diff --git a/convert-gpt4all-to-ggml.py b/convert-gpt4all-to-ggml.py
index b1a5e0560e083..ee54d37498735 100644
--- a/convert-gpt4all-to-ggml.py
+++ b/convert-gpt4all-to-ggml.py
@@ -12,6 +12,7 @@
 import struct
 import sys
 from sentencepiece import SentencePieceProcessor
+from ggml import *

 HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]

@@ -32,6 +33,7 @@ def write_header(f_out, header):

     if magic != 0x67676d6c:
         raise Exception('Invalid file magic. Must be an old style ggml file.')
+    GGML_FILE(ftype)  # raise ValueError on invalid file type

     values = [
         0x67676d66,  # magic: ggml in hex

diff --git a/convert-gptq-to-ggml.py b/convert-gptq-to-ggml.py
index 42e99c2ff8d3b..8eda9206ac9e8 100644
--- a/convert-gptq-to-ggml.py
+++ b/convert-gptq-to-ggml.py
@@ -9,6 +9,7 @@
 import numpy as np
 import torch
 from sentencepiece import SentencePieceProcessor
+from ggml import *

 if len(sys.argv) != 4:
     print("Usage: convert-gptq-to-ggml.py llamaXXb-4bit.pt tokenizer.model out.bin\n")
@@ -143,7 +144,7 @@ def convert_q4(src_name, dst_name, permute=False):
             .reshape(blob.shape))

     # header
-    write_header(shape, dst_name, 3) # ftype = Q4_1
+    write_header(shape, dst_name, GGML_FILE.Q4_1)

     # data
     blob.tofile(fout)

diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py
index dcef2f6a32213..0df7b761fca57 100644
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -23,43 +23,7 @@
 import torch
 from sentencepiece import SentencePieceProcessor
-
-QK = 32
-
-GGML_TYPE_Q4_0 = 0
-GGML_TYPE_Q4_1 = 1
-GGML_TYPE_I8 = 2
-GGML_TYPE_I16 = 3
-GGML_TYPE_I32 = 4
-GGML_TYPE_F16 = 5
-GGML_TYPE_F32 = 6
-
-WTYPES = {
-    0: GGML_TYPE_F32,
-    1: GGML_TYPE_F16,
-    2: GGML_TYPE_Q4_0,
-    3: GGML_TYPE_Q4_1,
-}
-
-GGML_BLCK_SIZE = {
-    GGML_TYPE_Q4_0: QK,
-    GGML_TYPE_Q4_1: QK,
-    GGML_TYPE_I8: 1,
-    GGML_TYPE_I16: 1,
-    GGML_TYPE_I32: 1,
-    GGML_TYPE_F16: 1,
-    GGML_TYPE_F32: 1,
-}
-
-GGML_TYPE_SIZE = {
-    GGML_TYPE_Q4_0: 4 + QK//2,
-    GGML_TYPE_Q4_1: 4*2 + QK//2,
-    GGML_TYPE_I8: 1,
-    GGML_TYPE_I16: 2,
-    GGML_TYPE_I32: 4,
-    GGML_TYPE_F16: 2,
-    GGML_TYPE_F32: 4,
-}
+from ggml import *


 def ggml_nelements(shape):
     r = 1
@@ -69,7 +33,7 @@ def ggml_nelements(shape):

 def ggml_nbytes(shape, ftype):
     x = ggml_nelements(shape)
-    t = WTYPES[ftype]
+    t = ggml_type_from_ftype[ftype]
     x *= GGML_TYPE_SIZE[t]
     x //= GGML_BLCK_SIZE[t]
     return x
@@ -155,8 +119,8 @@ def process_and_write_variables(fout, model, ftype, part_id, n_parts):
                 print("  Converting to float32")
                 data = data.astype(np.float32)
                 ftype_cur = 0
-        blck_size = GGML_BLCK_SIZE[WTYPES[ftype_cur]]
-        type_size = GGML_TYPE_SIZE[WTYPES[ftype_cur]]
+        blck_size = GGML_BLCK_SIZE[ggml_type_from_ftype[ftype_cur]]
+        type_size = GGML_TYPE_SIZE[ggml_type_from_ftype[ftype_cur]]

         # determine dimension along which multipart tensor is sharded
         #
@@ -199,7 +163,7 @@ def process_and_write_variables(fout, model, ftype, part_id, n_parts):

         # ensure tensor data is aligned
         tensor_data_offset = fout.tell()
-        while tensor_data_offset % QK != 0:
+        while tensor_data_offset % 32 != 0:
             fout.write(struct.pack("B", 0))
             tensor_data_offset += 1

@@ -234,8 +198,7 @@ def process_and_write_variables(fout, model, ftype, part_id, n_parts):
 def main():
     args = parse_args()
     dir_model = args.dir_model
-    ftype = args.ftype
-    ftype_str = ["f32", "f16"]
+    ftype = GGML_FILE(args.ftype)

     hparams, tokenizer = load_hparams_and_tokenizer(dir_model)
     print(args)
@@ -252,7 +215,7 @@ def main():
         return

     n_parts = get_n_parts(hparams["dim"])
-    fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin"
+    fname_out = f"{dir_model}/ggml-model-{ftype.name.lower()}.bin"

     # we output a single file for ggml
     with open(fname_out, "wb") as fout:
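Note on the converter changes above: each script now derives tensor byte sizes from the shared tables in the ggml.py module introduced later in this diff, instead of hard-coding qk = 32 and the "4 + qk // 2" block layout. A minimal sketch of the shared arithmetic, assuming a hypothetical 4096x4096 Q4_0 tensor (the shape is illustrative, not from this change):

    from ggml import (GGML_FILE, GGML_BLCK_SIZE, GGML_TYPE_SIZE,
                      ggml_type_from_ftype)

    t = ggml_type_from_ftype[GGML_FILE.Q4_0]   # GGML_TYPE.Q4_0
    nelements = 4096 * 4096
    # each 32-element block stores one float delta + 16 nibble bytes = 20 bytes
    nbytes = nelements * GGML_TYPE_SIZE[t] // GGML_BLCK_SIZE[t]  # 10485760
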
diff --git a/ggml.c b/ggml.c
index b6dd3f3cf74e3..6dd93b6302198 100644
--- a/ggml.c
+++ b/ggml.c
@@ -423,8 +423,6 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 // quantization
 //

-#define QK 32
-
 // AVX routines provided by GH user Const-me
 // ref: https://github.com/ggerganov/ggml/pull/27#issuecomment-1464934600
 #if __AVX2__ || __AVX512F__
@@ -499,34 +497,36 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )

 // method 5
 // blocks of QK elements
 // represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors)
+#define QK_4_0 32
 typedef struct {
     float   d;          // delta
-    uint8_t qs[QK / 2]; // nibbles / quants
+    uint8_t qs[QK_4_0 / 2]; // nibbles / quants
 } block_q4_0;
-static_assert(sizeof(block_q4_0) == sizeof(float) + QK / 2, "wrong q4_0 block size/padding");
+static_assert(sizeof(block_q4_0) == sizeof(float) + QK_4_0 / 2, "wrong q4_0 block size/padding");

 // method 4
 // blocks of QK elements
 // represented with 2 floats (delta + min) and QK/2 8-bit ints (i.e QK 4-bit unsigned integer factors)
+#define QK_4_1 32
 typedef struct {
     float   d;
     float   m;
-    uint8_t qs[QK / 2]; // nibbles / quants
+    uint8_t qs[QK_4_1 / 2]; // nibbles / quants
 } block_q4_1;
-static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK / 2, "wrong q4_1 block size/padding");
+static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK_4_1 / 2, "wrong q4_1 block size/padding");

 // reference implementation for deterministic creation of model files
 static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
-    assert(k % QK == 0);
-    const int nb = k / QK;
+    assert(k % QK_4_0 == 0);
+    const int nb = k / QK_4_0;

-    uint8_t pp[QK/2];
+    uint8_t pp[QK_4_0/2];

     for (int i = 0; i < nb; i++) {
         float amax = 0.0f; // absolute max

-        for (int l = 0; l < QK; l++) {
-            const float v = x[i*QK + l];
+        for (int l = 0; l < QK_4_0; l++) {
+            const float v = x[i*QK_4_0 + l];
             amax = MAX(amax, fabsf(v));
         }
@@ -535,9 +535,9 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r

         y[i].d = d;

-        for (int l = 0; l < QK; l += 2) {
-            const float v0 = x[i*QK + l + 0]*id;
-            const float v1 = x[i*QK + l + 1]*id;
+        for (int l = 0; l < QK_4_0; l += 2) {
+            const float v0 = x[i*QK_4_0 + l + 0]*id;
+            const float v1 = x[i*QK_4_0 + l + 1]*id;

             const uint8_t vi0 = (int8_t)roundf(v0) + 8;
             const uint8_t vi1 = (int8_t)roundf(v1) + 8;
@@ -553,8 +553,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
 }

 static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK == 0);
-    const int nb = k / QK;
+    assert(k % QK_4_0 == 0);
+    const int nb = k / QK_4_0;

     block_q4_0 * restrict y = vy;
@@ -807,19 +807,19 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
 }

 static void quantize_row_q4_1_reference(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK == 0);
-    const int nb = k / QK;
+    assert(k % QK_4_1 == 0);
+    const int nb = k / QK_4_1;

     block_q4_1 * restrict y = vy;

-    uint8_t pp[QK/2];
+    uint8_t pp[QK_4_1/2];

     for (int i = 0; i < nb; i++) {
         float min = FLT_MAX;
         float max = -FLT_MAX;

-        for (int l = 0; l < QK; l++) {
-            const float v = x[i*QK + l];
+        for (int l = 0; l < QK_4_1; l++) {
+            const float v = x[i*QK_4_1 + l];
             if (v < min) min = v;
             if (v > max) max = v;
         }
@@ -830,9 +830,9 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric
         y[i].d = d;
         y[i].m = min;

-        for (int l = 0; l < QK; l += 2) {
-            const float v0 = (x[i*QK + l + 0] - min)*id;
-            const float v1 = (x[i*QK + l + 1] - min)*id;
+        for (int l = 0; l < QK_4_1; l += 2) {
+            const float v0 = (x[i*QK_4_1 + l + 0] - min)*id;
+            const float v1 = (x[i*QK_4_1 + l + 1] - min)*id;

             const uint8_t vi0 = roundf(v0);
             const uint8_t vi1 = roundf(v1);
@@ -848,9 +848,9 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric
 }

 static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK == 0);
+    assert(k % QK_4_1 == 0);

-    const int nb = k / QK;
+    const int nb = k / QK_4_1;

     block_q4_1 * restrict y = vy;
@@ -970,8 +970,8 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int
 }

 static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, int k) {
-    assert(k % QK == 0);
-    const int nb = k / QK;
+    assert(k % QK_4_0 == 0);
+    const int nb = k / QK_4_0;

     const block_q4_0 * restrict x = vx;
@@ -982,7 +982,7 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in

         const uint8_t * restrict pp = x[i].qs;

-        for (int l = 0; l < QK; l += 32) {
+        for (int l = 0; l < QK_4_0; l += 32) { // loop is done once, keep for easy experimenting with QK
             // Load 32x4-bit integers into 32x8-bit integers
             __m256i vx8 = bytesFromNibbles(pp+l/2);
@@ -1004,7 +1004,7 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in
             // Scale and store
             for (int j = 0; j < 4; j++) {
                 const __m256 result = _mm256_mul_ps(vf[j], d_v);
-                _mm256_storeu_ps(y + i * QK + l + j*8, result);
+                _mm256_storeu_ps(y + i * QK_4_0 + l + j*8, result);
             }
         }
     }
@@ -1014,7 +1014,7 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in

         const uint8_t * restrict pp = x[i].qs;

-        for (int l = 0; l < QK; l += 16) {
+        for (int l = 0; l < QK_4_0; l += 16) {
             // Load 16x4-bit integers into 8x8-bit integers
             const uint8x8_t v8 = vld1_u8(pp + l/2);
@@ -1053,10 +1053,10 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in
             const float32x4_t r3 = vmulq_f32(vf_3, vd);

             // Store
-            vst1q_f32(y + i*QK + l +  0, r0);
-            vst1q_f32(y + i*QK + l +  4, r1);
-            vst1q_f32(y + i*QK + l +  8, r2);
-            vst1q_f32(y + i*QK + l + 12, r3);
+            vst1q_f32(y + i*QK_4_0 + l +  0, r0);
+            vst1q_f32(y + i*QK_4_0 + l +  4, r1);
+            vst1q_f32(y + i*QK_4_0 + l +  8, r2);
+            vst1q_f32(y + i*QK_4_0 + l + 12, r3);
         }
     }
 #else
@@ -1066,7 +1066,7 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in

         const uint8_t * restrict pp = x[i].qs;

-        for (int l = 0; l < QK; l += 2) {
+        for (int l = 0; l < QK_4_0; l += 2) {
             const uint8_t vi = pp[l/2];

             const int8_t vi0 = vi & 0xf;
@@ -1077,19 +1077,19 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in

             //printf("d = %f, vi = %d, vi0 = %d, vi1 = %d, v0 = %f, v1 = %f\n", d, vi, vi0, vi1, v0, v1);

-            y[i*QK + l + 0] = v0;
-            y[i*QK + l + 1] = v1;
+            y[i*QK_4_0 + l + 0] = v0;
+            y[i*QK_4_0 + l + 1] = v1;

-            assert(!isnan(y[i*QK + l + 0]));
-            assert(!isnan(y[i*QK + l + 1]));
+            assert(!isnan(y[i*QK_4_0 + l + 0]));
+            assert(!isnan(y[i*QK_4_0 + l + 1]));
         }
     }
 #endif
 }

 static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, int k) {
-    assert(k % QK == 0);
-    const int nb = k / QK;
+    assert(k % QK_4_1 == 0);
+    const int nb = k / QK_4_1;

     const block_q4_1 * restrict x = vx;
@@ -1100,7 +1100,7 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in

         const uint8_t * restrict pp = x[i].qs;

-        for (int l = 0; l < QK; l += 32) {
+        for (int l = 0; l < QK_4_1; l += 32) { // loop is done once, keep for easy experimenting with QK
             // Load 32x4-bit integers into 32x8-bit integers
             __m256i vx8 = bytesFromNibbles(pp+l/2);
@@ -1119,7 +1119,7 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in
             // Scale, add m and store
             for (int j = 0; j < 4; j++) {
                 const __m256 result = _mm256_add_ps(_mm256_mul_ps(vf[j], d_v), d_m);
-                _mm256_storeu_ps(y + i * QK + l + j*8, result);
+                _mm256_storeu_ps(y + i * QK_4_1 + l + j*8, result);
             }
         }
     }
@@ -1130,7 +1130,7 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in

         const uint8_t * restrict pp = x[i].qs;

-        for (int l = 0; l < QK; l += 16) {
+        for (int l = 0; l < QK_4_1; l += 16) {
             // Load 16x4-bit integers into 8x8-bit integers
             const uint8x8_t v8 = vld1_u8(pp + l/2);
@@ -1161,10 +1161,10 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in
             const float32x4_t r3 = vmlaq_f32(vm, vf_3, vd);

             // Store
-            vst1q_f32(y + i*QK + l +  0, r0);
-            vst1q_f32(y + i*QK + l +  4, r1);
-            vst1q_f32(y + i*QK + l +  8, r2);
-            vst1q_f32(y + i*QK + l + 12, r3);
+            vst1q_f32(y + i*QK_4_1 + l +  0, r0);
+            vst1q_f32(y + i*QK_4_1 + l +  4, r1);
+            vst1q_f32(y + i*QK_4_1 + l +  8, r2);
+            vst1q_f32(y + i*QK_4_1 + l + 12, r3);
         }
     }
 #else
@@ -1174,7 +1174,7 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in

         const uint8_t * restrict pp = x[i].qs;

-        for (int l = 0; l < QK; l += 2) {
+        for (int l = 0; l < QK_4_1; l += 2) {
             const uint8_t vi = pp[l/2];

             const int8_t vi0 = vi & 0xf;
@@ -1183,11 +1183,11 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in
             const float v0 = vi0*d + m;
             const float v1 = vi1*d + m;

-            y[i*QK + l + 0] = v0;
-            y[i*QK + l + 1] = v1;
+            y[i*QK_4_1 + l + 0] = v0;
+            y[i*QK_4_1 + l + 1] = v1;

-            assert(!isnan(y[i*QK + l + 0]));
-            assert(!isnan(y[i*QK + l + 1]));
+            assert(!isnan(y[i*QK_4_1 + l + 0]));
+            assert(!isnan(y[i*QK_4_1 + l + 1]));
         }
     }
 #endif
@@ -1757,7 +1757,7 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float
     *s = sumf;
 }

-#if __AVX512F__ && QK == 32
+#if __AVX512F__ && QK_4_0 == 32
 static inline __m512 dot_q4_0_oneblock_avx512(
     __m512 acc,
     const block_q4_0 * restrict x,
@@ -1825,9 +1825,9 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
 }

 static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    const int nb = n / QK;
+    const int nb = n / QK_4_0;

-    assert(n % QK == 0);
+    assert(n % QK_4_0 == 0);
     assert(nb % 2 == 0);

     const block_q4_0 * restrict x = vx;
@@ -2140,7 +2140,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
         const uint8_t * restrict p0 = x[i].qs;
         const uint8_t * restrict p1 = y[i].qs;

-        for (int j = 0; j < QK/2; j++) {
+        for (int j = 0; j < QK_4_0/2; j++) {
             const uint8_t v0 = p0[j];
             const uint8_t v1 = p1[j];
@@ -2159,7 +2159,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
 }

 static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    const int nb = n / QK;
+    const int nb = n / QK_4_1;

     const block_q4_1 * restrict x = vx;
     const block_q4_1 * restrict y = vy;
@@ -2236,7 +2236,7 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
     res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
     res = _mm_add_ss( res, _mm_movehdup_ps( res ) );

-    sumf = _mm_cvtss_f32( res ) + acc_offset * QK;
+    sumf = _mm_cvtss_f32( res ) + acc_offset * QK_4_1;
 #elif defined(__ARM_NEON)
     float sum00 = 0.0f;
     float sum01 = 0.0f;
@@ -2275,7 +2275,7 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
         sum11 += x0->d*y0->d*vaddvq_u16(vaddq_u16(pl0, ph0));
     }

-    sumf = QK*sum00 + sum01 + sum10 + sum11;
+    sumf = QK_4_1*sum00 + sum01 + sum10 + sum11;
 #else
     // scalar
     for (int i = 0; i < nb; i++) {
@@ -2288,7 +2288,7 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
         const uint8_t * restrict p0 = x[i].qs;
         const uint8_t * restrict p1 = y[i].qs;

-        for (int j = 0; j < QK/2; j++) {
+        for (int j = 0; j < QK_4_1/2; j++) {
             const uint8_t v0 = p0[j];
             const uint8_t v1 = p1[j];
@@ -2547,118 +2547,113 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x
 // data types
 //

-static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
-    QK,
-    QK,
-    1,
-    1,
-    1,
-    1,
-    1,
+static const int GGML_BLCK_SIZE[] = {
+    [GGML_TYPE_Q4_0] = QK_4_0,
+    [GGML_TYPE_Q4_1] = QK_4_1,
+    [GGML_TYPE_I8] = 1,
+    [GGML_TYPE_I16] = 1,
+    [GGML_TYPE_I32] = 1,
+    [GGML_TYPE_F16] = 1,
+    [GGML_TYPE_F32] = 1,
 };
-
-static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 5");
-
-static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
-    sizeof(block_q4_0),
-    sizeof(block_q4_1),
-    sizeof(int8_t ),
-    sizeof(int16_t),
-    sizeof(int32_t),
-    sizeof(ggml_fp16_t),
-    sizeof(float ),
+static_assert(sizeof(GGML_BLCK_SIZE)/sizeof(*GGML_BLCK_SIZE) == GGML_TYPE_COUNT, "GGML_BLCK_SIZE incomplete");
+
+static const size_t GGML_TYPE_SIZE[] = {
+    [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
+    [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
+    [GGML_TYPE_I8] = sizeof(int8_t),
+    [GGML_TYPE_I16] = sizeof(int16_t),
+    [GGML_TYPE_I32] = sizeof(int32_t),
+    [GGML_TYPE_F16] = sizeof(ggml_fp16_t),
+    [GGML_TYPE_F32] = sizeof(float),
 };
-
-// don't forget to update the array above when adding new types
-static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 5");
-
-static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
-    "NONE",
-
-    "DUP",
-    "ADD",
-    "SUB",
-    "MUL",
-    "DIV",
-    "SQR",
-    "SQRT",
-    "SUM",
-    "MEAN",
-    "REPEAT",
-    "ABS",
-    "SGN",
-    "NEG",
-    "STEP",
-    "RELU",
-    "GELU",
-    "SILU",
-    "NORM",
-    "RMS_NORM",
-
-    "MUL_MAT",
-
-    "SCALE",
-    "CPY",
-    "RESHAPE",
-    "VIEW",
-    "PERMUTE",
-    "TRANSPOSE",
-    "GET_ROWS",
-    "DIAG_MASK_INF",
-    "SOFT_MAX",
-    "ROPE",
-    "CONV_1D_1S",
-    "CONV_1D_2S",
-
-    "FLASH_ATTN",
-    "FLASH_FF",
+static_assert(sizeof(GGML_TYPE_SIZE)/sizeof(*GGML_TYPE_SIZE) == GGML_TYPE_COUNT, "GGML_TYPE_SIZE incomplete");
+
+static const char * GGML_OP_LABEL[] = {
+    [GGML_OP_NONE] = "NONE",
+
+    [GGML_OP_DUP] = "DUP",
+    [GGML_OP_ADD] = "ADD",
+    [GGML_OP_SUB] = "SUB",
+    [GGML_OP_MUL] = "MUL",
+    [GGML_OP_DIV] = "DIV",
+    [GGML_OP_SQR] = "SQR",
+    [GGML_OP_SQRT] = "SQRT",
+    [GGML_OP_SUM] = "SUM",
+    [GGML_OP_MEAN] = "MEAN",
+    [GGML_OP_REPEAT] = "REPEAT",
+    [GGML_OP_ABS] = "ABS",
+    [GGML_OP_SGN] = "SGN",
+    [GGML_OP_NEG] = "NEG",
+    [GGML_OP_STEP] = "STEP",
+    [GGML_OP_RELU] = "RELU",
+    [GGML_OP_GELU] = "GELU",
+    [GGML_OP_SILU] = "SILU",
+    [GGML_OP_NORM] = "NORM",
+    [GGML_OP_RMS_NORM] = "RMS_NORM",
+
+    [GGML_OP_MUL_MAT] = "MUL_MAT",
+
+    [GGML_OP_SCALE] = "SCALE",
+    [GGML_OP_CPY] = "CPY",
+    [GGML_OP_RESHAPE] = "RESHAPE",
+    [GGML_OP_VIEW] = "VIEW",
+    [GGML_OP_PERMUTE] = "PERMUTE",
+    [GGML_OP_TRANSPOSE] = "TRANSPOSE",
+    [GGML_OP_GET_ROWS] = "GET_ROWS",
+    [GGML_OP_DIAG_MASK_INF] = "DIAG_MASK_INF",
+    [GGML_OP_SOFT_MAX] = "SOFT_MAX",
+    [GGML_OP_ROPE] = "ROPE",
+    [GGML_OP_CONV_1D_1S] = "CONV_1D_1S",
+    [GGML_OP_CONV_1D_2S] = "CONV_1D_2S",
+
+    [GGML_OP_FLASH_ATTN] = "FLASH_ATTN",
+    [GGML_OP_FLASH_FF] = "FLASH_FF",
 };
-
-static_assert(GGML_OP_COUNT == 35, "GGML_OP_COUNT != 35");
-
-static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
-    "none",
-
-    "x",
-    "x+y",
-    "x-y",
-    "x*y",
-    "x/y",
-    "x^2",
-    "√x",
-    "Σx",
-    "Σx/n",
-    "repeat(x)",
-    "abs(x)",
-    "sgn(x)",
-    "-x",
-    "step(x)",
-    "relu(x)",
-    "gelu(x)",
-    "silu(x)",
-    "norm(x)",
-    "rms_norm(x)",
-
-    "X*Y",
-
-    "x*v",
-    "x-\\>y",
-    "reshape(x)",
-    "view(x)",
-    "permute(x)",
-    "transpose(x)",
-    "get_rows(x)",
-    "diag_mask_inf(x)",
-    "soft_max(x)",
-    "rope(x)",
-    "conv_1d_1s(x)",
-    "conv_1d_2s(x)",
-
-    "flash_attn(x)",
-    "flash_ff(x)",
+static_assert(sizeof(GGML_OP_LABEL)/sizeof(*GGML_OP_LABEL) == GGML_OP_COUNT, "GGML_OP_LABEL incomplete");
+
+static const char * GGML_OP_SYMBOL[] = {
+    [GGML_OP_NONE] = "none",
+
+    [GGML_OP_DUP] = "x",
+    [GGML_OP_ADD] = "x+y",
+    [GGML_OP_SUB] = "x-y",
+    [GGML_OP_MUL] = "x*y",
+    [GGML_OP_DIV] = "x/y",
+    [GGML_OP_SQR] = "x^2",
+    [GGML_OP_SQRT] = "√x",
+    [GGML_OP_SUM] = "Σx",
+    [GGML_OP_MEAN] = "Σx/n",
+    [GGML_OP_REPEAT] = "repeat(x)",
+    [GGML_OP_ABS] = "abs(x)",
+    [GGML_OP_SGN] = "sgn(x)",
+    [GGML_OP_NEG] = "-x",
+    [GGML_OP_STEP] = "step(x)",
+    [GGML_OP_RELU] = "relu(x)",
+    [GGML_OP_GELU] = "gelu(x)",
+    [GGML_OP_SILU] = "silu(x)",
+    [GGML_OP_NORM] = "norm(x)",
+    [GGML_OP_RMS_NORM] = "rms_norm(x)",
+
+    [GGML_OP_MUL_MAT] = "X*Y",
+
+    [GGML_OP_SCALE] = "x*v",
+    [GGML_OP_CPY] = "x-\\>y",
+    [GGML_OP_RESHAPE] = "reshape(x)",
+    [GGML_OP_VIEW] = "view(x)",
+    [GGML_OP_PERMUTE] = "permute(x)",
+    [GGML_OP_TRANSPOSE] = "transpose(x)",
+    [GGML_OP_GET_ROWS] = "get_rows(x)",
+    [GGML_OP_DIAG_MASK_INF] = "diag_mask_inf(x)",
+    [GGML_OP_SOFT_MAX] = "soft_max(x)",
+    [GGML_OP_ROPE] = "rope(x)",
+    [GGML_OP_CONV_1D_1S] = "conv_1d_1s(x)",
+    [GGML_OP_CONV_1D_2S] = "conv_1d_2s(x)",
+
+    [GGML_OP_FLASH_ATTN] = "flash_attn(x)",
+    [GGML_OP_FLASH_FF] = "flash_ff(x)",
 };
-
-static_assert(GGML_OP_COUNT == 35, "GGML_OP_COUNT != 35");
+static_assert(sizeof(GGML_OP_SYMBOL)/sizeof(*GGML_OP_SYMBOL) == GGML_OP_COUNT, "GGML_OP_SYMBOL incomplete");

 //
 // ggml object
@@ -6686,7 +6681,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
                 float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));

-                assert(ne00 % 32 == 0);
+                assert(ne00 % GGML_BLCK_SIZE[type] == 0);

                 for (int ic = 0; ic < ne11; ++ic) {
                     vec_dot_q(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
@@ -10496,16 +10491,16 @@ enum ggml_opt_result ggml_opt(
 ////////////////////////////////////////////////////////////////////////////////

 size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % QK == 0);
-    const int nb = k / QK;
+    assert(k % QK_4_0 == 0);
+    const int nb = k / QK_4_0;

     for (int j = 0; j < n; j += k) {
-        block_q4_0 * restrict y = (block_q4_0 *)dst + j/QK;
+        block_q4_0 * restrict y = (block_q4_0 *)dst + j/QK_4_0;

         quantize_row_q4_0_reference(src + j, y, k);

         for (int i = 0; i < nb; i++) {
-            for (int l = 0; l < QK; l += 2) {
+            for (int l = 0; l < QK_4_0; l += 2) {
                 const uint8_t vi0 = y[i].qs[l/2] & 0xF;
                 const uint8_t vi1 = y[i].qs[l/2] >> 4;

@@ -10515,20 +10510,20 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t *
         }
     }

-    return (n/QK*sizeof(block_q4_0));
+    return (n/QK_4_0*sizeof(block_q4_0));
 }

 size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % QK == 0);
-    const int nb = k / QK;
+    assert(k % QK_4_1 == 0);
+    const int nb = k / QK_4_1;

     for (int j = 0; j < n; j += k) {
-        block_q4_1 * restrict y = (block_q4_1 *)dst + j/QK;
+        block_q4_1 * restrict y = (block_q4_1 *)dst + j/QK_4_1;

         quantize_row_q4_1_reference(src + j, y, k);

         for (int i = 0; i < nb; i++) {
-            for (int l = 0; l < QK; l += 2) {
+            for (int l = 0; l < QK_4_1; l += 2) {
                 const uint8_t vi0 = y[i].qs[l/2] & 0xF;
                 const uint8_t vi1 = y[i].qs[l/2] >> 4;
@@ -10538,7 +10533,7 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
         }
     }

-    return (n/QK*sizeof(block_q4_1));
+    return (n/QK_4_1*sizeof(block_q4_1));
 }

 ////////////////////////////////////////////////////////////////////////////////
diff --git a/ggml.py b/ggml.py
new file mode 100644
index 0000000000000..ad9fd70edb433
--- /dev/null
+++ b/ggml.py
@@ -0,0 +1,50 @@
+from enum import IntEnum
+
+
+class GGML_TYPE(IntEnum):
+    """Tensor types, corresponding to enum ggml_type in ggml.h"""
+
+    Q4_0 = 0
+    Q4_1 = 1
+    I8 = 2
+    I16 = 3
+    I32 = 4
+    F16 = 5
+    F32 = 6
+
+
+class GGML_FILE(IntEnum):
+    """File types, corresponding to enum e_ftype in llama.cpp"""
+
+    F32 = 0
+    F16 = 1
+    Q4_0 = 2
+    Q4_1 = 3
+
+
+ggml_type_from_ftype = {
+    GGML_FILE.F32: GGML_TYPE.F32,
+    GGML_FILE.F16: GGML_TYPE.F16,
+    GGML_FILE.Q4_0: GGML_TYPE.Q4_0,
+    GGML_FILE.Q4_1: GGML_TYPE.Q4_1,
+}
+
+GGML_BLCK_SIZE = {
+    GGML_TYPE.Q4_0: 32,
+    GGML_TYPE.Q4_1: 32,
+    GGML_TYPE.I8: 1,
+    GGML_TYPE.I16: 1,
+    GGML_TYPE.I32: 1,
+    GGML_TYPE.F16: 1,
+    GGML_TYPE.F32: 1,
+}
+
+GGML_TYPE_SIZE = {
+    GGML_TYPE.Q4_0: 4 + GGML_BLCK_SIZE[GGML_TYPE.Q4_0] // 2,
+    GGML_TYPE.Q4_1: 4 * 2 + GGML_BLCK_SIZE[GGML_TYPE.Q4_1] // 2,
+    GGML_TYPE.I8: 1,
+    GGML_TYPE.I16: 2,
+    GGML_TYPE.I32: 4,
+    GGML_TYPE.F16: 2,
+    GGML_TYPE.F32: 4,
+}
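The new module is the single Python source of truth for these constants. A short sketch of how the converters above consume it (the out-of-range value 7 is illustrative, not from this diff):

    from ggml import GGML_FILE

    ftype = GGML_FILE(2)                            # GGML_FILE.Q4_0
    fname = f"ggml-model-{ftype.name.lower()}.bin"  # "ggml-model-q4_0.bin"
    try:
        GGML_FILE(7)                                # not a valid file type
    except ValueError:
        pass                                        # rejected up front
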
diff --git a/llama.cpp b/llama.cpp
index bed24207db776..d41656a90ac67 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -54,6 +54,15 @@ enum e_model {
     MODEL_65B,
 };

+// model file types
+enum e_ftype {
+    FTYPE_F32  = 0,
+    FTYPE_F16  = 1,
+    FTYPE_Q4_0 = 2,
+    FTYPE_Q4_1 = 3,
+};
+static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1" };
+
 static const size_t MB = 1024*1024;

 // computed for n_ctx == 2048
@@ -100,7 +109,7 @@ struct llama_hparams {
     int32_t n_head  = 32;
     int32_t n_layer = 32;
     int32_t n_rot   = 64;
-    int32_t f16     = 1;
+    int32_t f16     = FTYPE_F16;
 };

 struct llama_layer {
@@ -508,10 +517,10 @@ static bool llama_model_load(
     // wtype is for per-layer weights, while vtype is for other weights
     ggml_type wtype, vtype;
     switch (model.hparams.f16) {
-        case 0: wtype = vtype = GGML_TYPE_F32;  break;
-        case 1: wtype = vtype = GGML_TYPE_F16;  break;
-        case 2: wtype = vtype = GGML_TYPE_Q4_0; break;
-        case 3: wtype = vtype = GGML_TYPE_Q4_1; break;
+        case FTYPE_F32:  wtype = vtype = GGML_TYPE_F32;  break;
+        case FTYPE_F16:  wtype = vtype = GGML_TYPE_F16;  break;
+        case FTYPE_Q4_0: wtype = vtype = GGML_TYPE_Q4_0; break;
+        case FTYPE_Q4_1: wtype = vtype = GGML_TYPE_Q4_1; break;
         case 4: wtype = GGML_TYPE_Q4_1; vtype = GGML_TYPE_F16; break;
         default:
                 {
@@ -684,16 +693,15 @@ static bool llama_model_load(
                 return false;
             }

             if (0) {
-                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
                 fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
             }

             switch (ftype) {
-                case 0: // f32
-                case 1: // f16
+                case FTYPE_F32:
+                case FTYPE_F16:
                     break;
-                case 2: // q4_0
-                case 3: // q4_1
+                case FTYPE_Q4_0:
+                case FTYPE_Q4_1:
                     assert(ne[0] % 64 == 0);
                     break;
                 default:
@@ -1273,20 +1281,15 @@ static llama_vocab::id llama_sample_top_p_top_k(
 //
 // TODO: reuse code from the llama_model_load() somehow
-static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
-    ggml_type type = GGML_TYPE_Q4_1;
+static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum e_ftype itype) {
+    ggml_type type;

     switch (itype) {
-        case 2: type = GGML_TYPE_Q4_0; break;
-        case 3: type = GGML_TYPE_Q4_1; break;
-        default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
+        case FTYPE_Q4_0: type = GGML_TYPE_Q4_0; break;
+        case FTYPE_Q4_1: type = GGML_TYPE_Q4_1; break;
+        default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return false;
     };

-    if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) {
-        fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
-        return false;
-    }
-
     llama_vocab vocab;

     printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
@@ -1438,7 +1441,6 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
             }

             {
-                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
                 printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
             }

@@ -1459,12 +1461,12 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
             quantize &= (n_dims == 2);

             if (quantize) {
-                if (ftype != 0 && ftype != 1) {
+                if (ftype != FTYPE_F32 && ftype != FTYPE_F16) {
                     fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
                     return false;
                 }

-                if (ftype == 1) {
+                if (ftype == FTYPE_F16) {
                     data_f16.resize(nelements);
                     finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
                     data_f32.resize(nelements);
@@ -1478,7 +1480,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s

                     ftype = itype;
                 } else {
-                    const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
+                    const int bpe = (ftype == FTYPE_F32) ? sizeof(float) : sizeof(uint16_t);

                     data_u8.resize(nelements*bpe);
                     finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
@@ -1660,7 +1662,7 @@ int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
               int   itype) {
-    if (!llama_model_quantize_internal(fname_inp, fname_out, itype)) {
+    if (!llama_model_quantize_internal(fname_inp, fname_out, (enum e_ftype)itype)) {
         fprintf(stderr, "%s: failed to quantize\n", __func__);
         return 1;
     }
diff --git a/migrate-ggml-2023-03-30-pr613.py b/migrate-ggml-2023-03-30-pr613.py
index b6ef2476e052d..ddc8664b4d90f 100644
--- a/migrate-ggml-2023-03-30-pr613.py
+++ b/migrate-ggml-2023-03-30-pr613.py
@@ -54,50 +54,7 @@
 import json
 import struct
 import numpy as np
-
-QK = 32
-
-GGML_TYPE_Q4_0 = 0
-GGML_TYPE_Q4_1 = 1
-GGML_TYPE_I8 = 2
-GGML_TYPE_I16 = 3
-GGML_TYPE_I32 = 4
-GGML_TYPE_F16 = 5
-GGML_TYPE_F32 = 6
-
-WTYPE_NAMES = {
-    0: "F32",
-    1: "F16",
-    2: "Q4_0",
-    3: "Q4_1",
-}
-
-WTYPES = {
-    0: GGML_TYPE_F32,
-    1: GGML_TYPE_F16,
-    2: GGML_TYPE_Q4_0,
-    3: GGML_TYPE_Q4_1,
-}
-
-GGML_BLCK_SIZE = {
-    GGML_TYPE_Q4_0: QK,
-    GGML_TYPE_Q4_1: QK,
-    GGML_TYPE_I8: 1,
-    GGML_TYPE_I16: 1,
-    GGML_TYPE_I32: 1,
-    GGML_TYPE_F16: 1,
-    GGML_TYPE_F32: 1,
-}
-
-GGML_TYPE_SIZE = {
-    GGML_TYPE_Q4_0: 4 + QK//2,
-    GGML_TYPE_Q4_1: 4*2 + QK//2,
-    GGML_TYPE_I8: 1,
-    GGML_TYPE_I16: 2,
-    GGML_TYPE_I32: 4,
-    GGML_TYPE_F16: 2,
-    GGML_TYPE_F32: 4,
-}
+from ggml import *

 HPARAMS = [
     'magic',  # int32
@@ -150,7 +107,7 @@ def ggml_nelements(shape):

 def ggml_nbytes(shape, ftype):
     x = ggml_nelements(shape)
-    t = WTYPES[ftype]
+    t = ggml_type_from_ftype[ftype]
     x *= GGML_TYPE_SIZE[t]
     x //= GGML_BLCK_SIZE[t]
     return x
@@ -177,10 +134,10 @@ def copy_tensors(fin, fout, part_id, n_parts):
         name = fin.read(length)
         data = fin.read(ggml_nbytes(partshape, ftype))

-        blck_size = GGML_BLCK_SIZE[WTYPES[ftype]]
-        type_size = GGML_TYPE_SIZE[WTYPES[ftype]]
+        blck_size = GGML_BLCK_SIZE[ggml_type_from_ftype[ftype]]
+        type_size = GGML_TYPE_SIZE[ggml_type_from_ftype[ftype]]

-        print(f"Processing tensor {name} with shape: {partshape} and type: {WTYPE_NAMES[ftype]}")
+        print(f"Processing tensor {name} with shape: {partshape} and type: {GGML_FILE(ftype).name}")

         # determine dimension along which multipart tensor is sharded
         #
@@ -222,7 +179,7 @@ def copy_tensors(fin, fout, part_id, n_parts):

         # ensure tensor data is aligned
         tensor_data_offset = fout.tell()
-        while tensor_data_offset % QK != 0:
+        while tensor_data_offset % 32 != 0:
             fout.write(struct.pack("B", 0))
             tensor_data_offset += 1
diff --git a/tests/test-quantize.c b/tests/test-quantize.c
index 993e9dcc3cef3..d44fa39f4095f 100644
--- a/tests/test-quantize.c
+++ b/tests/test-quantize.c
@@ -5,6 +5,8 @@
 int main(void) {
     #define QK 32
+    assert(ggml_blck_size(GGML_TYPE_Q4_0) == QK);
+    assert(ggml_blck_size(GGML_TYPE_Q4_1) == QK);
     float src[QK];
     uint8_t dst[24];
     int64_t hist[16];
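
Since e_ftype in llama.cpp and GGML_FILE in ggml.py now describe the same on-disk integers, the two sides can be cross-checked; a hypothetical sanity check, not part of the test suite:

    from ggml import GGML_FILE

    assert [f.value for f in GGML_FILE] == [0, 1, 2, 3]
    # matches: static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1" };
    assert [f.name.lower() for f in GGML_FILE] == ["f32", "f16", "q4_0", "q4_1"]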