From 126b9844822d0207fbf4db4a57c10ef45489e6fd Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Tue, 11 Apr 2023 12:41:14 +0200 Subject: [PATCH 01/12] Use better conversion to ints in quantize_row_q4_0_reference and quantize_row_q4_1_reference. This reduces the difference to the vectorized versions to ~10% for quantize_row_q4_0 and <15% for quantize_row_q4_1 on the two CPU's I have tried (Ryzen 7950X and M2 Max). --- ggml.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/ggml.c b/ggml.c index 897b67d930614..a99f6a3b7343d 100644 --- a/ggml.c +++ b/ggml.c @@ -2,6 +2,7 @@ #define _GNU_SOURCE #include "ggml.h" +#include "ggml_extra.h" #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW @@ -502,6 +503,13 @@ typedef struct { } block_q4_1; static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK / 2, "wrong q4_1 block size/padding"); +inline int nearestInt(float fval) { + assert(fval <= 4194303.f); + float val = fval + 12582912.f; + int i; memcpy(&i, &val, sizeof(int)); + return (i & 0x007fffff) - 0x00400000; +} + // reference implementation for deterministic creation of model files static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { assert(k % QK == 0); @@ -526,8 +534,15 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r const float v0 = x[i*QK + l + 0]*id; const float v1 = x[i*QK + l + 1]*id; - const uint8_t vi0 = (int8_t)roundf(v0) + 8; - const uint8_t vi1 = (int8_t)roundf(v1) + 8; + // On x86_64 and x86, round is amazingly slow. 
+ // Here it is best to just use this: + const uint8_t vi0 = (uint8_t)(v0 + 8.5f); + const uint8_t vi1 = (uint8_t)(v1 + 8.5f); + //const uint8_t vi0 = (int8_t)roundf(v0) + 8; + //const uint8_t vi1 = (int8_t)roundf(v1) + 8; + // This is marginally slower (but still much faster than round()) + //const uint8_t vi0 = nearestInt(v0) + 8; + //const uint8_t vi1 = nearestInt(v1) + 8; assert(vi0 < 16); assert(vi1 < 16); @@ -818,8 +833,10 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric const float v0 = (x[i*QK + l + 0] - min)*id; const float v1 = (x[i*QK + l + 1] - min)*id; - const uint8_t vi0 = roundf(v0); - const uint8_t vi1 = roundf(v1); + // For some reason round() is amazingly slow on X86_64 and x86 + // Using this instead reduces the difference between AVX2 and scalar to less than ~15% + const uint8_t vi0 = nearestInt(v0); //roundf(v0); + const uint8_t vi1 = nearestInt(v1); //roundf(v1); assert(vi0 < 16); assert(vi1 < 16); @@ -2560,6 +2577,8 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x // static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { + QK, + QK, QK, QK, 1, @@ -2569,7 +2588,7 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { 1, }; -static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 5"); +static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 7"); static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { sizeof(block_q4_0), @@ -2582,7 +2601,7 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { }; // don't forget to update the array above when adding new types -static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 5"); +static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 7"); static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "NONE", From 0c9a967a20b47c08b8cf17f1be35386f211aae1b Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Tue, 11 Apr 2023 12:44:47 +0200 Subject: [PATCH 02/12] Adding new functions for Q4_0 and Q4_1 quantization --- CMakeLists.txt | 4 +- 
ggml_extra.cpp | 204 +++++++++++++++++++++++++++++++++++++++++++++++++ ggml_extra.h | 20 +++++ 3 files changed, 227 insertions(+), 1 deletion(-) create mode 100644 ggml_extra.cpp create mode 100644 ggml_extra.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 6bec1f97befd9..41958c93a1bc8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -235,7 +235,9 @@ endif() add_library(ggml OBJECT ggml.c - ggml.h) + ggml.h + ggml_extra.h + ggml_extra.cpp) target_include_directories(ggml PUBLIC .) target_compile_features(ggml PUBLIC c_std_11) # don't bump diff --git a/ggml_extra.cpp b/ggml_extra.cpp new file mode 100644 index 0000000000000..cabbefae78874 --- /dev/null +++ b/ggml_extra.cpp @@ -0,0 +1,204 @@ +#include "ggml_extra.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace { + +inline int toNearestInt(float fval) { + assert(fval <= 4194303.f); + constexpr float kSnapper=3<<22; + auto val = fval + kSnapper; + int i; std::memcpy(&i, &val, sizeof(int)); + return (i & 0x007fffff) - 0x00400000; +} + +float kQuantize0(int n, const float* X, int8_t* L, std::vector>& work, int nmin, int nmax) { + work.clear(); + work.reserve(n*(nmax+2)); + float max = 0; int imax = -1; + for (int i=0; i max) { max = x; imax = i; } + } + if (imax < 0) { // all X are zero + for (int i=0; i 0) { + kmin = nmax-2; kmax = nmax + 1; + } else { + kmin = nmax/2; kmax = nmax+1; + } + } + for (int k=kmin; k<=kmax; ++k) work.push_back({(k + 0.501f)*maxi, imax}); + float minScale = work.front().first; + float maxScale = work.back().first; + for (int i=0; i maxScale) break; + if (s > minScale) work.push_back({s,i}); + } + } + std::sort(work.begin(), work.end()); + float sumlx = 0; int suml2 = 0; + float s = work.front().first; + for (int i=0; i L[i]) { + sumlx += X[i]; + suml2 += 1 + 2*L[i]; + } + else { + sumlx -= X[i]; + suml2 += 1 - 2*L[i]; + } + L[i] = l; + float sumlx2 = sumlx*sumlx; + if ((s != lasts || k == int(work.size())-1) && suml2 > 0 && sumlx2*bestSuml2 > 
bestSumlx2*suml2) { + bestSumlx = sumlx; bestSumlx2 = sumlx2; bestSuml2 = suml2; bests = s; + } + lasts = s; + } + for (int i=0; i kQuantize1(int n, const float* X, int8_t* L, std::vector& tmpX, + std::vector>& work, int nmax) { + float min = X[0], max = X[1]; + for (int i=1; i>& work, std::vector& tmpX) { + if (type == 0) { + float scale = kQuantize0(QK, X, L, work, -7, 7); + std::memcpy(y, &scale, sizeof(scale)); y += sizeof(scale); + uint8_t* q = (uint8_t*)y; + for (int k=0; k L(QK); + std::vector> work; + std::vector tmpX; + int nb = k / QK; + for (int i=0; i counter(0); + auto compute = [&counter, x, y, k, bucketSize, &processOne] () { + std::vector L(QK); + std::vector> work; + std::vector tmpX; + while (true) { + int first = counter.fetch_add(kChunkSize); + if (first >= k) break; + int last = first + kChunkSize; + if (last > k) last = k; + auto xi = x + first; + auto yi = y + (first/QK)*bucketSize; + int n = (last - first)/QK; + for (int i=0; i workers(nthread-1); + for (auto& w : workers) w = std::thread(compute); + compute(); + for (auto& w : workers) w.join(); +} + +} + +extern "C" { + +void kQuantizeQ4_0(const float* x, void* buffer, int k) { + kQuantizeQ4(x, buffer, k, 0); +} + +void kQuantizeQ4_1(const float* x, void* buffer, int k) { + kQuantizeQ4(x, buffer, k, 1); +} + +} diff --git a/ggml_extra.h b/ggml_extra.h new file mode 100644 index 0000000000000..99041bed08c89 --- /dev/null +++ b/ggml_extra.h @@ -0,0 +1,20 @@ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __cplusplus +// restrict not standard in C++ +#define GGML_RESTRICT +#else +#define GGML_RESTRICT restrict +#endif + +void kQuantizeQ4_0(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k); + +void kQuantizeQ4_1(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k); + +#ifdef __cplusplus +} +#endif From 8b3d1f977bad98a715c519c798441903b54efcfe Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Tue, 11 Apr 2023 13:01:22 +0200 Subject: [PATCH 03/12] Remove 
forgotten remnant from a discarded change to ggml.c --- ggml.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/ggml.c b/ggml.c index a99f6a3b7343d..61c80a6f34ef5 100644 --- a/ggml.c +++ b/ggml.c @@ -2577,8 +2577,6 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x // static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { - QK, - QK, QK, QK, 1, From 92408cd9830e26a842810f4b1e5405af410550c9 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Tue, 11 Apr 2023 13:03:51 +0200 Subject: [PATCH 04/12] Add ability to use new quantization in quantize-stats --- examples/quantize-stats/quantize-stats.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 203bfe8cc1057..cfee120c35ddb 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -1,6 +1,7 @@ #include "ggml.h" #include "llama.h" #include "llama_internal.h" +#include "ggml_extra.h" #include #include @@ -29,7 +30,7 @@ struct quantize_stats_params { std::vector include_types; }; -const int64_t SCRATCH_ELEMENTS = 32*32; +const int64_t SCRATCH_ELEMENTS = 32*32*256; // So we use multi-threading in a meaningful way in the new quantization const size_t HISTOGRAM_BUCKETS = 150; const double HISTOGRAM_RANGE = 0.03; @@ -184,6 +185,7 @@ int main(int argc, char ** argv) { // read command line bool invalid_param = false; + bool checkNewQuantization = false; std::string arg; for (int i = 1; i < argc; i++) { arg = argv[i]; @@ -232,6 +234,8 @@ int main(int argc, char ** argv) { fprintf(stderr, "error: %s not in list of types\n", argv[i]); invalid_param = true; } + } else if (arg == "-nq" || arg == "--new-quantization") { + checkNewQuantization = true; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); quantize_stats_print_usage(argc, argv); @@ -307,6 +311,9 @@ int main(int argc, char ** argv) { continue; } 
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i); + if (i < 2 && checkNewQuantization) { + qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ4_1; + } if (qfns.quantize_row_q && qfns.dequantize_row_q) { if (params.verbose) { printf("testing %s ...\n", type_strs[i]); From 709d23543af47d80856a6ad224fda77e7c65fbb2 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Tue, 11 Apr 2023 15:32:41 +0200 Subject: [PATCH 05/12] Add new quantization to quantize --- Makefile | 27 ++++++++-------- examples/quantize/quantize.cpp | 2 ++ ggml_extra.cpp | 56 +++++++++++++++++++++++++--------- ggml_extra.h | 7 +++++ llama.cpp | 12 ++++++-- 5 files changed, 76 insertions(+), 28 deletions(-) diff --git a/Makefile b/Makefile index 3e58a28a751ab..17624656bd158 100644 --- a/Makefile +++ b/Makefile @@ -145,32 +145,35 @@ ggml.o: ggml.c ggml.h llama.o: llama.cpp llama.h llama_util.h llama_internal.h $(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o +ggml_extra.o: ggml_extra.cpp ggml_extra.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + common.o: examples/common.cpp examples/common.h $(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o clean: rm -vf *.o main quantize quantize-stats perplexity embedding -main: examples/main/main.cpp ggml.o llama.o common.o - $(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS) +main: examples/main/main.cpp ggml.o llama.o common.o ggml_extra.o + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) @echo @echo '==== Run ./main -h for help. 
====' @echo -quantize: examples/quantize/quantize.cpp ggml.o llama.o - $(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS) +quantize: examples/quantize/quantize.cpp ggml.o llama.o ggml_extra.o + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o - $(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS) +quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o ggml_extra.o + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o - $(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS) +perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o ggml_extra.o + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o - $(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS) +embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o ggml_extra.o + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -libllama.so: llama.o ggml.o - $(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS) +libllama.so: llama.o ggml.o ggml_extra.o + $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) # # Tests # diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 680757c6bf356..313b7534f36c5 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -14,6 +14,8 @@ int main(int argc, char ** argv) { fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); fprintf(stderr, " type = 2 - q4_0\n"); fprintf(stderr, " type = 3 - q4_1\n"); + fprintf(stderr, " type = 4 - new q4_0\n"); + fprintf(stderr, " type = 5 - new q4_1\n"); return 1; } diff --git a/ggml_extra.cpp b/ggml_extra.cpp index cabbefae78874..e2ae005df08a3 100644 --- 
a/ggml_extra.cpp +++ b/ggml_extra.cpp @@ -10,6 +10,11 @@ namespace { +constexpr int kChunkSize = 32*32*8; +constexpr int QK = 32; +constexpr int kBucketSize0 = QK/2 + sizeof(float); +constexpr int kBucketSize1 = QK/2 + 2*sizeof(float); + inline int toNearestInt(float fval) { assert(fval <= 4194303.f); constexpr float kSnapper=3<<22; @@ -126,24 +131,19 @@ std::pair kQuantize1(int n, const float* X, int8_t* L, std::vector return {a, b}; } -void kQuantizeQ4(const float* GGML_RESTRICT x, void* GGML_RESTRICT buffer, int k, int type) { - constexpr int kChunkSize = 32*32*8; - constexpr int QK = 32; - constexpr int kBucketSize0 = QK/2 + sizeof(float); - constexpr int kBucketSize1 = QK/2 + 2*sizeof(float); +void kQuantizeQ4(const float* X, void* buffer, int k, int type) { assert(k % QK == 0); auto processOne = [type] (const float* X, int8_t* L, char* y, std::vector>& work, std::vector& tmpX) { + auto q = (uint8_t*)y; if (type == 0) { float scale = kQuantize0(QK, X, L, work, -7, 7); - std::memcpy(y, &scale, sizeof(scale)); y += sizeof(scale); - uint8_t* q = (uint8_t*)y; + std::memcpy(q, &scale, sizeof(scale)); q += sizeof(scale); for (int k=0; k> work; std::vector tmpX; int nb = k / QK; + auto x = X; for (int i=0; i counter(0); - auto compute = [&counter, x, y, k, bucketSize, &processOne] () { + auto compute = [&counter, X, y, k, bucketSize, &processOne] () { std::vector L(QK); std::vector> work; std::vector tmpX; while (true) { - int first = counter.fetch_add(kChunkSize); + int first = counter.fetch_add(kChunkSize, std::memory_order_relaxed); if (first >= k) break; int last = first + kChunkSize; if (last > k) last = k; - auto xi = x + first; + auto xi = X + first; auto yi = y + (first/QK)*bucketSize; int n = (last - first)/QK; for (int i=0; i> 4]; + } + y += 16; + } +} + } extern "C" { @@ -201,4 +217,16 @@ void kQuantizeQ4_1(const float* x, void* buffer, int k) { kQuantizeQ4(x, buffer, k, 1); } +size_t kQuantizeQ4_0H(const float* x, void* buffer, int k, int64_t* hist) { + 
kQuantizeQ4(x, buffer, k, 0); + collectHisto(k, buffer, hist, 0); + return (k / QK) * kBucketSize0; +} + +size_t kQuantizeQ4_1H(const float* x, void* buffer, int k, int64_t* hist) { + kQuantizeQ4(x, buffer, k, 1); + collectHisto(k, buffer, hist, 1); + return (k / QK) * kBucketSize1; +} + } diff --git a/ggml_extra.h b/ggml_extra.h index 99041bed08c89..788fcd0ea3015 100644 --- a/ggml_extra.h +++ b/ggml_extra.h @@ -1,7 +1,12 @@ #pragma once #ifdef __cplusplus +#include +#include extern "C" { +#else +#include +#include #endif #ifdef __cplusplus @@ -12,8 +17,10 @@ extern "C" { #endif void kQuantizeQ4_0(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k); +size_t kQuantizeQ4_0H(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist); void kQuantizeQ4_1(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k); +size_t kQuantizeQ4_1H(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist); #ifdef __cplusplus } diff --git a/llama.cpp b/llama.cpp index 54ba01eefbade..04ba10672cbcc 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8,6 +8,7 @@ #include "llama_internal.h" #include "ggml.h" +#include "ggml_extra.h" #include #include @@ -1546,9 +1547,12 @@ static llama_vocab::id llama_sample_top_p_top_k( static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) { ggml_type quantized_type; + bool useNewQuantization = false; switch (itype) { case 2: quantized_type = GGML_TYPE_Q4_0; break; case 3: quantized_type = GGML_TYPE_Q4_1; break; + case 4: quantized_type = GGML_TYPE_Q4_0; useNewQuantization = true; break; + case 5: quantized_type = GGML_TYPE_Q4_1; useNewQuantization = true; break; default: throw format("invalid quantization type %d\n", itype); }; @@ -1616,11 +1620,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s switch (new_type) { case GGML_TYPE_Q4_0: { - new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), 
hist_cur.data()); + new_size = useNewQuantization ? + kQuantizeQ4_0H(f32_data, new_data, nelements, hist_cur.data()) : + ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); } break; case GGML_TYPE_Q4_1: { - new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); + new_size = useNewQuantization ? + kQuantizeQ4_1H(f32_data, new_data, nelements, hist_cur.data()) : + ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); } break; default: LLAMA_ASSERT(false); From b6df974577233a3e41328c3f78ade0b5efed6f62 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Tue, 11 Apr 2023 20:38:14 +0200 Subject: [PATCH 06/12] Reverting round() change so we can pass tests But we should eventually switch back to nearestInt() and adapt the test. --- ggml.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/ggml.c b/ggml.c index 61c80a6f34ef5..04a3a40f767d5 100644 --- a/ggml.c +++ b/ggml.c @@ -536,10 +536,10 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r // On x86_64 and x86, round is amazingly slow. 
// Here it is best to just use this: - const uint8_t vi0 = (uint8_t)(v0 + 8.5f); - const uint8_t vi1 = (uint8_t)(v1 + 8.5f); - //const uint8_t vi0 = (int8_t)roundf(v0) + 8; - //const uint8_t vi1 = (int8_t)roundf(v1) + 8; + //const uint8_t vi0 = (uint8_t)(v0 + 8.5f); + //const uint8_t vi1 = (uint8_t)(v1 + 8.5f); + const uint8_t vi0 = (int8_t)roundf(v0) + 8; + const uint8_t vi1 = (int8_t)roundf(v1) + 8; // This is marginally slower (but still much faster than round()) //const uint8_t vi0 = nearestInt(v0) + 8; //const uint8_t vi1 = nearestInt(v1) + 8; @@ -835,8 +835,10 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric // For some reason round() is amazingly slow on X86_64 and x86 // Using this instead reduces the difference between AVX2 and scalar to less than ~15% - const uint8_t vi0 = nearestInt(v0); //roundf(v0); - const uint8_t vi1 = nearestInt(v1); //roundf(v1); + //const uint8_t vi0 = nearestInt(v0); + //const uint8_t vi1 = nearestInt(v1); + const uint8_t vi0 = roundf(v0); + const uint8_t vi1 = roundf(v1); assert(vi0 < 16); assert(vi1 < 16); From 931ae360500ed8dd9d7421fe2257654db7f96c0d Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Tue, 11 Apr 2023 22:08:47 +0200 Subject: [PATCH 07/12] Improve Q4_0 MSE Somehow I had it hard-wired in my brain that quants need to be in -7...7 to be comparable to the original Q4_0. But this is clearly not the case, and if we relax this requirement this simple change brings the rmse down to 0.001966 at the expense of a somewhat longer computation (~67 seconds vs 49 seconds for the 7B model on M2 Max). Perplexity test is still running but it looks like the improvement compared to the previous version will be quite modest ~0.03) despite the significant improvement in MSE. The change does not affect Q4_1 as there we already use the full range of 16 possible int values. 
--- ggml_extra.cpp | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/ggml_extra.cpp b/ggml_extra.cpp index e2ae005df08a3..9eb9bbe8319c6 100644 --- a/ggml_extra.cpp +++ b/ggml_extra.cpp @@ -23,7 +23,7 @@ inline int toNearestInt(float fval) { return (i & 0x007fffff) - 0x00400000; } -float kQuantize0(int n, const float* X, int8_t* L, std::vector>& work, int nmin, int nmax) { +std::pair kQuantize0(int n, const float* X, int8_t* L, std::vector>& work, int nmin, int nmax) { work.clear(); work.reserve(n*(nmax+2)); float max = 0; int imax = -1; @@ -33,7 +33,7 @@ float kQuantize0(int n, const float* X, int8_t* L, std::vector 0) { - kmin = nmax-2; kmax = nmax + 1; + kmin = nmax-2; kmax = nmax+1; } else { kmin = nmax/2; kmax = nmax+1; } @@ -97,7 +97,7 @@ float kQuantize0(int n, const float* X, int8_t* L, std::vector kQuantize1(int n, const float* X, int8_t* L, std::vector& tmpX, @@ -137,7 +137,17 @@ void kQuantizeQ4(const float* X, void* buffer, int k, int type) { auto processOne = [type] (const float* X, int8_t* L, char* y, std::vector>& work, std::vector& tmpX) { auto q = (uint8_t*)y; if (type == 0) { - float scale = kQuantize0(QK, X, L, work, -7, 7); + if (int(tmpX.size()) < QK) tmpX.resize(QK); + auto r1 = kQuantize0(QK, X, L, work, -8, 7); + for (int i=0; i r1.first) { + scale = -r2.first; + std::memcpy(L, L2, QK); + } + //float scale = kQuantize0(QK, X, L, work, -7, 7); std::memcpy(q, &scale, sizeof(scale)); q += sizeof(scale); for (int k=0; k Date: Wed, 12 Apr 2023 07:38:42 +0200 Subject: [PATCH 08/12] Further improve Q4_0 MSE The RMSE of the 7B model becomes 0.00185228. It looks like the perplexity will end up being around 6.27-6.28. 
--- ggml_extra.cpp | 113 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 102 insertions(+), 11 deletions(-) diff --git a/ggml_extra.cpp b/ggml_extra.cpp index 9eb9bbe8319c6..3a996d56d2c86 100644 --- a/ggml_extra.cpp +++ b/ggml_extra.cpp @@ -1,5 +1,6 @@ #include "ggml_extra.h" +#include #include #include #include @@ -23,6 +24,94 @@ inline int toNearestInt(float fval) { return (i & 0x007fffff) - 0x00400000; } +// Adapted from PR #835, function quantize_row_q4_0_rmse() +// +// I absolutely cannot reproduce the rmse = 0.00185915 reported in #835. +// Instead, I get rmse = 0.00197 with the original and rmse = 0.00192 +// with the modification that determines the scale actually minimizing +// the rmse. +// +// Do I have a bug? iI don't see it. +// The only difference is that I'm using toNearestInt() +// instead of round(), but what are the odds for getting scaled weights at +// exactly 2.5, 4.5, and 6.5, where toNearestInt() and round() differ. +// (with toNearestInt() behaving as expected and rounding towards the even integer, +// while round() always rounding up. 
+float quanizeRmse(int n, const float* X, int8_t* L) { +#define Q4_0_SCALE_CANDIDATE_COUNT 8 + static const float candidates[Q4_0_SCALE_CANDIDATE_COUNT] = { -8.7f, -8.5f, -8.3f, -8.1f, -7.9f, -7.7f, -7.2f, +7.0f }; + float max = 0, amax = 0; + for (int i=0; i amax) { amax = ax; max = X[i]; } + } + if (!amax) { // all zero + for (int i=0; i::max(), bestScale = 0; + for (int si=0; si best*suml2) { + best = sumlx*sumlx/suml2; bestScale = iscale; + } + } + float sumlx = 0; int suml2 = 0; + for (int i=0; i kQuantize0(int n, const float* X, int8_t* L, std::vector>& work, int nmin, int nmax) { work.clear(); work.reserve(n*(nmax+2)); @@ -137,17 +226,19 @@ void kQuantizeQ4(const float* X, void* buffer, int k, int type) { auto processOne = [type] (const float* X, int8_t* L, char* y, std::vector>& work, std::vector& tmpX) { auto q = (uint8_t*)y; if (type == 0) { - if (int(tmpX.size()) < QK) tmpX.resize(QK); - auto r1 = kQuantize0(QK, X, L, work, -8, 7); - for (int i=0; i r1.first) { - scale = -r2.first; - std::memcpy(L, L2, QK); - } - //float scale = kQuantize0(QK, X, L, work, -7, 7); + auto scale = quanizeRmseK(QK, X, L); + // The following is not quite as good as quanizeRmseK() and it is slower too. 
+ //if (int(tmpX.size()) < QK) tmpX.resize(QK); + //auto r1 = kQuantize0(QK, X, L, work, -8, 7); + //for (int i=0; i r1.first) { + // scale = -r2.first; + // std::memcpy(L, L2, QK); + //} + ////float scale = kQuantize0(QK, X, L, work, -7, 7); std::memcpy(q, &scale, sizeof(scale)); q += sizeof(scale); for (int k=0; k Date: Wed, 12 Apr 2023 16:25:19 +0200 Subject: [PATCH 09/12] Various experiments, including 5-bit qunatization --- examples/quantize-stats/quantize-stats.cpp | 8 +- ggml_extra.cpp | 211 +++++++++++++++++++-- ggml_extra.h | 6 + 3 files changed, 211 insertions(+), 14 deletions(-) diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index cfee120c35ddb..8ab1d02b08ff2 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -306,13 +306,17 @@ int main(int argc, char ** argv) { std::vector output_scratch(SCRATCH_ELEMENTS); // loop throught quantization types - for (int i = 0; i < GGML_TYPE_COUNT; i++) { + //for (int i = 0; i < GGML_TYPE_COUNT; i++) { + for (int i = 1; i < 2; i++) { if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) { continue; } quantize_fns_t qfns = ggml_internal_get_quantize_fn(i); if (i < 2 && checkNewQuantization) { - qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ4_1; + //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ4_1; + //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1; + qfns.quantize_row_q = i == 0 ? 
kQuantizeQ4_0 : kQuantizeQ5_1_Fast; + if (i == 1) qfns.dequantize_row_q = kDequantizeQ5_1; } if (qfns.quantize_row_q && qfns.dequantize_row_q) { if (params.verbose) { diff --git a/ggml_extra.cpp b/ggml_extra.cpp index 3a996d56d2c86..ed6330a3e85ce 100644 --- a/ggml_extra.cpp +++ b/ggml_extra.cpp @@ -1,4 +1,5 @@ #include "ggml_extra.h" +#include "ggml.h" #include #include @@ -27,8 +28,7 @@ inline int toNearestInt(float fval) { // Adapted from PR #835, function quantize_row_q4_0_rmse() // // I absolutely cannot reproduce the rmse = 0.00185915 reported in #835. -// Instead, I get rmse = 0.00197 with the original and rmse = 0.00192 -// with the modification that determines the scale actually minimizing +// Instead, I get rmse = 0.00197 with the original and rmse = 0.00192 // with the modification that determines the scale actually minimizing // the rmse. // // Do I have a bug? iI don't see it. @@ -79,12 +79,58 @@ float quanizeRmse(int n, const float* X, int8_t* L) { //return 1/bestScale; } +float quanizeRmseK(int n, const float* X, int8_t* L, + int nCandidates, const float* candidates, int nmin, int nmax) { + float max = 0; + for (int i=0; i best*suml2) { + best = sumlx*sumlx/suml2; bestScale = iscale; + } + } + float sumlx = 0; int suml2 = 0; + for (int i=0; i best*suml2) { - best = sumlx*sumlx/suml2; bestScale = iscale; + if (sumxlp*sumxlp*suml2m >= sumxlm*sumxlm*suml2p) { + if (sumxlp*sumxlp > best*suml2p) { + best = sumxlp*sumxlp/suml2p; bestScale = iscale; + } + } else { + if (sumxlm*sumxlm > best*suml2m) { + best = sumxlm*sumxlm/suml2m; bestScale = -iscale; + } } } float sumlx = 0; int suml2 = 0; @@ -112,6 +170,40 @@ float quanizeRmseK(int n, const float* X, int8_t* L) { return sumlx/suml2; } +float quanizeRmseOpt(int n, const float* X, int8_t* L, std::vector>& work) { + work.clear(); + work.reserve(n*17); + for (int l=-8; l<=8; ++l) { + float scale = l - 0.4999f; + for (int i=0; i 0 && sumlx*sumlx > best*suml2) { + best = sumlx*sumlx/suml2; bestScale = s; + } + } 
+ } + sumlx = 0; suml2 = 0; + for (int i=0; i kQuantize0(int n, const float* X, int8_t* L, std::vector>& work, int nmin, int nmax) { work.clear(); work.reserve(n*(nmax+2)); @@ -200,9 +292,10 @@ std::pair kQuantize1(int n, const float* X, int8_t* L, std::vector return {min, 1.f}; } if (int(tmpX.size()) < n) tmpX.resize(n); - double a = min, b; - for (int itry=0; itry<3; ++itry) { + double a = min, b = 0; + for (int itry=0; itry<5; ++itry) { for (int i=0; i kQuantize1(int n, const float* X, int8_t* L, std::vector sumx += X[i]; } int64_t D = suml2*n - suml*suml; + auto aold = a, bold = b; a = (sumx*suml2 - sumlx*suml)/D; b = (sumlx*n - sumx*suml)/D; + if (itry > 0 && std::abs(a - aold) < 1e-6*std::abs(aold) && std::abs(b - bold) < 1e-6*std::abs(bold)) break; + } + return {a, b}; +} + +std::pair kQuantize1Fast(int n, const float* X, int8_t* L, int nmax) { + float min = X[0], max = X[1]; + for (int i=1; i>& work, std::vector& tmpX) { auto q = (uint8_t*)y; if (type == 0) { - auto scale = quanizeRmseK(QK, X, L); + auto scale = quanizeRmseK7(QK, X, L); + //auto scale = quanizeRmseFast(QK, X, L); + //auto scale = quanizeRmseOpt(QK, X, L, work); // The following is not quite as good as quanizeRmseK() and it is slower too. 
//if (int(tmpX.size()) < QK) tmpX.resize(QK); //auto r1 = kQuantize0(QK, X, L, work, -8, 7); @@ -241,11 +364,29 @@ void kQuantizeQ4(const float* X, void* buffer, int k, int type) { ////float scale = kQuantize0(QK, X, L, work, -7, 7); std::memcpy(q, &scale, sizeof(scale)); q += sizeof(scale); for (int k=0; k 15) { l1 -= 16; *u |= m; } + m <<= 1; + if (l2 > 15) { l2 -= 16; *u |= m; } + m <<= 1; + q[k] = l1 | (l2 << 4); + } } }; @@ -318,6 +459,14 @@ void kQuantizeQ4_1(const float* x, void* buffer, int k) { kQuantizeQ4(x, buffer, k, 1); } +void kQuantizeQ5_1(const float* x, void* buffer, int k) { + kQuantizeQ4(x, buffer, k, 2); +} + +void kQuantizeQ5_1_Fast(const float* x, void* buffer, int k) { + kQuantizeQ4(x, buffer, k, 3); +} + size_t kQuantizeQ4_0H(const float* x, void* buffer, int k, int64_t* hist) { kQuantizeQ4(x, buffer, k, 0); collectHisto(k, buffer, hist, 0); @@ -330,4 +479,42 @@ size_t kQuantizeQ4_1H(const float* x, void* buffer, int k, int64_t* hist) { return (k / QK) * kBucketSize1; } +size_t kQuantizeQ5_1H(const float* x, void* buffer, int k, int64_t* hist) { + kQuantizeQ4(x, buffer, k, 2); + collectHisto(k, buffer, hist, 1); + return (k / QK) * kBucketSize1; +} + +size_t kQuantizeQ5_1H_Fast(const float* x, void* buffer, int k, int64_t* hist) { + kQuantizeQ4(x, buffer, k, 3); + collectHisto(k, buffer, hist, 1); + return (k / QK) * kBucketSize1; +} + +void kDequantizeQ5_1(const void* x, float* y, int k) { + assert(k % QK == 0); + int n = k / QK; + auto data = (const uint8_t*)x; + for (int i=0; i> 4; + if (u & m) l1 += 16; + m <<= 1; + if (u & m) l2 += 16; + m <<= 1; + *y++ = a + b*l1; + *y++ = a + b*l2; + } + data += 16; + } +} + } diff --git a/ggml_extra.h b/ggml_extra.h index 788fcd0ea3015..7faa4380105f9 100644 --- a/ggml_extra.h +++ b/ggml_extra.h @@ -22,6 +22,12 @@ size_t kQuantizeQ4_0H(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k void kQuantizeQ4_1(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k); size_t kQuantizeQ4_1H(const 
float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist); +void kQuantizeQ5_1(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k); +size_t kQuantizeQ5_1H(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist); +void kQuantizeQ5_1_Fast(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k); +size_t kQuantizeQ5_1H_Fast(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist); +void kDequantizeQ5_1(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); + #ifdef __cplusplus } #endif From 679e1cb6c01b16abe4f3ee3c849813b98970df93 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Wed, 12 Apr 2023 17:10:52 +0200 Subject: [PATCH 10/12] POC: Even lower rmse 4-bit Q4_0 quantization Basically, we use two Q4_0 quantizations, each having 16 weights, to quantize a set of 32 weights. We get two separate scaling factors, which we store as fp16, ending up using the exact same 5 bits per weight as the current Q4_0. We end up with an rmse of ~0.00159, so basically the same as the improved Q4_1. But this should run faster than `Q4_1` (unless fp16 -> fp32 conversion is somehow very slow). 
--- examples/quantize-stats/quantize-stats.cpp | 8 +++-- ggml_extra.cpp | 40 +++++++++++++++++++++- ggml_extra.h | 3 ++ 3 files changed, 47 insertions(+), 4 deletions(-) diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 8ab1d02b08ff2..c924ff3d2cffb 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -307,7 +307,7 @@ int main(int argc, char ** argv) { // loop throught quantization types //for (int i = 0; i < GGML_TYPE_COUNT; i++) { - for (int i = 1; i < 2; i++) { + for (int i = 0; i < 1; i++) { if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) { continue; } @@ -315,8 +315,10 @@ int main(int argc, char ** argv) { if (i < 2 && checkNewQuantization) { //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ4_1; //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1; - qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1_Fast; - if (i == 1) qfns.dequantize_row_q = kDequantizeQ5_1; + //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1_Fast; + //if (i == 1) qfns.dequantize_row_q = kDequantizeQ5_1; + qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ5_1; + qfns.dequantize_row_q = i == 0 ? 
kDequantizeQ4_0K : kDequantizeQ5_1; } if (qfns.quantize_row_q && qfns.dequantize_row_q) { if (params.verbose) { diff --git a/ggml_extra.cpp b/ggml_extra.cpp index ed6330a3e85ce..927ab7e78b84e 100644 --- a/ggml_extra.cpp +++ b/ggml_extra.cpp @@ -369,6 +369,15 @@ void kQuantizeQ4(const float* X, void* buffer, int k, int type) { std::memcpy(q, &result.second, sizeof(result.second)); q += sizeof(result.second); std::memcpy(q, &result.first, sizeof(result.first)); q += sizeof(result.first); for (int k=0; k> 4; + l1 -= 8; l2 -= 8; + *y++ = a*l1; *y++ = a*l2; + } + data += 8; + for (int k=0; k<8; ++k) { + int8_t l1 = data[k] & 15, l2 = data[k] >> 4; + l1 -= 8; l2 -= 8; + *y++ = b*l1; *y++ = b*l2; + } + data += 8; + } +} + } diff --git a/ggml_extra.h b/ggml_extra.h index 7faa4380105f9..6ded616574541 100644 --- a/ggml_extra.h +++ b/ggml_extra.h @@ -28,6 +28,9 @@ void kQuantizeQ5_1_Fast(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int size_t kQuantizeQ5_1H_Fast(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist); void kDequantizeQ5_1(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void kQuantizeQ4_0K(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k); +void kDequantizeQ4_0K(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); + #ifdef __cplusplus } #endif From 6f34961559aaa5bdb323a3cb9cd83709939a8c2a Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Thu, 13 Apr 2023 08:31:21 +0200 Subject: [PATCH 11/12] POC: Q4_1 for groups of 16 weight As last commit, but Q4_1 type, using the same memory as existing Q4_1 via fp16. We end up with rmse 0.00125125, maxerr 0.11657715, 95pct<0.0024, median<0.0010 after a quantize - dequantize roundtrip. 
This is quite a bit better than Q4_1 with groups of 32 weights, but by far not as good as 5-bit quantization that uses the same amount of memory where we had rmse 0.00076131, maxerr 0.05273438, 95pct<0.0016, median<0.0006 --- examples/quantize-stats/quantize-stats.cpp | 8 ++-- ggml_extra.cpp | 43 ++++++++++++++++++++++ ggml_extra.h | 3 ++ 3 files changed, 51 insertions(+), 3 deletions(-) diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index c924ff3d2cffb..ae807f493cafb 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -307,7 +307,7 @@ int main(int argc, char ** argv) { // loop throught quantization types //for (int i = 0; i < GGML_TYPE_COUNT; i++) { - for (int i = 0; i < 1; i++) { + for (int i = 1; i < 2; i++) { if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) { continue; } @@ -317,8 +317,10 @@ int main(int argc, char ** argv) { //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1; //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1_Fast; //if (i == 1) qfns.dequantize_row_q = kDequantizeQ5_1; - qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ5_1; - qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ5_1; + //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ5_1; + //qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ5_1; + qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ4_1K; + qfns.dequantize_row_q = i == 0 ? 
kDequantizeQ4_0K : kDequantizeQ4_1K; } if (qfns.quantize_row_q && qfns.dequantize_row_q) { if (params.verbose) { diff --git a/ggml_extra.cpp b/ggml_extra.cpp index 927ab7e78b84e..787d62edb0ca6 100644 --- a/ggml_extra.cpp +++ b/ggml_extra.cpp @@ -378,6 +378,18 @@ void kQuantizeQ4(const float* X, void* buffer, int k, int type) { std::memcpy(q, &scale1fp16, sizeof(scale1fp16)); q += sizeof(scale1fp16); std::memcpy(q, &scale2fp16, sizeof(scale2fp16)); q += sizeof(scale2fp16); for (int k=0; k> 4; + *y++ = a1 + b1*l1; *y++ = a1 + b1*l2; + } + data += 8; + for (int k=0; k<8; ++k) { + int8_t l1 = data[k] & 15, l2 = data[k] >> 4; + *y++ = a2 + b2*l1; *y++ = a2 + b2*l2; + } + data += 8; + } +} + } diff --git a/ggml_extra.h b/ggml_extra.h index 6ded616574541..bddabc5c16a00 100644 --- a/ggml_extra.h +++ b/ggml_extra.h @@ -31,6 +31,9 @@ void kDequantizeQ5_1(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int void kQuantizeQ4_0K(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k); void kDequantizeQ4_0K(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void kQuantizeQ4_1K(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k); +void kDequantizeQ4_1K(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); + #ifdef __cplusplus } #endif From 97d7ac75657b90a60c35d081f4ef1a7bc5bbe772 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Thu, 13 Apr 2023 12:00:24 +0200 Subject: [PATCH 12/12] POC: Measure rmse of 8 bit quantization q8_0 : rmse 0.00010729, maxerr 0.01030385, 95pct<0.0002, median<0.0002 --- examples/quantize-stats/quantize-stats.cpp | 10 +-- ggml_extra.cpp | 78 +++++++++++++++++++++- ggml_extra.h | 3 + 3 files changed, 84 insertions(+), 7 deletions(-) diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index ae807f493cafb..5789bd9ea392c 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -307,7 +307,7 @@ int main(int argc, char ** 
argv) { // loop throught quantization types //for (int i = 0; i < GGML_TYPE_COUNT; i++) { - for (int i = 1; i < 2; i++) { + for (int i = 0; i < 1; i++) { if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) { continue; } @@ -315,12 +315,14 @@ int main(int argc, char ** argv) { if (i < 2 && checkNewQuantization) { //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ4_1; //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1; - //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1_Fast; + ////qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1_Fast; //if (i == 1) qfns.dequantize_row_q = kDequantizeQ5_1; //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ5_1; //qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ5_1; - qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ4_1K; - qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ4_1K; + //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ4_1K; + //qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ4_1K; + qfns.quantize_row_q = i == 0 ? kQuantizeQ8Simple: kQuantizeQ4_1K; + qfns.dequantize_row_q = i == 0 ? 
kDequantizeQ8: kDequantizeQ4_1K; } if (qfns.quantize_row_q && qfns.dequantize_row_q) { if (params.verbose) { diff --git a/ggml_extra.cpp b/ggml_extra.cpp index 787d62edb0ca6..fa2591e68255e 100644 --- a/ggml_extra.cpp +++ b/ggml_extra.cpp @@ -105,7 +105,30 @@ float quanizeRmseK(int n, const float* X, int8_t* L, sumlx += X[i]*l; suml2 += l*l; L[i] = l; } - return sumlx/suml2; + float scale = sumlx/suml2; + best = scale*sumlx; + for (int itry=0; itry<3; ++itry) { + bool haveChanges = false; + for (int i=0; i 0 && L[i] < nmax) { + auto s1 = sumlx + X[i]; + auto s2 = suml2 + 2*L[i] + 1; + if (s2 > 0 && s1*s1 > best*s2) { + scale = s1/s2; best = scale*s1; ++L[i]; sumlx = s1; suml2 = s2; haveChanges = true; + } + } + else if (g < 0 && L[i] > nmin) { + auto s1 = sumlx - X[i]; + auto s2 = suml2 - 2*L[i] + 1; + if (s2 > 0 && s1*s1 > best*s2) { + scale = s1/s2; best = scale*s1; --L[i]; sumlx = s1; suml2 = s2; haveChanges = true; + } + } + } + if (!haveChanges) break; + } + return scale; } // The following improves the above. // It gives RMSE = 0.00185228 for the 7B model. @@ -125,6 +148,19 @@ float quanizeRmseK15(int n, const float* X, int8_t* L) { return quanizeRmseK(n, X, L, kCandiateCount, candidates, 0, 15); } +float quanizeRmseK31(int n, const float* X, int8_t* L) { + constexpr int kCandiateCount = 24; + static const float candidates[kCandiateCount] = { + +35.25, +34.25f, +33.25f, +32.75f, +32.25f, +31.75f, +31.25f, +30.75f, +30.25f, +29.75f, +29.25f, +28.25f, +27.25f, +26.25f, + +25.25f, +24.25f, +23.25, +22.25f, +21.25f, +20.25f, +19.25f, +18.25f, +17.25f, +16.25f + }; + //static const float candidates[kCandiateCount] = { + // +33.25f, +32.25f, +31.75f, +31.25f, +30.75f, +30.25f, +30.25f, +29.25f, +28.75f, +27.25f, +26.25f, +25.25f, +24.25f, +23.25, +22.25f, + // +21.25f + //}; + return quanizeRmseK(n, X, L, kCandiateCount, candidates, 0, 31); +} + // Fast (as much faster than doing the optimization), but not very good. 
float quanizeRmseFast(int n, const float* X, int8_t* L) { //constexpr int kCandiateCount = 3; @@ -295,8 +331,9 @@ std::pair kQuantize1(int n, const float* X, int8_t* L, std::vector double a = min, b = 0; for (int itry=0; itry<5; ++itry) { for (int i=0; i 0) { + float iscale = 127.f/max; + float scale = max/127.f; + std::memcpy(data, &scale, sizeof(scale)); data += sizeof(scale); + for (int k=0; k<16; ++k) data[k] = toNearestInt(iscale * *x++); + data += 16; + } else { + float scale = 1; + std::memcpy(data, &scale, sizeof(scale)); data += sizeof(scale); + auto aux = (uint32_t*)data; + aux[0] = aux[1] = aux[2] = aux[3] = 0; + data += 16; + } + } +} + +void kDequantizeQ8(const void* x, float* y, int k) { + assert(k % QK == 0); + auto data = (const int8_t*)x; + int n = k / (QK/2); + for (int i=0; i