From 126b9844822d0207fbf4db4a57c10ef45489e6fd Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Tue, 11 Apr 2023 12:41:14 +0200 Subject: [PATCH 01/12] Use better conversion to ints in quantize_row_q4_0_reference and quantize_row_q4_1_reference. This reduces the difference to the vectorized versions to ~10% for quantize_row_q4_0 and <15% for quantize_row_q4_1 on the two CPU's I have tried (Ryzen 7950X and M2 Max). --- ggml.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/ggml.c b/ggml.c index 897b67d930614..a99f6a3b7343d 100644 --- a/ggml.c +++ b/ggml.c @@ -2,6 +2,7 @@ #define _GNU_SOURCE #include "ggml.h" +#include "ggml_extra.h" #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW @@ -502,6 +503,13 @@ typedef struct { } block_q4_1; static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK / 2, "wrong q4_1 block size/padding"); +inline int nearestInt(float fval) { + assert(fval <= 4194303.f); + float val = fval + 12582912.f; + int i; memcpy(&i, &val, sizeof(int)); + return (i & 0x007fffff) - 0x00400000; +} + // reference implementation for deterministic creation of model files static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { assert(k % QK == 0); @@ -526,8 +534,15 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r const float v0 = x[i*QK + l + 0]*id; const float v1 = x[i*QK + l + 1]*id; - const uint8_t vi0 = (int8_t)roundf(v0) + 8; - const uint8_t vi1 = (int8_t)roundf(v1) + 8; + // On x86_64 and x86, round is amazingly slow. 
+ // Here it is best to just use this: + const uint8_t vi0 = (uint8_t)(v0 + 8.5f); + const uint8_t vi1 = (uint8_t)(v1 + 8.5f); + //const uint8_t vi0 = (int8_t)roundf(v0) + 8; + //const uint8_t vi1 = (int8_t)roundf(v1) + 8; + // This is marginally slower (but still much faster than round()) + //const uint8_t vi0 = nearestInt(v0) + 8; + //const uint8_t vi1 = nearestInt(v1) + 8; assert(vi0 < 16); assert(vi1 < 16); @@ -818,8 +833,10 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric const float v0 = (x[i*QK + l + 0] - min)*id; const float v1 = (x[i*QK + l + 1] - min)*id; - const uint8_t vi0 = roundf(v0); - const uint8_t vi1 = roundf(v1); + // For some reason round() is amazingly slow on X86_64 and x86 + // Using this instead reduces the difference between AVX2 and scalar to less than ~15% + const uint8_t vi0 = nearestInt(v0); //roundf(v0); + const uint8_t vi1 = nearestInt(v1); //roundf(v1); assert(vi0 < 16); assert(vi1 < 16); @@ -2560,6 +2577,8 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x // static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { + QK, + QK, QK, QK, 1, @@ -2569,7 +2588,7 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { 1, }; -static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 5"); +static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 7"); static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { sizeof(block_q4_0), @@ -2582,7 +2601,7 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { }; // don't forget to update the array above when adding new types -static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 5"); +static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 7"); static const char * GGML_OP_LABEL[GGML_OP_COUNT] = { "NONE", From 0c9a967a20b47c08b8cf17f1be35386f211aae1b Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Tue, 11 Apr 2023 12:44:47 +0200 Subject: [PATCH 02/12] Adding new functions for Q4_0 and Q4_1 quantization --- CMakeLists.txt | 4 +- 
ggml_extra.cpp | 204 +++++++++++++++++++++++++++++++++++++++++++++++++ ggml_extra.h | 20 +++++ 3 files changed, 227 insertions(+), 1 deletion(-) create mode 100644 ggml_extra.cpp create mode 100644 ggml_extra.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 6bec1f97befd9..41958c93a1bc8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -235,7 +235,9 @@ endif() add_library(ggml OBJECT ggml.c - ggml.h) + ggml.h + ggml_extra.h + ggml_extra.cpp) target_include_directories(ggml PUBLIC .) target_compile_features(ggml PUBLIC c_std_11) # don't bump diff --git a/ggml_extra.cpp b/ggml_extra.cpp new file mode 100644 index 0000000000000..cabbefae78874 --- /dev/null +++ b/ggml_extra.cpp @@ -0,0 +1,204 @@ +#include "ggml_extra.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace { + +inline int toNearestInt(float fval) { + assert(fval <= 4194303.f); + constexpr float kSnapper=3<<22; + auto val = fval + kSnapper; + int i; std::memcpy(&i, &val, sizeof(int)); + return (i & 0x007fffff) - 0x00400000; +} + +float kQuantize0(int n, const float* X, int8_t* L, std::vector>& work, int nmin, int nmax) { + work.clear(); + work.reserve(n*(nmax+2)); + float max = 0; int imax = -1; + for (int i=0; i max) { max = x; imax = i; } + } + if (imax < 0) { // all X are zero + for (int i=0; i 0) { + kmin = nmax-2; kmax = nmax + 1; + } else { + kmin = nmax/2; kmax = nmax+1; + } + } + for (int k=kmin; k<=kmax; ++k) work.push_back({(k + 0.501f)*maxi, imax}); + float minScale = work.front().first; + float maxScale = work.back().first; + for (int i=0; i maxScale) break; + if (s > minScale) work.push_back({s,i}); + } + } + std::sort(work.begin(), work.end()); + float sumlx = 0; int suml2 = 0; + float s = work.front().first; + for (int i=0; i L[i]) { + sumlx += X[i]; + suml2 += 1 + 2*L[i]; + } + else { + sumlx -= X[i]; + suml2 += 1 - 2*L[i]; + } + L[i] = l; + float sumlx2 = sumlx*sumlx; + if ((s != lasts || k == int(work.size())-1) && suml2 > 0 && sumlx2*bestSuml2 > 
bestSumlx2*suml2) { + bestSumlx = sumlx; bestSumlx2 = sumlx2; bestSuml2 = suml2; bests = s; + } + lasts = s; + } + for (int i=0; i kQuantize1(int n, const float* X, int8_t* L, std::vector& tmpX, + std::vector>& work, int nmax) { + float min = X[0], max = X[1]; + for (int i=1; i>& work, std::vector& tmpX) { + if (type == 0) { + float scale = kQuantize0(QK, X, L, work, -7, 7); + std::memcpy(y, &scale, sizeof(scale)); y += sizeof(scale); + uint8_t* q = (uint8_t*)y; + for (int k=0; k L(QK); + std::vector> work; + std::vector tmpX; + int nb = k / QK; + for (int i=0; i counter(0); + auto compute = [&counter, x, y, k, bucketSize, &processOne] () { + std::vector L(QK); + std::vector> work; + std::vector tmpX; + while (true) { + int first = counter.fetch_add(kChunkSize); + if (first >= k) break; + int last = first + kChunkSize; + if (last > k) last = k; + auto xi = x + first; + auto yi = y + (first/QK)*bucketSize; + int n = (last - first)/QK; + for (int i=0; i workers(nthread-1); + for (auto& w : workers) w = std::thread(compute); + compute(); + for (auto& w : workers) w.join(); +} + +} + +extern "C" { + +void kQuantizeQ4_0(const float* x, void* buffer, int k) { + kQuantizeQ4(x, buffer, k, 0); +} + +void kQuantizeQ4_1(const float* x, void* buffer, int k) { + kQuantizeQ4(x, buffer, k, 1); +} + +} diff --git a/ggml_extra.h b/ggml_extra.h new file mode 100644 index 0000000000000..99041bed08c89 --- /dev/null +++ b/ggml_extra.h @@ -0,0 +1,20 @@ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __cplusplus +// restrict not standard in C++ +#define GGML_RESTRICT +#else +#define GGML_RESTRICT restrict +#endif + +void kQuantizeQ4_0(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k); + +void kQuantizeQ4_1(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k); + +#ifdef __cplusplus +} +#endif From 8b3d1f977bad98a715c519c798441903b54efcfe Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Tue, 11 Apr 2023 13:01:22 +0200 Subject: [PATCH 03/12] Remove 
forgotten remnant from a discarded change to ggml.c --- ggml.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/ggml.c b/ggml.c index a99f6a3b7343d..61c80a6f34ef5 100644 --- a/ggml.c +++ b/ggml.c @@ -2577,8 +2577,6 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x // static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { - QK, - QK, QK, QK, 1, From 92408cd9830e26a842810f4b1e5405af410550c9 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Tue, 11 Apr 2023 13:03:51 +0200 Subject: [PATCH 04/12] Add ability to use new quantization in quantize-stats --- examples/quantize-stats/quantize-stats.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 203bfe8cc1057..cfee120c35ddb 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -1,6 +1,7 @@ #include "ggml.h" #include "llama.h" #include "llama_internal.h" +#include "ggml_extra.h" #include #include @@ -29,7 +30,7 @@ struct quantize_stats_params { std::vector include_types; }; -const int64_t SCRATCH_ELEMENTS = 32*32; +const int64_t SCRATCH_ELEMENTS = 32*32*256; // So we use multi-threading in a meaningful way in the new quantization const size_t HISTOGRAM_BUCKETS = 150; const double HISTOGRAM_RANGE = 0.03; @@ -184,6 +185,7 @@ int main(int argc, char ** argv) { // read command line bool invalid_param = false; + bool checkNewQuantization = false; std::string arg; for (int i = 1; i < argc; i++) { arg = argv[i]; @@ -232,6 +234,8 @@ int main(int argc, char ** argv) { fprintf(stderr, "error: %s not in list of types\n", argv[i]); invalid_param = true; } + } else if (arg == "-nq" || arg == "--new-quantization") { + checkNewQuantization = true; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); quantize_stats_print_usage(argc, argv); @@ -307,6 +311,9 @@ int main(int argc, char ** argv) { continue; } 
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i); + if (i < 2 && checkNewQuantization) { + qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ4_1; + } if (qfns.quantize_row_q && qfns.dequantize_row_q) { if (params.verbose) { printf("testing %s ...\n", type_strs[i]); From 709d23543af47d80856a6ad224fda77e7c65fbb2 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Tue, 11 Apr 2023 15:32:41 +0200 Subject: [PATCH 05/12] Add new quantization to quantize --- Makefile | 27 ++++++++-------- examples/quantize/quantize.cpp | 2 ++ ggml_extra.cpp | 56 +++++++++++++++++++++++++--------- ggml_extra.h | 7 +++++ llama.cpp | 12 ++++++-- 5 files changed, 76 insertions(+), 28 deletions(-) diff --git a/Makefile b/Makefile index 3e58a28a751ab..17624656bd158 100644 --- a/Makefile +++ b/Makefile @@ -145,32 +145,35 @@ ggml.o: ggml.c ggml.h llama.o: llama.cpp llama.h llama_util.h llama_internal.h $(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o +ggml_extra.o: ggml_extra.cpp ggml_extra.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + common.o: examples/common.cpp examples/common.h $(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o clean: rm -vf *.o main quantize quantize-stats perplexity embedding -main: examples/main/main.cpp ggml.o llama.o common.o - $(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS) +main: examples/main/main.cpp ggml.o llama.o common.o ggml_extra.o + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) @echo @echo '==== Run ./main -h for help. 
====' @echo -quantize: examples/quantize/quantize.cpp ggml.o llama.o - $(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS) +quantize: examples/quantize/quantize.cpp ggml.o llama.o ggml_extra.o + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o - $(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS) +quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o ggml_extra.o + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o - $(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS) +perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o ggml_extra.o + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o - $(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS) +embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o ggml_extra.o + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -libllama.so: llama.o ggml.o - $(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS) +libllama.so: llama.o ggml.o ggml_extra.o + $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) # # Tests # diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 680757c6bf356..313b7534f36c5 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -14,6 +14,8 @@ int main(int argc, char ** argv) { fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); fprintf(stderr, " type = 2 - q4_0\n"); fprintf(stderr, " type = 3 - q4_1\n"); + fprintf(stderr, " type = 4 - new q4_0\n"); + fprintf(stderr, " type = 5 - new q4_1\n"); return 1; } diff --git a/ggml_extra.cpp b/ggml_extra.cpp index cabbefae78874..e2ae005df08a3 100644 --- 
a/ggml_extra.cpp +++ b/ggml_extra.cpp @@ -10,6 +10,11 @@ namespace { +constexpr int kChunkSize = 32*32*8; +constexpr int QK = 32; +constexpr int kBucketSize0 = QK/2 + sizeof(float); +constexpr int kBucketSize1 = QK/2 + 2*sizeof(float); + inline int toNearestInt(float fval) { assert(fval <= 4194303.f); constexpr float kSnapper=3<<22; @@ -126,24 +131,19 @@ std::pair kQuantize1(int n, const float* X, int8_t* L, std::vector return {a, b}; } -void kQuantizeQ4(const float* GGML_RESTRICT x, void* GGML_RESTRICT buffer, int k, int type) { - constexpr int kChunkSize = 32*32*8; - constexpr int QK = 32; - constexpr int kBucketSize0 = QK/2 + sizeof(float); - constexpr int kBucketSize1 = QK/2 + 2*sizeof(float); +void kQuantizeQ4(const float* X, void* buffer, int k, int type) { assert(k % QK == 0); auto processOne = [type] (const float* X, int8_t* L, char* y, std::vector>& work, std::vector& tmpX) { + auto q = (uint8_t*)y; if (type == 0) { float scale = kQuantize0(QK, X, L, work, -7, 7); - std::memcpy(y, &scale, sizeof(scale)); y += sizeof(scale); - uint8_t* q = (uint8_t*)y; + std::memcpy(q, &scale, sizeof(scale)); q += sizeof(scale); for (int k=0; k> work; std::vector tmpX; int nb = k / QK; + auto x = X; for (int i=0; i counter(0); - auto compute = [&counter, x, y, k, bucketSize, &processOne] () { + auto compute = [&counter, X, y, k, bucketSize, &processOne] () { std::vector L(QK); std::vector> work; std::vector tmpX; while (true) { - int first = counter.fetch_add(kChunkSize); + int first = counter.fetch_add(kChunkSize, std::memory_order_relaxed); if (first >= k) break; int last = first + kChunkSize; if (last > k) last = k; - auto xi = x + first; + auto xi = X + first; auto yi = y + (first/QK)*bucketSize; int n = (last - first)/QK; for (int i=0; i> 4]; + } + y += 16; + } +} + } extern "C" { @@ -201,4 +217,16 @@ void kQuantizeQ4_1(const float* x, void* buffer, int k) { kQuantizeQ4(x, buffer, k, 1); } +size_t kQuantizeQ4_0H(const float* x, void* buffer, int k, int64_t* hist) { + 
kQuantizeQ4(x, buffer, k, 0); + collectHisto(k, buffer, hist, 0); + return (k / QK) * kBucketSize0; +} + +size_t kQuantizeQ4_1H(const float* x, void* buffer, int k, int64_t* hist) { + kQuantizeQ4(x, buffer, k, 1); + collectHisto(k, buffer, hist, 1); + return (k / QK) * kBucketSize1; +} + } diff --git a/ggml_extra.h b/ggml_extra.h index 99041bed08c89..788fcd0ea3015 100644 --- a/ggml_extra.h +++ b/ggml_extra.h @@ -1,7 +1,12 @@ #pragma once #ifdef __cplusplus +#include +#include extern "C" { +#else +#include +#include #endif #ifdef __cplusplus @@ -12,8 +17,10 @@ extern "C" { #endif void kQuantizeQ4_0(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k); +size_t kQuantizeQ4_0H(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist); void kQuantizeQ4_1(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k); +size_t kQuantizeQ4_1H(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist); #ifdef __cplusplus } diff --git a/llama.cpp b/llama.cpp index 54ba01eefbade..04ba10672cbcc 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8,6 +8,7 @@ #include "llama_internal.h" #include "ggml.h" +#include "ggml_extra.h" #include #include @@ -1546,9 +1547,12 @@ static llama_vocab::id llama_sample_top_p_top_k( static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) { ggml_type quantized_type; + bool useNewQuantization = false; switch (itype) { case 2: quantized_type = GGML_TYPE_Q4_0; break; case 3: quantized_type = GGML_TYPE_Q4_1; break; + case 4: quantized_type = GGML_TYPE_Q4_0; useNewQuantization = true; break; + case 5: quantized_type = GGML_TYPE_Q4_1; useNewQuantization = true; break; default: throw format("invalid quantization type %d\n", itype); }; @@ -1616,11 +1620,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s switch (new_type) { case GGML_TYPE_Q4_0: { - new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), 
hist_cur.data()); + new_size = useNewQuantization ? + kQuantizeQ4_0H(f32_data, new_data, nelements, hist_cur.data()) : + ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); } break; case GGML_TYPE_Q4_1: { - new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); + new_size = useNewQuantization ? + kQuantizeQ4_1H(f32_data, new_data, nelements, hist_cur.data()) : + ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data()); } break; default: LLAMA_ASSERT(false); From b6df974577233a3e41328c3f78ade0b5efed6f62 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Tue, 11 Apr 2023 20:38:14 +0200 Subject: [PATCH 06/12] Reverting round() change so we can pass tests But we should eventually switch back to nearestInt() and adapt the test. --- ggml.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/ggml.c b/ggml.c index 61c80a6f34ef5..04a3a40f767d5 100644 --- a/ggml.c +++ b/ggml.c @@ -536,10 +536,10 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r // On x86_64 and x86, round is amazingly slow. 
// Here it is best to just use this: - const uint8_t vi0 = (uint8_t)(v0 + 8.5f); - const uint8_t vi1 = (uint8_t)(v1 + 8.5f); - //const uint8_t vi0 = (int8_t)roundf(v0) + 8; - //const uint8_t vi1 = (int8_t)roundf(v1) + 8; + //const uint8_t vi0 = (uint8_t)(v0 + 8.5f); + //const uint8_t vi1 = (uint8_t)(v1 + 8.5f); + const uint8_t vi0 = (int8_t)roundf(v0) + 8; + const uint8_t vi1 = (int8_t)roundf(v1) + 8; // This is marginally slower (but still much faster than round()) //const uint8_t vi0 = nearestInt(v0) + 8; //const uint8_t vi1 = nearestInt(v1) + 8; @@ -835,8 +835,10 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric // For some reason round() is amazingly slow on X86_64 and x86 // Using this instead reduces the difference between AVX2 and scalar to less than ~15% - const uint8_t vi0 = nearestInt(v0); //roundf(v0); - const uint8_t vi1 = nearestInt(v1); //roundf(v1); + //const uint8_t vi0 = nearestInt(v0); + //const uint8_t vi1 = nearestInt(v1); + const uint8_t vi0 = roundf(v0); + const uint8_t vi1 = roundf(v1); assert(vi0 < 16); assert(vi1 < 16); From 931ae360500ed8dd9d7421fe2257654db7f96c0d Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Tue, 11 Apr 2023 22:08:47 +0200 Subject: [PATCH 07/12] Improve Q4_0 MSE Somehow I had it hard-wired in my brain that quants need to be in -7...7 to be comparable to the original Q4_0. But this is clearly not the case, and if we relax this requirement this simple change brings the rmse down to 0.001966 at the expense of a somewhat longer computation (~67 seconds vs 49 seconds for the 7B model on M2 Max). Perplexity test is still running but it looks like the improvement compared to the previous version will be quite modest ~0.03) despite the significant improvement in MSE. The change does not affect Q4_1 as there we already use the full range of 16 possible int values. 
--- ggml_extra.cpp | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/ggml_extra.cpp b/ggml_extra.cpp index e2ae005df08a3..9eb9bbe8319c6 100644 --- a/ggml_extra.cpp +++ b/ggml_extra.cpp @@ -23,7 +23,7 @@ inline int toNearestInt(float fval) { return (i & 0x007fffff) - 0x00400000; } -float kQuantize0(int n, const float* X, int8_t* L, std::vector>& work, int nmin, int nmax) { +std::pair kQuantize0(int n, const float* X, int8_t* L, std::vector>& work, int nmin, int nmax) { work.clear(); work.reserve(n*(nmax+2)); float max = 0; int imax = -1; @@ -33,7 +33,7 @@ float kQuantize0(int n, const float* X, int8_t* L, std::vector 0) { - kmin = nmax-2; kmax = nmax + 1; + kmin = nmax-2; kmax = nmax+1; } else { kmin = nmax/2; kmax = nmax+1; } @@ -97,7 +97,7 @@ float kQuantize0(int n, const float* X, int8_t* L, std::vector kQuantize1(int n, const float* X, int8_t* L, std::vector& tmpX, @@ -137,7 +137,17 @@ void kQuantizeQ4(const float* X, void* buffer, int k, int type) { auto processOne = [type] (const float* X, int8_t* L, char* y, std::vector>& work, std::vector& tmpX) { auto q = (uint8_t*)y; if (type == 0) { - float scale = kQuantize0(QK, X, L, work, -7, 7); + if (int(tmpX.size()) < QK) tmpX.resize(QK); + auto r1 = kQuantize0(QK, X, L, work, -8, 7); + for (int i=0; i r1.first) { + scale = -r2.first; + std::memcpy(L, L2, QK); + } + //float scale = kQuantize0(QK, X, L, work, -7, 7); std::memcpy(q, &scale, sizeof(scale)); q += sizeof(scale); for (int k=0; k Date: Wed, 12 Apr 2023 07:38:42 +0200 Subject: [PATCH 08/12] Further improve Q4_0 MSE The RMSE of the 7B model becomes 0.00185228. It looks like the perplexity will end up being around 6.27-6.28. 
--- ggml_extra.cpp | 113 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 102 insertions(+), 11 deletions(-) diff --git a/ggml_extra.cpp b/ggml_extra.cpp index 9eb9bbe8319c6..3a996d56d2c86 100644 --- a/ggml_extra.cpp +++ b/ggml_extra.cpp @@ -1,5 +1,6 @@ #include "ggml_extra.h" +#include #include #include #include @@ -23,6 +24,94 @@ inline int toNearestInt(float fval) { return (i & 0x007fffff) - 0x00400000; } +// Adapted from PR #835, function quantize_row_q4_0_rmse() +// +// I absolutely cannot reproduce the rmse = 0.00185915 reported in #835. +// Instead, I get rmse = 0.00197 with the original and rmse = 0.00192 +// with the modification that determines the scale actually minimizing +// the rmse. +// +// Do I have a bug? iI don't see it. +// The only difference is that I'm using toNearestInt() +// instead of round(), but what are the odds for getting scaled weights at +// exactly 2.5, 4.5, and 6.5, where toNearestInt() and round() differ. +// (with toNearestInt() behaving as expected and rounding towards the even integer, +// while round() always rounding up. 
+float quanizeRmse(int n, const float* X, int8_t* L) { +#define Q4_0_SCALE_CANDIDATE_COUNT 8 + static const float candidates[Q4_0_SCALE_CANDIDATE_COUNT] = { -8.7f, -8.5f, -8.3f, -8.1f, -7.9f, -7.7f, -7.2f, +7.0f }; + float max = 0, amax = 0; + for (int i=0; i amax) { amax = ax; max = X[i]; } + } + if (!amax) { // all zero + for (int i=0; i::max(), bestScale = 0; + for (int si=0; si best*suml2) { + best = sumlx*sumlx/suml2; bestScale = iscale; + } + } + float sumlx = 0; int suml2 = 0; + for (int i=0; i kQuantize0(int n, const float* X, int8_t* L, std::vector>& work, int nmin, int nmax) { work.clear(); work.reserve(n*(nmax+2)); @@ -137,17 +226,19 @@ void kQuantizeQ4(const float* X, void* buffer, int k, int type) { auto processOne = [type] (const float* X, int8_t* L, char* y, std::vector>& work, std::vector& tmpX) { auto q = (uint8_t*)y; if (type == 0) { - if (int(tmpX.size()) < QK) tmpX.resize(QK); - auto r1 = kQuantize0(QK, X, L, work, -8, 7); - for (int i=0; i r1.first) { - scale = -r2.first; - std::memcpy(L, L2, QK); - } - //float scale = kQuantize0(QK, X, L, work, -7, 7); + auto scale = quanizeRmseK(QK, X, L); + // The following is not quite as good as quanizeRmseK() and it is slower too. 
+ //if (int(tmpX.size()) < QK) tmpX.resize(QK); + //auto r1 = kQuantize0(QK, X, L, work, -8, 7); + //for (int i=0; i r1.first) { + // scale = -r2.first; + // std::memcpy(L, L2, QK); + //} + ////float scale = kQuantize0(QK, X, L, work, -7, 7); std::memcpy(q, &scale, sizeof(scale)); q += sizeof(scale); for (int k=0; k Date: Wed, 12 Apr 2023 16:25:19 +0200 Subject: [PATCH 09/12] Various experiments, including 5-bit qunatization --- examples/quantize-stats/quantize-stats.cpp | 8 +- ggml_extra.cpp | 211 +++++++++++++++++++-- ggml_extra.h | 6 + 3 files changed, 211 insertions(+), 14 deletions(-) diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index cfee120c35ddb..8ab1d02b08ff2 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -306,13 +306,17 @@ int main(int argc, char ** argv) { std::vector output_scratch(SCRATCH_ELEMENTS); // loop throught quantization types - for (int i = 0; i < GGML_TYPE_COUNT; i++) { + //for (int i = 0; i < GGML_TYPE_COUNT; i++) { + for (int i = 1; i < 2; i++) { if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) { continue; } quantize_fns_t qfns = ggml_internal_get_quantize_fn(i); if (i < 2 && checkNewQuantization) { - qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ4_1; + //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ4_1; + //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1; + qfns.quantize_row_q = i == 0 ? 
kQuantizeQ4_0 : kQuantizeQ5_1_Fast; + if (i == 1) qfns.dequantize_row_q = kDequantizeQ5_1; } if (qfns.quantize_row_q && qfns.dequantize_row_q) { if (params.verbose) { diff --git a/ggml_extra.cpp b/ggml_extra.cpp index 3a996d56d2c86..ed6330a3e85ce 100644 --- a/ggml_extra.cpp +++ b/ggml_extra.cpp @@ -1,4 +1,5 @@ #include "ggml_extra.h" +#include "ggml.h" #include #include @@ -27,8 +28,7 @@ inline int toNearestInt(float fval) { // Adapted from PR #835, function quantize_row_q4_0_rmse() // // I absolutely cannot reproduce the rmse = 0.00185915 reported in #835. -// Instead, I get rmse = 0.00197 with the original and rmse = 0.00192 -// with the modification that determines the scale actually minimizing +// Instead, I get rmse = 0.00197 with the original and rmse = 0.00192 // with the modification that determines the scale actually minimizing // the rmse. // // Do I have a bug? iI don't see it. @@ -79,12 +79,58 @@ float quanizeRmse(int n, const float* X, int8_t* L) { //return 1/bestScale; } +float quanizeRmseK(int n, const float* X, int8_t* L, + int nCandidates, const float* candidates, int nmin, int nmax) { + float max = 0; + for (int i=0; i best*suml2) { + best = sumlx*sumlx/suml2; bestScale = iscale; + } + } + float sumlx = 0; int suml2 = 0; + for (int i=0; i best*suml2) { - best = sumlx*sumlx/suml2; bestScale = iscale; + if (sumxlp*sumxlp*suml2m >= sumxlm*sumxlm*suml2p) { + if (sumxlp*sumxlp > best*suml2p) { + best = sumxlp*sumxlp/suml2p; bestScale = iscale; + } + } else { + if (sumxlm*sumxlm > best*suml2m) { + best = sumxlm*sumxlm/suml2m; bestScale = -iscale; + } } } float sumlx = 0; int suml2 = 0; @@ -112,6 +170,40 @@ float quanizeRmseK(int n, const float* X, int8_t* L) { return sumlx/suml2; } +float quanizeRmseOpt(int n, const float* X, int8_t* L, std::vector>& work) { + work.clear(); + work.reserve(n*17); + for (int l=-8; l<=8; ++l) { + float scale = l - 0.4999f; + for (int i=0; i 0 && sumlx*sumlx > best*suml2) { + best = sumlx*sumlx/suml2; bestScale = s; + } + } 
+ } + sumlx = 0; suml2 = 0; + for (int i=0; i kQuantize0(int n, const float* X, int8_t* L, std::vector>& work, int nmin, int nmax) { work.clear(); work.reserve(n*(nmax+2)); @@ -200,9 +292,10 @@ std::pair kQuantize1(int n, const float* X, int8_t* L, std::vector return {min, 1.f}; } if (int(tmpX.size()) < n) tmpX.resize(n); - double a = min, b; - for (int itry=0; itry<3; ++itry) { + double a = min, b = 0; + for (int itry=0; itry<5; ++itry) { for (int i=0; i kQuantize1(int n, const float* X, int8_t* L, std::vector sumx += X[i]; } int64_t D = suml2*n - suml*suml; + auto aold = a, bold = b; a = (sumx*suml2 - sumlx*suml)/D; b = (sumlx*n - sumx*suml)/D; + if (itry > 0 && std::abs(a - aold) < 1e-6*std::abs(aold) && std::abs(b - bold) < 1e-6*std::abs(bold)) break; + } + return {a, b}; +} + +std::pair kQuantize1Fast(int n, const float* X, int8_t* L, int nmax) { + float min = X[0], max = X[1]; + for (int i=1; i>& work, std::vector& tmpX) { auto q = (uint8_t*)y; if (type == 0) { - auto scale = quanizeRmseK(QK, X, L); + auto scale = quanizeRmseK7(QK, X, L); + //auto scale = quanizeRmseFast(QK, X, L); + //auto scale = quanizeRmseOpt(QK, X, L, work); // The following is not quite as good as quanizeRmseK() and it is slower too. 
//if (int(tmpX.size()) < QK) tmpX.resize(QK); //auto r1 = kQuantize0(QK, X, L, work, -8, 7); @@ -241,11 +364,29 @@ void kQuantizeQ4(const float* X, void* buffer, int k, int type) { ////float scale = kQuantize0(QK, X, L, work, -7, 7); std::memcpy(q, &scale, sizeof(scale)); q += sizeof(scale); for (int k=0; k 15) { l1 -= 16; *u |= m; } + m <<= 1; + if (l2 > 15) { l2 -= 16; *u |= m; } + m <<= 1; + q[k] = l1 | (l2 << 4); + } } }; @@ -318,6 +459,14 @@ void kQuantizeQ4_1(const float* x, void* buffer, int k) { kQuantizeQ4(x, buffer, k, 1); } +void kQuantizeQ5_1(const float* x, void* buffer, int k) { + kQuantizeQ4(x, buffer, k, 2); +} + +void kQuantizeQ5_1_Fast(const float* x, void* buffer, int k) { + kQuantizeQ4(x, buffer, k, 3); +} + size_t kQuantizeQ4_0H(const float* x, void* buffer, int k, int64_t* hist) { kQuantizeQ4(x, buffer, k, 0); collectHisto(k, buffer, hist, 0); @@ -330,4 +479,42 @@ size_t kQuantizeQ4_1H(const float* x, void* buffer, int k, int64_t* hist) { return (k / QK) * kBucketSize1; } +size_t kQuantizeQ5_1H(const float* x, void* buffer, int k, int64_t* hist) { + kQuantizeQ4(x, buffer, k, 2); + collectHisto(k, buffer, hist, 1); + return (k / QK) * kBucketSize1; +} + +size_t kQuantizeQ5_1H_Fast(const float* x, void* buffer, int k, int64_t* hist) { + kQuantizeQ4(x, buffer, k, 3); + collectHisto(k, buffer, hist, 1); + return (k / QK) * kBucketSize1; +} + +void kDequantizeQ5_1(const void* x, float* y, int k) { + assert(k % QK == 0); + int n = k / QK; + auto data = (const uint8_t*)x; + for (int i=0; i> 4; + if (u & m) l1 += 16; + m <<= 1; + if (u & m) l2 += 16; + m <<= 1; + *y++ = a + b*l1; + *y++ = a + b*l2; + } + data += 16; + } +} + } diff --git a/ggml_extra.h b/ggml_extra.h index 788fcd0ea3015..7faa4380105f9 100644 --- a/ggml_extra.h +++ b/ggml_extra.h @@ -22,6 +22,12 @@ size_t kQuantizeQ4_0H(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k void kQuantizeQ4_1(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k); size_t kQuantizeQ4_1H(const 
float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist); +void kQuantizeQ5_1(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k); +size_t kQuantizeQ5_1H(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist); +void kQuantizeQ5_1_Fast(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k); +size_t kQuantizeQ5_1H_Fast(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist); +void kDequantizeQ5_1(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); + #ifdef __cplusplus } #endif From 679e1cb6c01b16abe4f3ee3c849813b98970df93 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Wed, 12 Apr 2023 17:10:52 +0200 Subject: [PATCH 10/12] POC: Even lower rmse 4-bit Q4_0 quantization Basically, we use two Q4_0 quantizations, each having 16 weights, to quantize a set of 32 weights. We get two separate scaling factors, which we store as fp16, ending up using the exact same 5 bits per weight as the current Q4_0. We end up with an rmse of ~0.00159, so basically the same as the improved Q4_1. But this should run faster than `Q4_1` (unless fp16 -> fp32 conversion is somehow very slow). 
--- examples/quantize-stats/quantize-stats.cpp | 8 +++-- ggml_extra.cpp | 40 +++++++++++++++++++++- ggml_extra.h | 3 ++ 3 files changed, 47 insertions(+), 4 deletions(-) diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 8ab1d02b08ff2..c924ff3d2cffb 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -307,7 +307,7 @@ int main(int argc, char ** argv) { // loop throught quantization types //for (int i = 0; i < GGML_TYPE_COUNT; i++) { - for (int i = 1; i < 2; i++) { + for (int i = 0; i < 1; i++) { if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) { continue; } @@ -315,8 +315,10 @@ int main(int argc, char ** argv) { if (i < 2 && checkNewQuantization) { //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ4_1; //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1; - qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1_Fast; - if (i == 1) qfns.dequantize_row_q = kDequantizeQ5_1; + //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1_Fast; + //if (i == 1) qfns.dequantize_row_q = kDequantizeQ5_1; + qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ5_1; + qfns.dequantize_row_q = i == 0 ? 
kDequantizeQ4_0K : kDequantizeQ5_1; } if (qfns.quantize_row_q && qfns.dequantize_row_q) { if (params.verbose) { diff --git a/ggml_extra.cpp b/ggml_extra.cpp index ed6330a3e85ce..927ab7e78b84e 100644 --- a/ggml_extra.cpp +++ b/ggml_extra.cpp @@ -369,6 +369,15 @@ void kQuantizeQ4(const float* X, void* buffer, int k, int type) { std::memcpy(q, &result.second, sizeof(result.second)); q += sizeof(result.second); std::memcpy(q, &result.first, sizeof(result.first)); q += sizeof(result.first); for (int k=0; k> 4; + l1 -= 8; l2 -= 8; + *y++ = a*l1; *y++ = a*l2; + } + data += 8; + for (int k=0; k<8; ++k) { + int8_t l1 = data[k] & 15, l2 = data[k] >> 4; + l1 -= 8; l2 -= 8; + *y++ = b*l1; *y++ = b*l2; + } + data += 8; + } +} + } diff --git a/ggml_extra.h b/ggml_extra.h index 7faa4380105f9..6ded616574541 100644 --- a/ggml_extra.h +++ b/ggml_extra.h @@ -28,6 +28,9 @@ void kQuantizeQ5_1_Fast(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int size_t kQuantizeQ5_1H_Fast(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist); void kDequantizeQ5_1(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void kQuantizeQ4_0K(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k); +void kDequantizeQ4_0K(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); + #ifdef __cplusplus } #endif From 6f34961559aaa5bdb323a3cb9cd83709939a8c2a Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Thu, 13 Apr 2023 08:31:21 +0200 Subject: [PATCH 11/12] POC: Q4_1 for groups of 16 weight As last commit, but Q4_1 type, using the same memory as existing Q4_1 via fp16. We end up with rmse 0.00125125, maxerr 0.11657715, 95pct<0.0024, median<0.0010 after a quantize - dequantize roundtrip. 
This is quite a bit better than Q4_1 with groups of 32 weights, but by far not as good as 5-bit quantization that uses the same amount of memory where we had rmse 0.00076131, maxerr 0.05273438, 95pct<0.0016, median<0.0006 --- examples/quantize-stats/quantize-stats.cpp | 8 ++-- ggml_extra.cpp | 43 ++++++++++++++++++++++ ggml_extra.h | 3 ++ 3 files changed, 51 insertions(+), 3 deletions(-) diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index c924ff3d2cffb..ae807f493cafb 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -307,7 +307,7 @@ int main(int argc, char ** argv) { // loop throught quantization types //for (int i = 0; i < GGML_TYPE_COUNT; i++) { - for (int i = 0; i < 1; i++) { + for (int i = 1; i < 2; i++) { if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) { continue; } @@ -317,8 +317,10 @@ int main(int argc, char ** argv) { //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1; //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1_Fast; //if (i == 1) qfns.dequantize_row_q = kDequantizeQ5_1; - qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ5_1; - qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ5_1; + //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ5_1; + //qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ5_1; + qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ4_1K; + qfns.dequantize_row_q = i == 0 ? 
kDequantizeQ4_0K : kDequantizeQ4_1K; } if (qfns.quantize_row_q && qfns.dequantize_row_q) { if (params.verbose) { diff --git a/ggml_extra.cpp b/ggml_extra.cpp index 927ab7e78b84e..787d62edb0ca6 100644 --- a/ggml_extra.cpp +++ b/ggml_extra.cpp @@ -378,6 +378,18 @@ void kQuantizeQ4(const float* X, void* buffer, int k, int type) { std::memcpy(q, &scale1fp16, sizeof(scale1fp16)); q += sizeof(scale1fp16); std::memcpy(q, &scale2fp16, sizeof(scale2fp16)); q += sizeof(scale2fp16); for (int k=0; k> 4; + *y++ = a1 + b1*l1; *y++ = a1 + b1*l2; + } + data += 8; + for (int k=0; k<8; ++k) { + int8_t l1 = data[k] & 15, l2 = data[k] >> 4; + *y++ = a2 + b2*l1; *y++ = a2 + b2*l2; + } + data += 8; + } +} + } diff --git a/ggml_extra.h b/ggml_extra.h index 6ded616574541..bddabc5c16a00 100644 --- a/ggml_extra.h +++ b/ggml_extra.h @@ -31,6 +31,9 @@ void kDequantizeQ5_1(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int void kQuantizeQ4_0K(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k); void kDequantizeQ4_0K(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void kQuantizeQ4_1K(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k); +void kDequantizeQ4_1K(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); + #ifdef __cplusplus } #endif From 97d7ac75657b90a60c35d081f4ef1a7bc5bbe772 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Thu, 13 Apr 2023 12:00:24 +0200 Subject: [PATCH 12/12] POC: Measure rmse of 8 bit quantization q8_0 : rmse 0.00010729, maxerr 0.01030385, 95pct<0.0002, median<0.0002 --- examples/quantize-stats/quantize-stats.cpp | 10 +-- ggml_extra.cpp | 78 +++++++++++++++++++++- ggml_extra.h | 3 + 3 files changed, 84 insertions(+), 7 deletions(-) diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index ae807f493cafb..5789bd9ea392c 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -307,7 +307,7 @@ int main(int argc, char ** 
argv) { // loop throught quantization types //for (int i = 0; i < GGML_TYPE_COUNT; i++) { - for (int i = 1; i < 2; i++) { + for (int i = 0; i < 1; i++) { if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) { continue; } @@ -315,12 +315,14 @@ int main(int argc, char ** argv) { if (i < 2 && checkNewQuantization) { //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ4_1; //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1; - //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1_Fast; + ////qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1_Fast; //if (i == 1) qfns.dequantize_row_q = kDequantizeQ5_1; //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ5_1; //qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ5_1; - qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ4_1K; - qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ4_1K; + //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ4_1K; + //qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ4_1K; + qfns.quantize_row_q = i == 0 ? kQuantizeQ8Simple: kQuantizeQ4_1K; + qfns.dequantize_row_q = i == 0 ? 
kDequantizeQ8: kDequantizeQ4_1K; } if (qfns.quantize_row_q && qfns.dequantize_row_q) { if (params.verbose) { diff --git a/ggml_extra.cpp b/ggml_extra.cpp index 787d62edb0ca6..fa2591e68255e 100644 --- a/ggml_extra.cpp +++ b/ggml_extra.cpp @@ -105,7 +105,30 @@ float quanizeRmseK(int n, const float* X, int8_t* L, sumlx += X[i]*l; suml2 += l*l; L[i] = l; } - return sumlx/suml2; + float scale = sumlx/suml2; + best = scale*sumlx; + for (int itry=0; itry<3; ++itry) { + bool haveChanges = false; + for (int i=0; i 0 && L[i] < nmax) { + auto s1 = sumlx + X[i]; + auto s2 = suml2 + 2*L[i] + 1; + if (s2 > 0 && s1*s1 > best*s2) { + scale = s1/s2; best = scale*s1; ++L[i]; sumlx = s1; suml2 = s2; haveChanges = true; + } + } + else if (g < 0 && L[i] > nmin) { + auto s1 = sumlx - X[i]; + auto s2 = suml2 - 2*L[i] + 1; + if (s2 > 0 && s1*s1 > best*s2) { + scale = s1/s2; best = scale*s1; --L[i]; sumlx = s1; suml2 = s2; haveChanges = true; + } + } + } + if (!haveChanges) break; + } + return scale; } // The following improves the above. // It gives RMSE = 0.00185228 for the 7B model. @@ -125,6 +148,19 @@ float quanizeRmseK15(int n, const float* X, int8_t* L) { return quanizeRmseK(n, X, L, kCandiateCount, candidates, 0, 15); } +float quanizeRmseK31(int n, const float* X, int8_t* L) { + constexpr int kCandiateCount = 24; + static const float candidates[kCandiateCount] = { + +35.25, +34.25f, +33.25f, +32.75f, +32.25f, +31.75f, +31.25f, +30.75f, +30.25f, +29.75f, +29.25f, +28.25f, +27.25f, +26.25f, + +25.25f, +24.25f, +23.25, +22.25f, +21.25f, +20.25f, +19.25f, +18.25f, +17.25f, +16.25f + }; + //static const float candidates[kCandiateCount] = { + // +33.25f, +32.25f, +31.75f, +31.25f, +30.75f, +30.25f, +30.25f, +29.25f, +28.75f, +27.25f, +26.25f, +25.25f, +24.25f, +23.25, +22.25f, + // +21.25f + //}; + return quanizeRmseK(n, X, L, kCandiateCount, candidates, 0, 31); +} + // Fast (as much faster than doing the optimization), but not very good. 
float quanizeRmseFast(int n, const float* X, int8_t* L) { //constexpr int kCandiateCount = 3; @@ -295,8 +331,9 @@ std::pair kQuantize1(int n, const float* X, int8_t* L, std::vector double a = min, b = 0; for (int itry=0; itry<5; ++itry) { for (int i=0; i 0) { + float iscale = 127.f/max; + float scale = max/127.f; + std::memcpy(data, &scale, sizeof(scale)); data += sizeof(scale); + for (int k=0; k<16; ++k) data[k] = toNearestInt(iscale * *x++); + data += 16; + } else { + float scale = 1; + std::memcpy(data, &scale, sizeof(scale)); data += sizeof(scale); + auto aux = (uint32_t*)data; + aux[0] = aux[1] = aux[2] = aux[3] = 0; + data += 16; + } + } +} + +void kDequantizeQ8(const void* x, float* y, int k) { + assert(k % QK == 0); + auto data = (const int8_t*)x; + int n = k / (QK/2); + for (int i=0; i