Further improve Q4_0 MSE
The RMSE of the 7B model becomes 0.00185228.
It looks like the perplexity will end up being around 6.27-6.28.
Kawrakow committed Apr 12, 2023
1 parent 931ae36 commit 6bfb00a
Showing 1 changed file with 102 additions and 11 deletions.
ggml_extra.cpp — 113 changes: 102 additions & 11 deletions
@@ -1,5 +1,6 @@
#include "ggml_extra.h"

#include <limits>
#include <vector>
#include <utility>
#include <algorithm>
@@ -23,6 +24,94 @@ inline int toNearestInt(float fval) {
return (i & 0x007fffff) - 0x00400000;
}
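
For context, only the closing lines of toNearestInt() are visible in this hunk. Below is a minimal sketch of the usual "magic number" round-to-nearest-even trick that matches the bit manipulation above; the kSnapper constant, the assert, and the includes are assumptions, not taken from this commit. Adding 1.5 * 2^23 makes the float adder do the rounding, with ties going to the even integer, which is the behaviour the comments below contrast with round().

// A minimal sketch, assuming the standard "magic number" rounding trick;
// the constant, the assert, and the includes are assumptions. Only the
// return statement of the real toNearestInt() appears in the hunk above.
#include <cassert>
#include <cstring>

inline int toNearestIntSketch(float fval) {
    assert(fval <= 4194303.f);              // the trick needs the magnitude well below 2^22
    constexpr float kSnapper = 3 << 22;     // 1.5 * 2^23 = 12582912.0f
    float val = fval + kSnapper;            // float addition rounds to nearest, ties to even
    int i; std::memcpy(&i, &val, sizeof(int));
    return (i & 0x007fffff) - 0x00400000;   // recover the rounded integer from the mantissa
}
// Example: toNearestIntSketch(2.5f) == 2 (ties to even), while std::round(2.5f) == 3.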

// Adapted from PR #835, function quantize_row_q4_0_rmse()
//
// I absolutely cannot reproduce the rmse = 0.00185915 reported in #835.
// Instead, I get rmse = 0.00197 with the original and rmse = 0.00192
// with the modification that determines the scale actually minimizing
// the rmse.
//
// Do I have a bug? I don't see it.
// The only difference is that I'm using toNearestInt()
// instead of round(), but what are the odds of getting scaled weights at
// exactly 2.5, 4.5, or 6.5, where toNearestInt() and round() differ?
// (toNearestInt() behaves as expected and rounds towards the even integer,
// while round() rounds halfway cases away from zero.)
float quanizeRmse(int n, const float* X, int8_t* L) {
#define Q4_0_SCALE_CANDIDATE_COUNT 8
static const float candidates[Q4_0_SCALE_CANDIDATE_COUNT] = { -8.7f, -8.5f, -8.3f, -8.1f, -7.9f, -7.7f, -7.2f, +7.0f };
float max = 0, amax = 0;
for (int i=0; i<n; ++i) {
float ax = std::abs(X[i]);
if (ax > amax) { amax = ax; max = X[i]; }
}
if (!amax) { // all zero
for (int i=0; i<n; ++i) L[i] = 0;
return 1.f;
}
float best = std::numeric_limits<float>::max(), bestScale = 0;
for (int si=0; si<Q4_0_SCALE_CANDIDATE_COUNT; ++si) {
float iscale = candidates[si]/max;
float err = 0;
for (int i=0; i<n; ++i) {
float sx = iscale*X[i];
int l = std::max(-8, std::min(7, toNearestInt(sx)));
sx -= l;
err += sx*sx;
}
if (err < best) {
best = err; bestScale = iscale;
}
}
// The following is a departure from #835. Given the quants produced by bestScale,
// it determines the scale that actually minimizes the MSE (or RMSE).
// With this, I get rmse = 0.00192 for the 7B model.
float sumlx = 0; int suml2 = 0;
for (int i=0; i<n; ++i) {
int l = std::max(-8, std::min(7, toNearestInt(bestScale*X[i])));
sumlx += X[i]*l; suml2 += l*l;
L[i] = l;
}
return sumlx/suml2;
// The following is what is in quantize_row_q4_0_rmse() in PR #835
// With this version, I get rmse = 0.00197 for the 7B model.
//for (int i=0; i<n; ++i) L[i] = std::max(-8, std::min(7, toNearestInt(bestScale*X[i])));
//return 1/bestScale;
}
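
The reasoning behind returning sumlx/suml2 instead of 1/bestScale: for fixed quants l_i, the error E(d) = sum_i (x_i - d*l_i)^2 is a parabola in d, and setting dE/dd = -2 * sum_i l_i*(x_i - d*l_i) = 0 gives d = sum_i x_i*l_i / sum_i l_i^2, i.e. exactly sumlx/suml2. Here is a small self-contained check of that closed form on hypothetical data (none of these numbers come from the commit):

// A minimal sketch: verify that d = sum(x*L)/sum(L*L) minimizes sum_i (x[i] - d*L[i])^2
// for fixed quants L. Hypothetical values, not taken from the model.
#include <cstdint>
#include <cstdio>

static float squaredError(int n, const float* x, const int8_t* L, float d) {
    float err = 0;
    for (int i = 0; i < n; ++i) { float e = x[i] - d*L[i]; err += e*e; }
    return err;
}

int main() {
    const float  x[4] = {0.11f, -0.52f, 0.33f, 0.74f};
    const int8_t L[4] = {1, -5, 3, 7};
    float sumlx = 0; int suml2 = 0;
    for (int i = 0; i < 4; ++i) { sumlx += x[i]*L[i]; suml2 += L[i]*L[i]; }
    float d = sumlx/suml2;
    std::printf("d = %g, err(d) = %g, err(1.01*d) = %g, err(0.99*d) = %g\n",
                d, squaredError(4, x, L, d),
                squaredError(4, x, L, 1.01f*d), squaredError(4, x, L, 0.99f*d));
    // err(d) is the smallest of the three: err is a parabola with its vertex at d.
    return 0;
}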

// The following improves the above.
// It gives RMSE = 0.00185228 for the 7B model.
float quanizeRmseK(int n, const float* X, int8_t* L) {
constexpr int kCandiateCount = 20;
static const float candidates[kCandiateCount] = { -8.7f, -8.5f, -8.3f, -8.1f, -7.9f, -7.7f, -7.2f, -7.0f, -6.3f, -5.7f,
+8.7f, +8.5f, +8.3f, +8.1f, +7.9f, +7.7f, +7.2f, +7.0f, +6.3f, +5.7f};
float max = 0;
for (int i=0; i<n; ++i) max = std::max(max, std::abs(X[i]));
if (!max) { // all zero
for (int i=0; i<n; ++i) L[i] = 0;
return 1.f;
}
float best = 0, bestScale = 0;
for (int si=0; si<kCandiateCount; ++si) {
float iscale = candidates[si]/max;
float sumlx = 0; int suml2 = 0;
for (int i=0; i<n; ++i) {
int l = std::max(-8, std::min(7, toNearestInt(iscale*X[i])));
sumlx += X[i]*l; suml2 += l*l;
}
if (sumlx*sumlx > best*suml2) {
best = sumlx*sumlx/suml2; bestScale = iscale;
}
}
float sumlx = 0; int suml2 = 0;
for (int i=0; i<n; ++i) {
int l = std::max(-8, std::min(7, toNearestInt(bestScale*X[i])));
sumlx += X[i]*l; suml2 += l*l;
L[i] = l;
}
return sumlx/suml2;
}
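
The selection test sumlx*sumlx > best*suml2 follows from plugging the optimal scale back in: with d = sumlx/suml2 the residual error is sum_i x_i^2 - sumlx^2/suml2, so picking the candidate with the smallest error is the same as picking the one with the largest sumlx^2/suml2, and comparing in cross-multiplied form avoids a division per candidate. Below is a hedged usage sketch for a single block; the block size QK is assumed to be 32, as in ggml's Q4_0, and the input values are hypothetical.

// A minimal usage sketch for quanizeRmseK() on one block of weights.
// Assumes QK == 32 and that quanizeRmseK() from this file is linked in.
#include <cmath>
#include <cstdint>
#include <cstdio>

float quanizeRmseK(int n, const float* X, int8_t* L);   // defined above in ggml_extra.cpp

int main() {
    constexpr int QK = 32;                   // assumed Q4_0 block size
    float x[QK]; int8_t L[QK];
    for (int i = 0; i < QK; ++i) x[i] = std::sin(0.37f*i);   // stand-in for real weights
    float scale = quanizeRmseK(QK, x, L);    // quants land in [-8, 7]
    float err = 0;
    for (int i = 0; i < QK; ++i) { float e = x[i] - scale*L[i]; err += e*e; }
    std::printf("scale = %g, block RMSE = %g\n", scale, std::sqrt(err/QK));
    return 0;
}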

std::pair<float, float> kQuantize0(int n, const float* X, int8_t* L, std::vector<std::pair<float,int>>& work, int nmin, int nmax) {
work.clear();
work.reserve(n*(nmax+2));
@@ -137,17 +226,19 @@ void kQuantizeQ4(const float* X, void* buffer, int k, int type) {
auto processOne = [type] (const float* X, int8_t* L, char* y, std::vector<std::pair<float, int>>& work, std::vector<float>& tmpX) {
auto q = (uint8_t*)y;
if (type == 0) {
if (int(tmpX.size()) < QK) tmpX.resize(QK);
auto r1 = kQuantize0(QK, X, L, work, -8, 7);
for (int i=0; i<QK; ++i) tmpX[i] = -X[i];
int8_t L2[QK];
auto r2 = kQuantize0(QK, tmpX.data(), L2, work, -8, 7);
float scale = r1.first;
if (r2.second > r1.first) {
scale = -r2.first;
std::memcpy(L, L2, QK);
}
//float scale = kQuantize0(QK, X, L, work, -7, 7);
auto scale = quanizeRmseK(QK, X, L);
// The following is not quite as good as quanizeRmseK() and it is slower too.
//if (int(tmpX.size()) < QK) tmpX.resize(QK);
//auto r1 = kQuantize0(QK, X, L, work, -8, 7);
//for (int i=0; i<QK; ++i) tmpX[i] = -X[i];
//int8_t L2[QK];
//auto r2 = kQuantize0(QK, tmpX.data(), L2, work, -8, 7);
//float scale = r1.first;
//if (r2.second > r1.first) {
// scale = -r2.first;
// std::memcpy(L, L2, QK);
//}
////float scale = kQuantize0(QK, X, L, work, -7, 7);
std::memcpy(q, &scale, sizeof(scale)); q += sizeof(scale);
for (int k=0; k<QK/2; ++k) q[k] = (L[2*k] + 8) | ((L[2*k+1] + 8) << 4);
} else {
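
For reference, the packing loop above stores each block as a 4-byte float scale followed by QK/2 bytes, each holding two quants: l + 8 lies in [0, 15], with the even-indexed quant in the low nibble and the odd-indexed quant in the high nibble. A decoding sketch under that assumed layout (not part of this commit):

// A minimal decoding sketch for one block as packed by the loop above
// (layout assumed from that loop: float scale, then QK/2 nibble-packed bytes).
#include <cstdint>
#include <cstring>

void dequantizeBlockSketch(const uint8_t* q, int QK, float* out) {
    float scale;
    std::memcpy(&scale, q, sizeof(scale)); q += sizeof(scale);
    for (int k = 0; k < QK/2; ++k) {
        int lo = (q[k] & 0x0f) - 8;     // even-indexed quant
        int hi = (q[k] >> 4)   - 8;     // odd-indexed quant
        out[2*k]   = scale * lo;
        out[2*k+1] = scale * hi;
    }
}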
