Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Q4_2 quantization with rmse-optimized scale and quants #1062

Merged
merged 4 commits into from
Apr 19, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 87 additions & 3 deletions ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <inttypes.h>
#include <stdio.h>
#include <float.h>
#include <limits.h>

// if C99 - static_assert is noop
// ref: https://stackoverflow.com/a/53923785/4039976
Expand Down Expand Up @@ -1123,12 +1124,94 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r
}
}

static inline int nearest_int(float fval) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this inline does not do anything here. the static is all you need.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hm actually, after looking at cppref, i am not sure that C and C++ are the same here.

assert(fval <= 4194303.f);
float val = fval + 12582912.f;
int i; memcpy(&i, &val, sizeof(int));
return (i & 0x007fffff) - 0x00400000;
}

static float kquantize_q4_with_bounds(int n, int nmin, int nmax, const float * restrict X, int nCandidates,
const float * restrict candidates, int8_t * restrict L) {
assert (nmin >= INT8_MIN);
assert (nmax <= INT8_MAX);
float amax = 0;
for (int i=0; i<n; ++i) amax = MAX(amax, fabsf(X[i]));
if (!amax) { // all zero
for (int i=0; i<n; ++i) L[i] = 0;
return 1.f;
}
float best = 0, bestScale = 0;
for (int si=0; si<nCandidates; ++si) {
float iscale = candidates[si]/amax;
float sumlxP = 0; int suml2P = 0;
float sumlxM = 0; int suml2M = 0;
for (int i=0; i<n; ++i) {
int l = nearest_int(iscale*X[i]);
int lp = MAX(nmin, MIN(nmax, +l));
int lm = MAX(nmin, MIN(nmax, -l));
sumlxP += X[i]*lp; suml2P += lp*lp;
sumlxM += X[i]*lm; suml2M += lm*lm;
}
float sumlxP2 = sumlxP*sumlxP;
float sumlxM2 = sumlxM*sumlxM;
if (sumlxP2*suml2M > sumlxM2*suml2P) {
if (sumlxP2 > best*suml2P) {
best = sumlxP2/suml2P; bestScale = iscale;
}
} else {
if (sumlxM2 > best*suml2M) {
best = sumlxM2/suml2M; bestScale = -iscale;
}
}
}
float sumlx = 0; int suml2 = 0;
for (int i=0; i<n; ++i) {
int l = nearest_int(bestScale*X[i]);
l = MAX(nmin, MIN(nmax, l));
sumlx += X[i]*l; suml2 += l*l;
L[i] = l;
}
float scale = sumlx/suml2;
return scale;
}

static void quantize_row_q4_2_rmse(const float * restrict x, block_q4_2 * restrict y, int k) {
#define CANDIDATE_COUNT 8
static const float candidates[CANDIDATE_COUNT] = { +8.7f, +8.3f, +8.1f, +7.8f, +7.3f, +7.0f, +6.3f, +5.7f };
assert(k % QK4_2 == 0);

int8_t L[QK4_2];

const int nb = k / QK4_2;

for (int i = 0; i < nb; i++) {

float scale = kquantize_q4_with_bounds(QK4_2, -8, 7, x, CANDIDATE_COUNT, candidates, L);
y[i].d = GGML_FP32_TO_FP16(scale);

for (int l = 0; l < QK4_2; l += 2) {
const uint8_t vi0 = (uint8_t)(L[l+0] + 8);
const uint8_t vi1 = (uint8_t)(L[l+1] + 8);

assert(vi0 < 16);
assert(vi1 < 16);

y[i].qs[l/2] = vi0 | (vi1 << 4);
}

x += QK4_2;
}
}

static void quantize_row_q4_2(const float * restrict x, void * restrict vy, int k) {
assert(k % QK4_2 == 0);

block_q4_2 * restrict y = vy;

quantize_row_q4_2_reference(x, y, k);
//quantize_row_q4_2_reference(x, y, k);
// This produces the exact same format, just better match to the input floats ("better" as measured by RMSE)
quantize_row_q4_2_rmse(x, y, k);
}

// reference implementation for deterministic creation of model files
Expand Down Expand Up @@ -1558,7 +1641,7 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
[GGML_TYPE_Q4_2] = {
.dequantize_row_q = dequantize_row_q4_2,
.quantize_row_q = quantize_row_q4_2,
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_2_reference,
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_2_rmse, //quantize_row_q4_2_reference,
.quantize_row_q_dot = quantize_row_q8_0,
.vec_dot_q = ggml_vec_dot_q4_2_q8_0,
},
Expand Down Expand Up @@ -12298,7 +12381,8 @@ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t *
for (int j = 0; j < n; j += k) {
block_q4_2 * restrict y = (block_q4_2 *)dst + j/QK4_2;

quantize_row_q4_2_reference(src + j, y, k);
//quantize_row_q4_2_reference(src + j, y, k);
quantize_row_q4_2_rmse(src + j, y, k);

for (int i = 0; i < nb; i++) {
for (int l = 0; l < QK4_2; l += 2) {
Expand Down