More accurate Q4_0 and Q4_1 quantizations #896

Closed
Wants to merge 12 commits
4 changes: 3 additions & 1 deletion CMakeLists.txt
@@ -235,7 +235,9 @@ endif()
 
 add_library(ggml OBJECT
             ggml.c
-            ggml.h)
+            ggml.h
+            ggml_extra.h
+            ggml_extra.cpp)
 
 target_include_directories(ggml PUBLIC .)
 target_compile_features(ggml PUBLIC c_std_11) # don't bump
27 changes: 15 additions & 12 deletions Makefile
@@ -145,32 +145,35 @@ ggml.o: ggml.c ggml.h
 llama.o: llama.cpp llama.h llama_util.h llama_internal.h
 	$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
 
+ggml_extra.o: ggml_extra.cpp ggml_extra.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 common.o: examples/common.cpp examples/common.h
 	$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
 
 clean:
 	rm -vf *.o main quantize quantize-stats perplexity embedding
 
-main: examples/main/main.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
+main: examples/main/main.cpp ggml.o llama.o common.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 	@echo
 	@echo '==== Run ./main -h for help. ===='
 	@echo
 
-quantize: examples/quantize/quantize.cpp ggml.o llama.o
-	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
+quantize: examples/quantize/quantize.cpp ggml.o llama.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
-	$(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS)
+quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
+perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
+embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-libllama.so: llama.o ggml.o
-	$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
+libllama.so: llama.o ggml.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 #
 # Tests
 #
21 changes: 19 additions & 2 deletions examples/quantize-stats/quantize-stats.cpp
@@ -1,6 +1,7 @@
 #include "ggml.h"
 #include "llama.h"
 #include "llama_internal.h"
+#include "ggml_extra.h"
 
 #include <algorithm>
 #include <cassert>
@@ -29,7 +30,7 @@ struct quantize_stats_params {
     std::vector<enum ggml_type> include_types;
 };
 
-const int64_t SCRATCH_ELEMENTS = 32*32;
+const int64_t SCRATCH_ELEMENTS = 32*32*256; // So we use multi-threading in a meaningful way in the new quantization
 const size_t HISTOGRAM_BUCKETS = 150;
 const double HISTOGRAM_RANGE = 0.03;
 
@@ -184,6 +185,7 @@ int main(int argc, char ** argv) {
     // read command line
 
     bool invalid_param = false;
+    bool checkNewQuantization = false;
     std::string arg;
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
@@ -232,6 +234,8 @@ int main(int argc, char ** argv) {
                 fprintf(stderr, "error: %s not in list of types\n", argv[i]);
                 invalid_param = true;
             }
+        } else if (arg == "-nq" || arg == "--new-quantization") {
+            checkNewQuantization = true;
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             quantize_stats_print_usage(argc, argv);
@@ -302,11 +306,24 @@ int main(int argc, char ** argv) {
     std::vector<float> output_scratch(SCRATCH_ELEMENTS);
 
     // loop throught quantization types
-    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
+    //for (int i = 0; i < GGML_TYPE_COUNT; i++) {
+    for (int i = 0; i < 1; i++) {
         if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
             continue;
         }
         quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
+        if (i < 2 && checkNewQuantization) {
+            //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ4_1;
+            //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1;
+            ////qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1_Fast;
+            //if (i == 1) qfns.dequantize_row_q = kDequantizeQ5_1;
+            //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ5_1;
+            //qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ5_1;
+            //qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0K : kQuantizeQ4_1K;
+            //qfns.dequantize_row_q = i == 0 ? kDequantizeQ4_0K : kDequantizeQ4_1K;
+            qfns.quantize_row_q = i == 0 ? kQuantizeQ8Simple: kQuantizeQ4_1K;
+            qfns.dequantize_row_q = i == 0 ? kDequantizeQ8: kDequantizeQ4_1K;
+        }
         if (qfns.quantize_row_q && qfns.dequantize_row_q) {
             if (params.verbose) {
                 printf("testing %s ...\n", type_strs[i]);
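For context: quantize-stats drives each (quantize_row_q, dequantize_row_q) pair over the model weights and accumulates the round-trip reconstruction error, which is how the new routines selected above via -nq get compared against the stock Q4_0/Q4_1. The sketch below is not this PR's code (the actual implementations live in ggml_extra.cpp, which is not shown in this diff); it is only a minimal, self-contained illustration of that kind of round-trip RMSE measurement, using a plain Q4_1-style per-block min/scale scheme. The names roundtrip_rmse and kBlock, and the synthetic input data, are invented for the example.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Illustrative only: a Q4_1-like round trip (per-block min + scale, 4-bit codes)
// and the RMSE of the reconstruction, similar in spirit to what quantize-stats reports.
constexpr int kBlock = 32; // QK in ggml at the time of this PR

static double roundtrip_rmse(const std::vector<float> & x) {
    double sum_sqr = 0.0;
    for (size_t i = 0; i + kBlock <= x.size(); i += kBlock) {
        float vmin = x[i], vmax = x[i];
        for (int l = 1; l < kBlock; ++l) {
            vmin = std::min(vmin, x[i + l]);
            vmax = std::max(vmax, x[i + l]);
        }
        const float d  = (vmax - vmin) / 15.0f;   // 4 bits -> 16 levels
        const float id = d ? 1.0f / d : 0.0f;
        for (int l = 0; l < kBlock; ++l) {
            const uint8_t q = (uint8_t)std::lround((x[i + l] - vmin) * id); // code in 0..15
            const float   r = d * q + vmin;       // dequantized value
            sum_sqr += (r - x[i + l]) * (r - x[i + l]);
        }
    }
    return std::sqrt(sum_sqr / x.size());
}

int main() {
    // Synthetic data standing in for a tensor row; the real tool uses model weights.
    std::vector<float> x(1024);
    for (size_t i = 0; i < x.size(); ++i) x[i] = std::sin(0.01f * i);
    std::printf("rmse = %g\n", roundtrip_rmse(x));
    return 0;
}

The "more accurate" Q4_0/Q4_1 in this PR aim to reduce exactly this kind of error; running quantize-stats with -nq swaps the row functions so both variants are measured by the same harness.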
2 changes: 2 additions & 0 deletions examples/quantize/quantize.cpp
@@ -14,6 +14,8 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
         fprintf(stderr, " type = 2 - q4_0\n");
         fprintf(stderr, " type = 3 - q4_1\n");
+        fprintf(stderr, " type = 4 - new q4_0\n");
+        fprintf(stderr, " type = 5 - new q4_1\n");
         return 1;

23 changes: 21 additions & 2 deletions ggml.c
@@ -2,6 +2,7 @@
 #define _GNU_SOURCE
 
 #include "ggml.h"
+#include "ggml_extra.h"
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -502,6 +503,13 @@ typedef struct {
 } block_q4_1;
 static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK / 2, "wrong q4_1 block size/padding");
 
+inline int nearestInt(float fval) {
+    assert(fval <= 4194303.f);
+    float val = fval + 12582912.f;
+    int i; memcpy(&i, &val, sizeof(int));
+    return (i & 0x007fffff) - 0x00400000;
+}
+
 // reference implementation for deterministic creation of model files
 static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
     assert(k % QK == 0);
@@ -526,8 +534,15 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
         const float v0 = x[i*QK + l + 0]*id;
         const float v1 = x[i*QK + l + 1]*id;
 
+        // On x86_64 and x86, round is amazingly slow.
+        // Here it is best to just use this:
+        //const uint8_t vi0 = (uint8_t)(v0 + 8.5f);
+        //const uint8_t vi1 = (uint8_t)(v1 + 8.5f);
         const uint8_t vi0 = (int8_t)roundf(v0) + 8;
         const uint8_t vi1 = (int8_t)roundf(v1) + 8;
+        // This is marginally slower (but still much faster than round())
+        //const uint8_t vi0 = nearestInt(v0) + 8;
+        //const uint8_t vi1 = nearestInt(v1) + 8;
 
         assert(vi0 < 16);
         assert(vi1 < 16);
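For context, the nearestInt() helper added above uses the classic floating-point "magic number" trick: adding 12582912.0f (1.5 * 2^23) pushes the value into a range where the float's ULP is exactly 1, so the hardware rounds to the nearest integer, which can then be read straight out of the low 23 mantissa bits. The standalone sketch below is not part of the PR; it assumes IEEE-754 binary32 floats, 32-bit int, and the default round-to-nearest FP mode, and the driver and sample values are purely illustrative. Note that the trick rounds halfway cases to even, while roundf() rounds them away from zero.

#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>

// Same trick as nearestInt() in this PR; valid roughly for |fval| < 2^22.
static inline int nearest_int(float fval) {
    assert(fval <= 4194303.f && fval >= -4194303.f);
    float val = fval + 12582912.f;        // 12582912 = 1.5 * 2^23, so ulp(val) == 1
    int i;
    memcpy(&i, &val, sizeof(int));        // reinterpret the float's bit pattern
    return (i & 0x007fffff) - 0x00400000; // low 23 mantissa bits minus the 2^22 offset
}

int main() {
    const float samples[] = {-7.49f, -0.4f, 0.4f, 7.49f, 1000.25f};
    for (float x : samples) {
        std::printf("x = %9.3f   nearest_int = %5d   roundf = %5d\n",
                    x, nearest_int(x), (int)roundf(x));
    }
    return 0;
}

The commented-out alternative in the diff, (uint8_t)(v0 + 8.5f), is essentially the same idea taken one step further: since v0 is already known to lie in [-8, 7], adding 8.5f and truncating yields round-half-up plus the +8 bias in a single step.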
@@ -818,6 +833,10 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric
         const float v0 = (x[i*QK + l + 0] - min)*id;
         const float v1 = (x[i*QK + l + 1] - min)*id;
 
+        // For some reason round() is amazingly slow on X86_64 and x86
+        // Using this instead reduces the difference between AVX2 and scalar to less than ~15%
+        //const uint8_t vi0 = nearestInt(v0);
+        //const uint8_t vi1 = nearestInt(v1);
         const uint8_t vi0 = roundf(v0);
         const uint8_t vi1 = roundf(v1);
 
@@ -2569,7 +2588,7 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
     1,
 };
 
-static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 5");
+static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 7");
 
 static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
     sizeof(block_q4_0),
@@ -2582,7 +2601,7 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
 };
 
 // don't forget to update the array above when adding new types
-static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 5");
+static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 7");
 
 static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
     "NONE",