Generalize quantize_fns for simpler FP16 handling #1237

Merged · 4 commits · Jul 5, 2023
14 changes: 7 additions & 7 deletions examples/quantize-stats/quantize-stats.cpp
@@ -147,7 +147,7 @@ void test_roundtrip_on_chunk(
const ggml_tensor * layer,
int64_t offset,
int64_t chunk_size,
-const quantize_fns_t & qfns,
+const ggml_type_traits_t & qfns,
bool use_reference,
float * input_scratch,
char * quantized_scratch,
@@ -163,11 +163,11 @@
}

if (use_reference) {
-qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
+qfns.from_float_reference(input_scratch, quantized_scratch, chunk_size);
} else {
-qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
+qfns.from_float(input_scratch, quantized_scratch, chunk_size);
}
-qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
+qfns.to_float(quantized_scratch, output_scratch, chunk_size);

update_error_stats(chunk_size, input_scratch, output_scratch, stats);
}
@@ -177,7 +177,7 @@ void test_roundtrip_on_chunk(
void test_roundtrip_on_layer(
std::string & name,
bool print_layer_stats,
-const quantize_fns_t & qfns,
+const ggml_type_traits_t & qfns,
bool use_reference,
const ggml_tensor * layer,
std::vector<float> & input_scratch,
@@ -388,8 +388,8 @@ int main(int argc, char ** argv) {
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
continue;
}
-quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
-if (qfns.quantize_row_q && qfns.dequantize_row_q) {
+ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
+if (qfns.from_float && qfns.to_float) {
if (params.verbose) {
printf("testing %s ...\n", ggml_type_name(type));
}
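For context, a round trip through the renamed entry points now looks like the following sketch (assumes this PR's ggml.h; the type, buffer sizes, and values are illustrative, with scratch sized generously as quantize-stats does):

```c
#include "ggml.h"
#include <stdint.h>
#include <stdio.h>

int main(void) {
    enum { N = 256 };                 /* multiple of the Q4_0 block size (32) */
    static float   src[N];
    static uint8_t quantized[2 * N];  /* generous scratch, as in quantize-stats */
    static float   dst[N];

    for (int i = 0; i < N; i++) {
        src[i] = 0.01f * (float) i;
    }

    ggml_type_traits_t qfns = ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
    if (qfns.from_float && qfns.to_float) {
        qfns.from_float(src, quantized, N);  /* was qfns.quantize_row_q   */
        qfns.to_float(quantized, dst, N);    /* was qfns.dequantize_row_q */
        printf("src[10] = %f, roundtrip = %f\n", src[10], dst[10]);
    }
    return 0;
}
```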
651 changes: 108 additions & 543 deletions ggml.c

Large diffs are not rendered by default.
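Most of the 543 lines deleted from ggml.c come from collapsing per-type lookups into a single traits table indexed by ggml_type. A sketch of the shape, as an assumption for illustration rather than the verbatim ggml.c contents (the kernel names are the pre-existing per-type functions):

```c
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
    [GGML_TYPE_F16] = {
        /* F16 rides the same table -- the "simpler FP16 handling" of the title */
        .to_float   = (ggml_to_float_t)   ggml_fp16_to_fp32_row,
        .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
    },
    [GGML_TYPE_Q4_0] = {
        .to_float             = (ggml_to_float_t)   dequantize_row_q4_0,
        .from_float           = (ggml_from_float_t) quantize_row_q4_0,
        .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
        .vec_dot              = ggml_vec_dot_q4_0_q8_0,
        .vec_dot_type         = GGML_TYPE_Q8_0,
    },
    /* ... one entry per type ... */
};

ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
    return type_traits[type];
}
```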

33 changes: 13 additions & 20 deletions ggml.h
@@ -224,8 +224,8 @@ extern "C" {
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);

-GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
-GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
+GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
+GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);

struct ggml_object;
struct ggml_context;
@@ -1490,26 +1490,19 @@ extern "C" {
// Internal types and functions exposed for tests and benchmarks
//

-#ifdef __cplusplus
-// restrict not standard in C++
-#define GGML_RESTRICT
-#else
-#define GGML_RESTRICT restrict
-#endif
-typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-typedef void (*quantize_row_q_t) (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-typedef void (*vec_dot_q_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+typedef void (*ggml_to_float_t)(const void * x, float * y, int k);
+typedef void (*ggml_from_float_t)(const float * x, void * y, int k);
+typedef void (*ggml_vec_dot_t)(const int n, float * s, const void * x, const void * y);

typedef struct {
-    dequantize_row_q_t dequantize_row_q;
-    quantize_row_q_t quantize_row_q;
-    quantize_row_q_t quantize_row_q_reference;
-    quantize_row_q_t quantize_row_q_dot;
-    vec_dot_q_t vec_dot_q;
-    enum ggml_type vec_dot_type;
-} quantize_fns_t;
-
-quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+    ggml_to_float_t to_float;
+    ggml_from_float_t from_float;
+    ggml_from_float_t from_float_reference;
+    ggml_vec_dot_t vec_dot;
+    enum ggml_type vec_dot_type;
+} ggml_type_traits_t;
+
+ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);

#ifdef __cplusplus
}
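The practical payoff of the generalization: GGML_TYPE_F16 can be looked up like any other type, so callers no longer special-case the fp16 row converters. A minimal sketch, assuming the F16 entry wires from_float/to_float to the fp32/fp16 row converters as the PR title implies:

```c
#include "ggml.h"
#include <stdio.h>

int main(void) {
    float       in[32], out[32];
    ggml_fp16_t half[32];

    for (int i = 0; i < 32; i++) {
        in[i] = 0.125f * (float) i;
    }

    ggml_type_traits_t t = ggml_internal_get_type_traits(GGML_TYPE_F16);
    t.from_float(in, half, 32);  /* fp32 -> fp16 row */
    t.to_float(half, out, 32);   /* fp16 -> fp32 row */

    printf("in[5] = %f, out[5] = %f\n", in[5], out[5]);
    return 0;
}
```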
10 changes: 5 additions & 5 deletions llama.cpp
@@ -2252,10 +2252,10 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
}
float * f32_output = (float *) output.addr;

-quantize_fns_t qtype;
+ggml_type_traits_t qtype;
if (ggml_is_quantized(tensor.type)) {
-qtype = ggml_internal_get_quantize_fn(tensor.type);
-if (qtype.dequantize_row_q == NULL) {
+qtype = ggml_internal_get_type_traits(tensor.type);
+if (qtype.to_float == NULL) {
throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
}
} else if (tensor.type != GGML_TYPE_F16) {
@@ -2266,7 +2266,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
if (tensor.type == GGML_TYPE_F16) {
ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
} else if (ggml_is_quantized(tensor.type)) {
-qtype.dequantize_row_q(tensor.data, f32_output, nelements);
+qtype.to_float(tensor.data, f32_output, nelements);
} else {
LLAMA_ASSERT(false); // unreachable
}
@@ -2291,7 +2291,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
if (typ == GGML_TYPE_F16) {
ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
} else {
-qtype.dequantize_row_q(inbuf, outbuf, nels);
+qtype.to_float(inbuf, outbuf, nels);
}
};
workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
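The conversion path above boils down to a small dispatch. A simplified sketch of that logic (the helper name is hypothetical; the real code also rejects unsupported types up front and splits rows across worker threads):

```c
#include "ggml.h"
#include <stdio.h>

void convert_row_to_f32(enum ggml_type type, const void * in, float * out, int n) {
    if (type == GGML_TYPE_F16) {
        ggml_fp16_to_fp32_row((const ggml_fp16_t *) in, out, n);
    } else if (ggml_is_quantized(type)) {
        ggml_type_traits_t qtype = ggml_internal_get_type_traits(type);
        if (qtype.to_float == NULL) {
            /* no dequantizer registered for this type */
            fprintf(stderr, "cannot dequantize %s\n", ggml_type_name(type));
            return;
        }
        qtype.to_float(in, out, n);
    }
}
```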
6 changes: 3 additions & 3 deletions pocs/vdot/q8dot.cpp
@@ -136,7 +136,7 @@ int main(int argc, char** argv) {

auto ggml_type = type == 0 ? GGML_TYPE_Q4_0 : GGML_TYPE_Q4_1;

-auto funcs = ggml_internal_get_quantize_fn(ggml_type);
+auto funcs = ggml_internal_get_type_traits(ggml_type);

Stat simple, ggml;

@@ -156,8 +156,8 @@

t1 = std::chrono::high_resolution_clock::now();
float fs;
-if (type == 0) funcs.vec_dot_q(kVecSize * QK4_1, &fs, x40.data(), y.data());
-else funcs.vec_dot_q(kVecSize * QK4_1, &fs, x41.data(), y.data());
+if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, x40.data(), y.data());
+else funcs.vec_dot(kVecSize * QK4_1, &fs, x41.data(), y.data());
t2 = std::chrono::high_resolution_clock::now();
t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
if (iloop > 3) ggml.addResult(fs, t);
Expand Down
13 changes: 7 additions & 6 deletions pocs/vdot/vdot.cpp
@@ -235,7 +235,7 @@ int main(int argc, char** argv) {
int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64);
int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64);

-auto funcs = useQ4_1 ? ggml_internal_get_quantize_fn(GGML_TYPE_Q4_1) : ggml_internal_get_quantize_fn(GGML_TYPE_Q4_0);
+auto funcs = useQ4_1 ? ggml_internal_get_type_traits(GGML_TYPE_Q4_1) : ggml_internal_get_type_traits(GGML_TYPE_Q4_0);

std::vector<block_q4_0> q40;
std::vector<block_q4_1> q41;
@@ -261,9 +261,9 @@
// Note, we do not include this in the timing as in practical application
// we already have the quantized model weights.
if (useQ4_1) {
-funcs.quantize_row_q(x1.data(), q41.data(), kVecSize);
+funcs.from_float(x1.data(), q41.data(), kVecSize);
} else {
-funcs.quantize_row_q(x1.data(), q40.data(), kVecSize);
+funcs.from_float(x1.data(), q40.data(), kVecSize);
}

// Now measure time the dot product needs using the "scalar" version above
@@ -282,9 +282,10 @@
dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
}
else {
-funcs.quantize_row_q_dot(y1.data(), q8.data(), kVecSize);
-if (useQ4_1) funcs.vec_dot_q(kVecSize, &result, q41.data(), q8.data());
-else funcs.vec_dot_q(kVecSize, &result, q40.data(), q8.data());
+auto vdot = ggml_internal_get_type_traits(funcs.vec_dot_type);
+vdot.from_float(y1.data(), q8.data(), kVecSize);
+if (useQ4_1) funcs.vec_dot(kVecSize, &result, q41.data(), q8.data());
+else funcs.vec_dot(kVecSize, &result, q40.data(), q8.data());
}
sumq += result;
t2 = std::chrono::high_resolution_clock::now();
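The removed quantize_row_q_dot member is replaced by an indirection: vec_dot_type names the companion type for the activation side (Q8_0 for the Q4 types), and that type's own from_float performs the quantization. In sketch form, with a hypothetical helper and scratch management omitted:

```c
#include "ggml.h"

/* dot product of pre-quantized weights against fp32 activations;
 * act_q must be large enough for n values of the vec_dot_type */
static float quantized_dot(enum ggml_type wtype, const void * w_q,
                           const float * act, void * act_q, int n) {
    ggml_type_traits_t wt = ggml_internal_get_type_traits(wtype);
    ggml_type_traits_t vt = ggml_internal_get_type_traits(wt.vec_dot_type);

    vt.from_float(act, act_q, n);  /* e.g. fp32 -> Q8_0 */

    float result = 0.0f;
    wt.vec_dot(n, &result, w_q, act_q);
    return result;
}
```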
30 changes: 16 additions & 14 deletions tests/test-quantize-fns.cpp
@@ -40,26 +40,26 @@ float array_rmse(const float * a1, const float * a2, size_t n) {
}

// Total quantization error on test data
-float total_quantization_error(quantize_fns_t & qfns, size_t test_size, const float * test_data) {
+float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
std::vector<uint8_t> tmp_q(2*test_size);
std::vector<float> tmp_out(test_size);

-qfns.quantize_row_q(test_data, tmp_q.data(), test_size);
-qfns.dequantize_row_q(tmp_q.data(), tmp_out.data(), test_size);
+qfns.from_float(test_data, tmp_q.data(), test_size);
+qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
return array_rmse(test_data, tmp_out.data(), test_size);
}

// Total quantization error on test data
-float reference_quantization_error(quantize_fns_t & qfns, size_t test_size, const float * test_data) {
+float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
std::vector<uint8_t> tmp_q(2*test_size);
std::vector<float> tmp_out(test_size);
std::vector<float> tmp_out_ref(test_size);

-qfns.quantize_row_q(test_data, tmp_q.data(), test_size);
-qfns.dequantize_row_q(tmp_q.data(), tmp_out.data(), test_size);
+qfns.from_float(test_data, tmp_q.data(), test_size);
+qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);

-qfns.quantize_row_q_reference(test_data, tmp_q.data(), test_size);
-qfns.dequantize_row_q(tmp_q.data(), tmp_out_ref.data(), test_size);
+qfns.from_float_reference(test_data, tmp_q.data(), test_size);
+qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size);

return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
}
@@ -73,15 +73,17 @@ float dot_product(const float * a1, const float * a2, size_t test_size) {
}

// Total dot product error
-float dot_product_error(quantize_fns_t & qfns, size_t test_size, const float * test_data1, const float *test_data2) {
+float dot_product_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2) {
std::vector<uint8_t> tmp_q1(2*test_size);
std::vector<uint8_t> tmp_q2(2*test_size);

-qfns.quantize_row_q (test_data1, tmp_q1.data(), test_size);
-qfns.quantize_row_q_dot(test_data2, tmp_q2.data(), test_size);
+auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
+
+qfns.from_float(test_data1, tmp_q1.data(), test_size);
+vdot.from_float(test_data2, tmp_q2.data(), test_size);

float result = INFINITY;
-qfns.vec_dot_q(test_size, &result, tmp_q1.data(), tmp_q2.data());
+qfns.vec_dot(test_size, &result, tmp_q1.data(), tmp_q2.data());

const float dot_ref = dot_product(test_data1, test_data2, test_size);

@@ -123,9 +125,9 @@ int main(int argc, char * argv[]) {

for (int i = 0; i < GGML_TYPE_COUNT; i++) {
ggml_type type = (ggml_type) i;
-quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
+ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);

-if (qfns.quantize_row_q && qfns.dequantize_row_q) {
+if (qfns.from_float && qfns.to_float) {
const float total_error = total_quantization_error(qfns, test_size, test_data.data());
const float max_quantization_error =
type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
25 changes: 13 additions & 12 deletions tests/test-quantize-perf.cpp
@@ -123,9 +123,9 @@ void usage(char * argv[]) {
printf(" --type TYPE set test type as");
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
ggml_type type = (ggml_type) i;
-quantize_fns_t qfns = ggml_internal_get_quantize_fn(type);
+ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
if (ggml_type_name(type) != NULL) {
-if (qfns.quantize_row_q && qfns.dequantize_row_q) {
+if (qfns.from_float && qfns.to_float) {
printf(" %s", ggml_type_name(type));
}
}
@@ -271,20 +271,20 @@ int main(int argc, char * argv[]) {

for (int i = 0; i < GGML_TYPE_COUNT; i++) {
ggml_type type = (ggml_type) i;
-quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
+ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
continue;
}

-if (qfns.quantize_row_q && qfns.dequantize_row_q) {
+if (qfns.from_float && qfns.to_float) {
printf("%s\n", ggml_type_name(type));

if (params.op_quantize_row_q_reference) {
printf(" quantize_row_q_reference\n");
for (size_t size : params.test_sizes) {
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
auto quantize_fn = [&](void) {
-qfns.quantize_row_q_reference(test_data1, test_q1, size);
+qfns.from_float_reference(test_data1, test_q1, size);
return test_q1[0];
};
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
@@ -298,7 +298,7 @@ int main(int argc, char * argv[]) {
for (size_t size : params.test_sizes) {
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
auto quantize_fn = [&](void) {
-qfns.quantize_row_q(test_data1, test_q1, size);
+qfns.from_float(test_data1, test_q1, size);
return test_q1[0];
};
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
@@ -309,11 +309,11 @@ int main(int argc, char * argv[]) {

if (params.op_dequantize_row_q) {
printf(" dequantize_row_q\n");
-qfns.quantize_row_q(test_data1, test_q1, largest);
+qfns.from_float(test_data1, test_q1, largest);
for (size_t size : params.test_sizes) {
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
auto quantize_fn = [&](void) {
-qfns.dequantize_row_q(test_q1, test_out, size);
+qfns.to_float(test_q1, test_out, size);
return test_out[0];
};
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
@@ -327,7 +327,8 @@ int main(int argc, char * argv[]) {
for (size_t size : params.test_sizes) {
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
auto quantize_fn = [&](void) {
-qfns.quantize_row_q_dot(test_data1, test_q1, size);
+auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
+vdot.from_float(test_data1, test_q1, size);
return test_q1[0];
};
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
@@ -338,13 +339,13 @@ int main(int argc, char * argv[]) {

if (params.op_vec_dot_q) {
printf(" vec_dot_q\n");
-qfns.quantize_row_q(test_data1, test_q1, largest);
-qfns.quantize_row_q(test_data2, test_q2, largest);
+qfns.from_float(test_data1, test_q1, largest);
+qfns.from_float(test_data2, test_q2, largest);
for (size_t size : params.test_sizes) {
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
auto quantize_fn = [&](void) {
float result;
-qfns.vec_dot_q(size, &result, test_q1, test_q2);
+qfns.vec_dot(size, &result, test_q1, test_q2);
return result;
};
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
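For a quick local sanity check of the renamed entry points, a minimal timing harness in the same spirit might look as follows (illustrative only: a POSIX clock instead of the test's own cycle counters, a single size, no warmup):

```c
#include "ggml.h"
#include <stdint.h>
#include <stdio.h>
#include <time.h>

int main(void) {
    enum { N = 1 << 20 };
    static float   data[N];
    static uint8_t q[N];  /* ample: Q4_0 stores roughly 0.56 bytes per value */

    for (int i = 0; i < N; i++) {
        data[i] = (float) i / (float) N;
    }

    ggml_type_traits_t qfns = ggml_internal_get_type_traits(GGML_TYPE_Q4_0);

    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    qfns.from_float(data, q, N);
    clock_gettime(CLOCK_MONOTONIC, &t1);

    double secs = (double) (t1.tv_sec - t0.tv_sec) + 1e-9 * (double) (t1.tv_nsec - t0.tv_nsec);
    printf("from_float(Q4_0): %.1f M values/s\n", (double) N / secs / 1e6);
    return 0;
}
```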