-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbenchmark.c
624 lines (528 loc) · 20 KB
/
benchmark.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
// Include necessary headers
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h> // For memset
#include <time.h>
#include <math.h>
#include <assert.h>
// Constants
#define QK8_0 32
#define QK4_0 32
#define BLOCK_SIZE 4
#define NCOLS_INTERLEAVED 4
// Define ggml_fp16_t as uint16_t (half-precision floating-point)
typedef uint16_t ggml_fp16_t;
// Function to convert ggml_fp16_t to float
static inline float GGML_FP16_TO_FP32(ggml_fp16_t h) {
uint16_t h_exp = (h & 0x7C00) >> 10;
uint16_t h_sig = h & 0x03FF;
uint32_t f_sgn = (h & 0x8000) << 16;
uint32_t f_exp, f_sig;
if (h_exp == 0) {
// Zero / subnormal
if (h_sig == 0) {
// Zero
f_exp = 0;
f_sig = 0;
} else {
// Subnormal
h_exp = 1;
while ((h_sig & 0x0400) == 0) {
h_sig <<= 1;
h_exp -= 1;
}
h_sig &= 0x03FF;
f_exp = (h_exp + (127 - 15)) << 23;
f_sig = h_sig << 13;
}
} else if (h_exp == 0x1F) {
// Inf / NaN
f_exp = 0xFF << 23;
f_sig = h_sig << 13;
} else {
// Normalized
f_exp = (h_exp + (127 - 15)) << 23;
f_sig = h_sig << 13;
}
uint32_t f = f_sgn | f_exp | f_sig;
float result = *((float *)&f);
return result;
}
// Function to convert float to ggml_fp16_t
static inline ggml_fp16_t GGML_FP32_TO_FP16(float f) {
uint32_t x = *((uint32_t *)&f);
uint16_t h = 0;
uint32_t sign = (x >> 31) & 0x1;
uint32_t mantissa = x & 0x7FFFFF;
int32_t exp = (int32_t)((x >> 23) & 0xFF) - 127;
if (exp > 15) {
// Overflow, set to Inf
h = (sign << 15) | (0x1F << 10);
} else if (exp > -15) {
// Normalized number
exp += 15;
mantissa >>= 13;
h = (sign << 15) | (exp << 10) | (mantissa & 0x3FF);
} else {
// Underflow, set to zero
h = 0;
}
return h;
}
// Data structures
typedef struct {
ggml_fp16_t d; // scale (half-precision float)
uint8_t qs[QK4_0 / 2]; // quantized values (4-bit packed into bytes)
} block_q4_0;
typedef struct {
ggml_fp16_t d[4]; // scales (half-precision floats)
uint8_t qs[QK4_0 * 2]; // quantized values (4-bit packed into bytes)
} block_q4_0x4;
typedef struct {
ggml_fp16_t d[8]; // scales (half-precision floats)
uint8_t qs[QK4_0 * 4]; // quantized values (4-bit packed into bytes)
} block_q4_0x8;
typedef struct {
ggml_fp16_t d; // scale (half-precision float)
int8_t qs[QK8_0]; // quantized values (8-bit)
} block_q8_0;
// Utility macros
#define UNUSED(x) (void)(x)
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
// Function prototypes
void gemv_q4_0_4x4_q8_0_scalar(
int n,
float * restrict s,
size_t bs,
const block_q4_0x4 * restrict vx,
const block_q8_0 * restrict vy,
int nr,
int nc
);
void gemv_unquantized(
int n,
float * restrict s,
const float * restrict vx,
const float * restrict vy,
int nr,
int nc
);
// Functions for quantization
// Function to quantize a single row to q4_0 format
void quantize_row_q4_0_ref(const float * restrict src, block_q4_0 * restrict dst, int k) {
float max_abs = 0.0f;
for (int i = 0; i < k; ++i) {
float v = src[i];
if (fabsf(v) > max_abs) {
max_abs = fabsf(v);
}
}
const float d = max_abs / 7.0f; // 4-bit quantization, so 2^3 - 1 = 7
const float id = (d != 0.0f) ? 1.0f / d : 0.0f;
dst->d = GGML_FP32_TO_FP16(d);
for (int i = 0; i < k; i += 2) {
uint8_t q = 0;
for (int j = 0; j < 2; ++j) {
float v = src[i + j] * id;
int vi = (int)(roundf(v));
vi = (vi < -8) ? -8 : (vi > 7) ? 7 : vi;
uint8_t qvi = (uint8_t)(vi + 8);
q |= (qvi & 0x0F) << (j * 4);
}
dst->qs[i / 2] = q;
}
}
// Function to make block_q4_0x4 from block_q4_0's
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave, unsigned int xor_mask) {
block_q4_0x4 out;
for (int i = 0; i < 4; i++) {
out.d[i] = in[i].d;
}
for (int i = 0; i < QK4_0 * 2; i++) {
int src_offset = (i / (4 * blck_size_interleave)) * blck_size_interleave;
int src_id = (i % (4 * blck_size_interleave)) / blck_size_interleave;
src_offset += (i % blck_size_interleave);
out.qs[i] = in[src_id].qs[src_offset] ^ xor_mask;
}
return out;
}
// Function to quantize a matrix to q4_0_4x4 format
static size_t quantize_q4_0_nr_bl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, int nrows_interleaved, int blck_size_interleave) {
assert(n_per_row % QK4_0 == 0);
const int nb = n_per_row / QK4_0;
block_q4_0x4 * out_ptr = (block_q4_0x4 *) dst;
assert(nrows_interleaved <= 4);
block_q4_0 dst_tmp[4];
for (int b = 0; b < nrow; b += nrows_interleaved) {
for (int64_t x = 0; x < nb; x++) {
for (int i = 0; i < nrows_interleaved; i++ ) {
quantize_row_q4_0_ref(src + (b + i) * n_per_row + x * QK4_0, (block_q4_0 *) &dst_tmp[i], QK4_0);
}
*(block_q4_0x4 *) out_ptr = make_block_q4_0x4(dst_tmp, blck_size_interleave, 0x88);
out_ptr += 1;
}
}
return ((nrow * n_per_row) / QK4_0 * sizeof(block_q4_0));
}
size_t quantize_q4_0_4x4(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row) {
return quantize_q4_0_nr_bl(src, dst, nrow, n_per_row, 4, 4);
}
// Function to quantize q8_0 format
void quantize_q8_0_4x4(const float * restrict x, void * restrict vy, int64_t k) {
assert(QK8_0 == 32);
assert(k % QK8_0 == 0);
const int nb = k / QK8_0;
block_q8_0 * restrict y = (block_q8_0 *) vy;
for (int i = 0; i < nb; i++) {
float max_abs = 0.0f;
for (int j = 0; j < QK8_0; j++) {
float v = x[i * QK8_0 + j];
if (fabsf(v) > max_abs) {
max_abs = fabsf(v);
}
}
float d = max_abs / 127.0f;
float id = (d != 0.0f) ? 1.0f / d : 0.0f;
y[i].d = GGML_FP32_TO_FP16(d);
for (int j = 0; j < QK8_0; j++) {
float v = x[i * QK8_0 + j] * id;
int8_t q = (int8_t)(roundf(v));
y[i].qs[j] = q;
}
}
}
// Assembly function provided by the user
void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, const void * restrict vy, int nr, int nc) {
const int qk = QK8_0;
const int nb = n / qk;
const int ncols_interleaved = 4;
const int blocklen = 4;
assert (n % qk == 0);
assert (nc % ncols_interleaved == 0);
UNUSED(s);
UNUSED(bs);
UNUSED(vx);
UNUSED(vy);
UNUSED(nr);
UNUSED(nc);
UNUSED(nb);
UNUSED(ncols_interleaved);
UNUSED(blocklen);
// Print metadata inputs and variables
// printf("Metadata Inputs and Variables:\n");
// printf("n: %d\n", n);
// printf("bs: %zu\n", bs);
// printf("nr: %d\n", nr);
// printf("nc: %d\n", nc);
#if defined(__aarch64__) && defined(__ARM_NEON)
{
const void * b_ptr = vx;
const void * a_ptr = vy;
float * res_ptr = s;
__asm__ __volatile__(
"movi v31.16b, #0x4\n"
"movi v30.16b, #0xf0\n"
"add %x[b_ptr], %x[b_ptr], #0x8\n"
"1:" // Column loop
"add x22, %x[a_ptr], #0x2\n"
"movi v29.16b, #0x0\n"
"mov x21, %x[nb]\n"
"2:" // Block loop
"ldr q28, [%x[b_ptr], #0x0]\n"
"ldr q27, [x22, #0x0]\n"
"movi v26.4s, #0x0\n"
"sub x20, x22, #0x2\n"
"ldr q25, [x22, #0x10]\n"
"ldr q24, [%x[b_ptr], #0x10]\n"
"sub x21, x21, #0x1\n"
"add x22, x22, #0x22\n"
"ldr q23, [%x[b_ptr], #0x20]\n"
"ldr q22, [%x[b_ptr], #0x30]\n"
"ld1r { v21.8h }, [x20]\n"
"ldr q20, [%x[b_ptr], #-0x8]\n"
"sshl v16.16b, v28.16b, v31.16b\n"
"and v28.16b, v28.16b, v30.16b\n"
"sshl v19.16b, v24.16b, v31.16b\n"
"and v24.16b, v24.16b, v30.16b\n"
"add %x[b_ptr], %x[b_ptr], #0x48\n"
"sshl v18.16b, v23.16b, v31.16b\n"
"and v23.16b, v23.16b, v30.16b\n"
".inst 0x4f9be21a // sdot v26.4s, v16.16b, v27.4b[0]\n"
"sshl v17.16b, v22.16b, v31.16b\n"
"and v22.16b, v22.16b, v30.16b\n"
"fcvtl v21.4s, v21.4h\n"
"fcvtl v16.4s, v20.4h\n"
".inst 0x4f99e39a // sdot v26.4s, v28.16b, v25.4b[0]\n"
"fmul v16.4s, v16.4s, v21.4s\n"
".inst 0x4fbbe27a // sdot v26.4s, v19.16b, v27.4b[1]\n"
".inst 0x4fb9e31a // sdot v26.4s, v24.16b, v25.4b[1]\n"
".inst 0x4f9bea5a // sdot v26.4s, v18.16b, v27.4b[2]\n"
".inst 0x4f99eafa // sdot v26.4s, v23.16b, v25.4b[2]\n"
".inst 0x4fbbea3a // sdot v26.4s, v17.16b, v27.4b[3]\n"
".inst 0x4fb9eada // sdot v26.4s, v22.16b, v25.4b[3]\n"
"scvtf v26.4s, v26.4s, #0x4\n"
"fmla v29.4s, v26.4s, v16.4s\n"
"cbnz x21, 2b\n"
"sub %x[nc], %x[nc], #0x4\n"
"str q29, [%x[res_ptr], #0x0]\n"
"add %x[res_ptr], %x[res_ptr], #0x10\n"
"cbnz %x[nc], 1b\n"
: [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc)
: [a_ptr] "r" (a_ptr), [nb] "r" (nb)
: "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22"
);
return;
}
#endif // #if defined(__aarch64__) && defined(__ARM_NEON)
// Fallback scalar implementation
float sumf[4];
int sumi;
const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
for (int x = 0; x < nc / ncols_interleaved; x++) {
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
for (int l = 0; l < nb; l++) {
for (int k = 0; k < (qk / (2 * blocklen)); k++) {
for (int j = 0; j < ncols_interleaved; j++) {
sumi = 0;
for (int i = 0; i < blocklen; ++i) {
int idx = k * ncols_interleaved * blocklen + j * blocklen + i;
uint8_t byte = b_ptr[l].qs[idx];
// Extract low and high nibbles
int8_t v0 = (byte & 0x0F);
int8_t v1 = (byte & 0xF0) >> 4;
// Sign-extend from 4-bit to 8-bit
v0 = (v0 ^ 0x8) - 0x8;
v1 = (v1 ^ 0x8) - 0x8;
int a_idx0 = k * blocklen + i;
int a_idx1 = k * blocklen + i + qk / 2;
int8_t a0 = a_ptr[l].qs[a_idx0];
int8_t a1 = a_ptr[l].qs[a_idx1];
// Multiply and accumulate
sumi += v0 * a0 + v1 * a1;
}
float b_scale = GGML_FP16_TO_FP32(b_ptr[l].d[j]);
float a_scale = GGML_FP16_TO_FP32(a_ptr[l].d);
// Adjust sumi by shifting right by 4
sumf[j] += ((float) (sumi >> 4)) * b_scale * a_scale;
}
}
}
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
}
}
// Scalar GEMV function for quantized data
void gemv_q4_0_4x4_q8_0_scalar(
int n,
float * restrict s,
size_t bs,
const block_q4_0x4 * restrict vx,
const block_q8_0 * restrict vy,
int nr,
int nc
) {
const int qk = QK8_0;
const int nb = n / qk;
float sumf[NCOLS_INTERLEAVED];
assert(n % qk == 0);
assert(nc % NCOLS_INTERLEAVED == 0);
for (int x = 0; x < nc / NCOLS_INTERLEAVED; x++) {
// Clear accumulators
for (int j = 0; j < NCOLS_INTERLEAVED; j++) {
sumf[j] = 0.0f;
}
for (int l = 0; l < nb; l++) {
const block_q4_0x4 *b_block = &vx[x * nb + l];
const block_q8_0 *a_block = &vy[l];
for (int k = 0; k < (qk / (2 * BLOCK_SIZE)); k++) {
for (int j = 0; j < NCOLS_INTERLEAVED; j++) {
int sumi = 0;
for (int i = 0; i < BLOCK_SIZE; i++) {
int idx = k * NCOLS_INTERLEAVED * BLOCK_SIZE + j * BLOCK_SIZE + i;
uint8_t byte = b_block->qs[idx];
// Extract low and high nibbles and sign extend
int8_t v0 = (int8_t)(byte << 4); // Get lower nibble and shift left
int8_t v1 = (int8_t)(byte & 0xF0); // Get upper nibble
int a_idx0 = k * BLOCK_SIZE + i;
int a_idx1 = k * BLOCK_SIZE + i + qk / 2;
int8_t a0 = a_block->qs[a_idx0];
int8_t a1 = a_block->qs[a_idx1];
// Multiply and accumulate with right shift applied to each product
sumi += ((v0 * a0) >> 4) + ((v1 * a1) >> 4);
}
float b_scale = GGML_FP16_TO_FP32(b_block->d[j]);
float a_scale = GGML_FP16_TO_FP32(a_block->d);
sumf[j] += ((float) sumi) * b_scale * a_scale;
}
}
}
// Store results
for (int j = 0; j < NCOLS_INTERLEAVED; j++) {
s[x * NCOLS_INTERLEAVED + j] = sumf[j];
}
}
}
// Unquantized GEMV function
void gemv_unquantized(
int n,
float * restrict s,
const float * restrict vx,
const float * restrict vy,
int nr,
int nc
) {
// Assume 'vx' is a matrix of size [nc x n]
// 'vy' is a vector of size [n]
// 's' is the result vector of size [nc]
for (int i = 0; i < nc; ++i) {
float sum = 0.0f;
for (int k = 0; k < n; ++k) {
sum += vx[i * n + k] * vy[k];
}
s[i] = sum;
}
}
// Function to generate test data
void generate_test_data(int n, int nc, block_q4_0x4 **vx_out, block_q8_0 **vy_out, float **s_out, float **unquantized_vx, float **unquantized_vy) {
int nb = n / QK8_0;
int num_vx_blocks = nb * (nc / NCOLS_INTERLEAVED);
block_q4_0x4 *vx = (block_q4_0x4 *) malloc(num_vx_blocks * sizeof(block_q4_0x4));
block_q8_0 *vy = (block_q8_0 *) malloc(nb * sizeof(block_q8_0));
float *s = (float *) malloc(nc * sizeof(float));
// Generate unquantized data
float *ux = (float *) malloc(nc * n * sizeof(float));
float *uy = (float *) malloc(n * sizeof(float));
// Initialize random seed
srand(42);
// Generate random data for ux and uy
for (int i = 0; i < nc * n; i++) {
// ux[i] = ((float) rand() / RAND_MAX) * 2.0f - 1.0f; // Random float between -1 and 1
ux[i] = 1.0f;
}
for (int i = 0; i < n; i++) {
// uy[i] = ((float) rand() / RAND_MAX) * 2.0f - 1.0f; // Random float between -1 and 1
uy[i] = 1.0f;
}
// Quantize ux to vx
quantize_q4_0_4x4(ux, vx, nc, n);
// Quantize uy to vy
quantize_q8_0_4x4(uy, vy, n);
// Initialize s
for (int i = 0; i < nc; i++) {
s[i] = 0.0f;
}
*vx_out = vx;
*vy_out = vy;
*s_out = s;
*unquantized_vx = ux;
*unquantized_vy = uy;
}
// Benchmark function
void benchmark_implementation(
const char *name,
void (*func)(
int n,
float * restrict s,
size_t bs,
const void * restrict vx,
const void * restrict vy,
int nr,
int nc
),
int iterations,
int n,
int nc
) {
block_q4_0x4 *vx;
block_q8_0 *vy;
float *s;
float *ux;
float *uy;
generate_test_data(n, nc, &vx, &vy, &s, &ux, &uy);
float *s_quantized = (float *) malloc(nc * sizeof(float));
float *s_unquantized = (float *) malloc(nc * sizeof(float));
// Warmup
for (int i = 0; i < 5; i++) {
func(n, s_quantized, QK8_0, vx, vy, 1, nc);
}
// Benchmark quantized GEMV
struct timespec start_q, end_q;
clock_gettime(CLOCK_MONOTONIC, &start_q);
for (int i = 0; i < iterations; i++) {
func(n, s_quantized, QK8_0, vx, vy, 1, nc);
}
clock_gettime(CLOCK_MONOTONIC, &end_q);
// Benchmark unquantized GEMV
struct timespec start_uq, end_uq;
clock_gettime(CLOCK_MONOTONIC, &start_uq);
for (int i = 0; i < iterations; i++) {
gemv_unquantized(n, s_unquantized, ux, uy, 1, nc);
}
clock_gettime(CLOCK_MONOTONIC, &end_uq);
// Calculate duration in seconds
double duration_q = (end_q.tv_sec - start_q.tv_sec) + (end_q.tv_nsec - start_q.tv_nsec) / 1e9;
double duration_uq = (end_uq.tv_sec - start_uq.tv_sec) + (end_uq.tv_nsec - start_uq.tv_nsec) / 1e9;
printf("%s (Quantized): %.2f ms per iteration\n",
name,
(duration_q * 1000.0) / iterations);
printf("%s (Unquantized): %.2f ms per iteration\n",
"Unquantized GEMV",
(duration_uq * 1000.0) / iterations);
// Compare outputs
printf("Comparing outputs between quantized and unquantized GEMV:\n");
double max_diff = 0.0;
for (int i = 0; i < nc; i++) {
double diff = fabs(s_quantized[i] - s_unquantized[i]);
if (diff > max_diff) {
max_diff = diff;
}
// Optionally, print the differences
// if (diff == max_diff) {
// printf("Index %d: Quantized = %.6f, Unquantized = %.6f, Diff = %.6f\n",
// i, s_quantized[i], s_unquantized[i], diff);
// }
}
printf("Maximum difference: %.6f\n", max_diff);
// Print the first 10 elements of each output
printf("First 10 elements of quantized output:\n");
for (int i = 0; i < 10; i++) {
printf("%.6f ", s_quantized[i]);
}
printf("\n");
printf("First 10 elements of unquantized output:\n");
for (int i = 0; i < 10; i++) {
printf("%.6f ", s_unquantized[i]);
}
printf("\n");
free(vx);
free(vy);
free(s);
free(ux);
free(uy);
free(s_quantized);
free(s_unquantized);
}
int main() {
int n = 4096 * 2;
int nc = 4096 * 2;
int iterations = 1;
printf("Benchmarking with n=%d, nc=%d, iterations=%d\n", n, nc, iterations);
// Benchmark scalar implementation (Quantized GEMV)
benchmark_implementation(
"Quantized GEMV scalar",
(void (*)(int, float *, size_t, const void *, const void *, int, int)) gemv_q4_0_4x4_q8_0_scalar,
iterations,
n,
nc
);
// Benchmark scalar implementation (Quantized GEMV)
benchmark_implementation(
"Quantized GEMV",
(void (*)(int, float *, size_t, const void *, const void *, int, int)) ggml_gemv_q4_0_4x4_q8_0,
iterations,
n,
nc
);
return 0;
}