mulmat-tune: fixed wrong result file name; decrease hist buf size;
breaking change: delete original profile #1 from q_f32 profiles
mqy committed May 31, 2023
1 parent bad2202 commit c0d321f
Showing 4 changed files with 96 additions and 81 deletions.
77 changes: 37 additions & 40 deletions examples/mulmat-tune/mulmat-tune.c
@@ -19,23 +19,28 @@ static void cmd_analyze(struct ggml_mulmat_tune *tune);
static void usage(char *prog) {
const char *usage_lines[] = {
"usage: %s [bench ...] | [analyze FILE] [-h | --help]\n",
"\n",
"bench [-m MODEL] [-t TYPE] [-f FILE] [-y]\n",
"--model MODEL 7B | 13B | 30B | 65B\n",
" default 7B\n",
"--type TYPE Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | F32 | F16\n",
" default Q4_0\n",
"--m_num M_NUM number of M, the max M = 2^(M_NUM-1)\n",
" requires: in range [8, 12]\n",
" default 10\n",
"--backend BACKEND backend: CUDA | CL | BLAS\n",
" default: auto detect\n",
"--n_pass number of passes to run\n",
" default 3\n",
" requires: in range [1, 5]\n",
"--file FILE data file to write\n",
" default stdout\n",
"-y always answer \"yes\" to all prompts\n",
"",
"bench [-m MODEL] [-t TYPE] [-f FILE] [-y]",
"--model MODEL 3B | 7B | 13B | 30B | 65B",
" default 7B",
"--type TYPE Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | F32 | F16",
" default Q4_0",
"--m_num M_NUM number of M, the max M = 2^(M_NUM-1)",
" requires within [8, 12]",
" default 10",
"--backend BACKEND backend: CUDA | CL | BLAS",
" default: auto detect",
"--n_pass number of passes to run",
" default 3",
" requires: within [1, 5]",
"--file FILE data file to write",
" default stdout",
"--hint enable hint",
" run less bench for constant or linear stages.",
" CAUTION: hint is experimental and the resulting",
" data may be unreliable, enable it only",
" if you know what you are doing",
"-y always answer \"yes\" to all prompts",
};

int len = (int)(sizeof(usage_lines) / sizeof(char *));
@@ -44,7 +49,7 @@ static void usage(char *prog) {
if (i == 0) {
fprintf(stderr, line, prog);
} else {
fprintf(stderr, "%s", line);
fprintf(stderr, "%s\n", line);
}
}
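
For example, based on the usage text above, a typical invocation might be: ./mulmat-tune bench --model 7B --type Q4_0 --m_num 10 --n_pass 3 --file 7B.Q4_0.txt (the binary and file names here are illustrative, not taken from this commit).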

@@ -74,7 +79,7 @@ int main(int argc, char **argv) {

if (strcmp(cmd, "bench") == 0) {
struct ggml_mulmat_tune tune = {
.version = 1,
.version = GGML_MULMAT_TUNE_VERSION,
.n_shapes = 0,
};

@@ -124,6 +129,9 @@ int main(int argc, char **argv) {
arg_file = argv[i + 1];
++i;
}
} else if (strcmp(argv[i], "--hint") == 0) {
fprintf(stderr, "The `hint` feature is not implemented\n");
exit(1);
} else if (strcmp(argv[i], "-y") == 0) {
always_yes = true;
} else {
@@ -196,7 +204,7 @@ int main(int argc, char **argv) {
m_num = v;
}
if (m_num < 8 || m_num > 12) {
fprintf(stderr, "invalid m_num: %d, expect in range [8, 12]\n",
fprintf(stderr, "invalid m_num: %d, expect within [8, 12]\n",
m_num);
usage(argv[0]);
exit(1);
@@ -209,8 +217,8 @@ int main(int argc, char **argv) {
int v = atoi(arg_n_pass);
n_pass = v;
}
if (n_pass < 1 || n_pass > MAX_NUM_PASS) {
fprintf(stderr, "invalid n_pass: %d, expect in range [1, 5]\n",
if (n_pass < 1 || n_pass > GGML_MULMAT_MAX_PASS) {
fprintf(stderr, "invalid n_pass: %d, expect within [1, 5]\n",
n_pass);
usage(argv[0]);
exit(1);
@@ -350,7 +358,7 @@ int main(int argc, char **argv) {

void cmd_bench(struct ggml_mulmat_tune *tune, int n_pass, bool verbose) {
size_t wsize = 0;
void *q_buf = NULL;
char hist[64]; // TODO: make sure this size is safe.
void *wdata = NULL;

// alloc q_buf and wdata with max size.
@@ -364,16 +372,6 @@ void cmd_bench(struct ggml_mulmat_tune *tune, int n_pass, bool verbose) {
}
GGML_ASSERT(max_NxK > 0);

// NOTE: proximate.
size_t q_buf_size = max_NxK * sizeof(int64_t);

q_buf = malloc(q_buf_size);
if (!q_buf) {
fprintf(stderr,
"failed to allocate memory for q_buf, size: %zu MiB\n",
q_buf_size / 1024 / 1024);
exit(1);
}
wsize = max_NxK * sizeof(float);
wdata = malloc(wsize);
if (!wdata) {
@@ -449,23 +447,23 @@ void cmd_bench(struct ggml_mulmat_tune *tune, int n_pass, bool verbose) {
switch (shape->src0_type) {
case GGML_TYPE_Q4_0:
ggml_quantize_q4_0((const float *)src0_f32->data,
src0->data, N * K, K, (int64_t *)q_buf);
src0->data, N * K, K, (void *)hist);
break;
case GGML_TYPE_Q4_1:
ggml_quantize_q4_1((const float *)src0_f32->data,
src0->data, N * K, K, (int64_t *)q_buf);
src0->data, N * K, K, (void *)hist);
break;
case GGML_TYPE_Q5_0:
ggml_quantize_q5_0((const float *)src0_f32->data,
src0->data, N * K, K, (int64_t *)q_buf);
src0->data, N * K, K, (void *)hist);
break;
case GGML_TYPE_Q5_1:
ggml_quantize_q5_1((const float *)src0_f32->data,
src0->data, N * K, K, (int64_t *)q_buf);
src0->data, N * K, K, (void *)hist);
break;
case GGML_TYPE_Q8_0:
ggml_quantize_q8_0((const float *)src0_f32->data,
src0->data, N * K, K, (int64_t *)q_buf);
src0->data, N * K, K, (void *)hist);
break;
default:
GGML_ASSERT(false);
@@ -492,7 +490,7 @@ void cmd_bench(struct ggml_mulmat_tune *tune, int n_pass, bool verbose) {
// without memset, the first run may be significantly slow.
memset(wdata, 0, wsize);

int stage_time[MAX_NUM_PASS];
int stage_time[GGML_MULMAT_MAX_PASS];
for (int i_bench = 0; i_bench < n_pass; i_bench++) {
int t0 = (int)ggml_time_us();

@@ -529,7 +527,6 @@ void cmd_bench(struct ggml_mulmat_tune *tune, int n_pass, bool verbose) {
}

free(wdata);
free(q_buf);
}

static void print_build_tips(void) {
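
A note on the new 64-byte histogram buffer above: ggml's quantize helpers of this era are declared along the lines of size_t ggml_quantize_q4_0(const float *src, void *dst, int n, int k, int64_t *hist), and llama.cpp conventionally sizes the histogram as 1 << 4 buckets of int64_t, i.e. 128 bytes. Assuming that convention holds here, a size-safe way to resolve the TODO would be:

    // Sketch only; assumes the conventional 1 << 4 int64_t buckets.
    int64_t hist[1 << 4] = {0};  // 16 buckets * 8 bytes = 128 bytes
    ggml_quantize_q4_0((const float *)src0_f32->data,
                       src0->data, N * K, K, hist);  // no cast needed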
67 changes: 42 additions & 25 deletions ggml-tune.c
@@ -116,26 +116,39 @@ int ggml_mulmat_tune_validate(struct ggml_mulmat_tune *tune, const char *model,
const char *backend_vendor = ggml_get_backend_vendor();

int rc = 0;
char err_buf[1024];

if (strcmp(model, tune->model) != 0) {
if (tune->version != GGML_MULMAT_TUNE_VERSION) {
snprintf(
err_buf, sizeof(err_buf),
"version mismatch, please re-run bench. current: %d, incoming: %d",
GGML_MULMAT_TUNE_VERSION, tune->version);
rc = -1;
} else if (type != tune->type) {
} else if (strcmp(model, tune->model) != 0) {
snprintf(err_buf, sizeof(err_buf),
"model mismatch. current: %s, incoming: %s", model,
tune->model);
rc = -2;
} else if ((int)backend != tune->backend) {
} else if (type != tune->type) {
snprintf(err_buf, sizeof(err_buf),
"type mismatch. current: %d, incoming: %d", type,
tune->type);
rc = -3;
} else if ((int)backend != tune->backend) {
snprintf(err_buf, sizeof(err_buf),
"backend mismatch. current: %d, incoming: %d", backend,
tune->backend);
rc = -4;
} else if (backend_vendor == NULL ||
strcmp(backend_vendor, tune->backend_vendor) != 0) {
rc = -4;
} else {
// TODO
rc = -5;
snprintf(err_buf, sizeof(err_buf),
"backend vendor mismatch. current: %s, incoming: %s",
backend_vendor, tune->backend_vendor);
}

if (rc != 0) {
printf("model: %s, tune model: %s\n", model, tune->model);
printf("type: %d, tune type: %d\n", type, tune->type);
printf("backend: %d, tune backend: %d\n", backend, tune->backend);
printf("backend vendor: %s, tune backend vendor: %s\n", backend_vendor,
tune->backend_vendor);
fprintf(stderr, "mulmat-tune: error: %s\n", err_buf);
}

return rc;
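
Since ggml_mulmat_tune_validate() now reports its own diagnostics through err_buf, callers only need to act on the return code; a minimal sketch of the new calling convention (mirroring the llama.cpp change later in this commit):

    int rc = ggml_mulmat_tune_validate(ctx->mm_tune, model_name, type);
    if (rc != 0) {
        // validate already printed "mulmat-tune: error: ..." to stderr;
        // rc in -1..-5 identifies version/model/type/backend/vendor mismatch.
        exit(1);
    }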
@@ -572,21 +585,25 @@ void ggml_mulmat_init_task_profiles(enum ggml_backend backend) {

p[1].stages[1].backend = backend;
p[1].stages[1].wait = true;

} else if (backend == GGML_BACKEND_BLAS) {
ggml_mulmat_task_profiles_qxx_n = 3;

p[0].stages[0].backend = GGML_BACKEND_CPU;
p[0].stages[1].backend = GGML_BACKEND_CPU;
p[0].stages[1].parallel = true;

p[1].stages[1].backend = backend;
p[1].stages[1].wait = true;

p[2].stages[0].backend = GGML_BACKEND_CPU;
p[2].stages[0].parallel = true;
p[2].stages[1].backend = backend;
p[2].stages[1].wait = true;
int i = 0;
p[i].stages[0].backend = GGML_BACKEND_CPU;
p[i].stages[1].backend = GGML_BACKEND_CPU;
p[i].stages[1].parallel = true;
++i;

// p[i].stages[1].backend = backend;
// p[i].stages[1].wait = true;
// ++i;

p[i].stages[0].backend = GGML_BACKEND_CPU;
p[i].stages[0].parallel = true;
// p[i].stages[1].tune_hint = GGML_TUNE_HINT_CONSTANT;
p[i].stages[1].backend = backend;
p[i].stages[1].wait = true;
++i;

ggml_mulmat_task_profiles_qxx_n = i;
} else {
fprintf(stderr, "invalid backend: %d\n", backend);
GGML_ASSERT(false);
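
The BLAS branch now registers profiles through a running index, so a profile can be dropped (as the wait-only profile #1 is commented out above, hence the breaking change) without renumbering the remaining slots. A condensed sketch of the pattern, assuming the zero-initialized p[GGML_MULMAT_MAX_PROFILES] array used in this file:

    int i = 0;
    p[i].stages[0].backend = GGML_BACKEND_CPU;  // profile 0: pure CPU
    p[i].stages[1].backend = GGML_BACKEND_CPU;
    p[i].stages[1].parallel = true;
    ++i;
    p[i].stages[0].backend = GGML_BACKEND_CPU;  // profile 1: CPU init, BLAS compute
    p[i].stages[0].parallel = true;
    p[i].stages[1].backend = backend;
    p[i].stages[1].wait = true;
    ++i;
    ggml_mulmat_task_profiles_qxx_n = i;  // the count falls out of the index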
13 changes: 11 additions & 2 deletions ggml-tune.h
@@ -10,15 +10,24 @@
extern "C" {
#endif

#define MAX_NUM_PASS 5

#define GGML_MULMAT_TUNE_VERSION 2
#define GGML_MULMAT_N_SHAPES 6

#define GGML_MULMAT_MAX_PASS 5
#define GGML_MULMAT_MAX_PROFILES 8

enum ggml_mulmat_tune_hint {
GGML_TUNE_HINT_UNKNOWN = 0,
GGML_TUNE_HINT_CONSTANT = 1,
};

struct ggml_task_stage {
enum ggml_backend backend;
bool parallel;
bool wait;

// TODO: experimental; may be moved elsewhere.
// enum ggml_mulmat_tune_hint tune_hint;
};

struct ggml_task_profile {
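
The renamed constants above ripple through the rest of the commit; a minimal sketch of how a tune is now seeded with them (matching the mulmat-tune.c change earlier):

    struct ggml_mulmat_tune tune = {
        .version  = GGML_MULMAT_TUNE_VERSION,  // now 2, so stale v1 files fail validation
        .n_shapes = 0,
    };
    int stage_time[GGML_MULMAT_MAX_PASS];      // replaces the old MAX_NUM_PASS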
20 changes: 6 additions & 14 deletions llama.cpp
@@ -2330,7 +2330,7 @@ struct llama_context * llama_init_from_file(

ctx->mm_tune = (struct ggml_mulmat_tune *)malloc(sizeof(struct ggml_mulmat_tune));
if (ctx->mm_tune == nullptr) {
fprintf(stderr, "\nERROR: failed to allocate memory for struct ggml_mulmat_tune\n");
fprintf(stderr, "ERROR: failed to allocate memory for struct ggml_mulmat_tune\n");
return nullptr;
}

@@ -2343,13 +2343,13 @@ struct llama_context * llama_init_from_file(
char buf[128];
GGML_ASSERT(strlen(env_dir) < sizeof(buf) - 10);
// TODO: take care of the path separator on Windows.
snprintf(buf, sizeof(buf), "%s/%s.%s", env_dir, model_name, type_name);
snprintf(buf, sizeof(buf), "%s/%s.%s.txt", env_dir, model_name, type_name);
file = buf;
}

FILE *fp = fopen(file, "r");
if (!fp) {
fprintf(stderr, "\nWARN: mulmat-tune: failed to open file %s, ignore.\n", file);
fprintf(stderr, "mulmat-tune: failed to open file %s, ignore.\n", file);
free(ctx->mm_tune);
ctx->mm_tune = NULL;
} else {
@@ -2358,24 +2358,16 @@ struct llama_context * llama_init_from_file(

if (rc != 0) {
free(ctx->mm_tune);
fprintf(stderr, "\nERROR: mulmat-tune: failed to load file %s, error code: %d\n", file, rc);
fprintf(stderr, "mulmat-tune: failed to load file %s, error code: %d\n", file, rc);
return nullptr;
}

fprintf(stderr, "\nINFO: mulmat-tune: loaded file %s\n", file);
fprintf(stderr, "mulmat-tune: loaded file %s\n", file);

rc = ggml_mulmat_tune_validate(ctx->mm_tune, model_name, type);
if (rc != 0) {
free(ctx->mm_tune);
const char *err = "unknown";
switch (rc) {
case -1: err = "model mismatch"; break;
case -2: err = "type mismatch"; break;
case -3: err = "backend mismatch"; break;
case -4: err = "backend vendor mismatch"; break;
}
fprintf(stderr, "\nERROR: mulmat-tune: failed to validate file %s: %s\n", file, err);
return nullptr;
exit(1);
}
}
}
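
The "wrong result file name" from the commit title is the missing .txt suffix in the constructed path; with the fixed format string, a 7B/Q4_0 tune file under an illustrative directory /tmp/tune resolves like this:

    // Illustrative values; env_dir, model_name and type_name come from context.
    char buf[128];
    snprintf(buf, sizeof(buf), "%s/%s.%s.txt", "/tmp/tune", "7B", "Q4_0");
    // buf == "/tmp/tune/7B.Q4_0.txt" (previously "/tmp/tune/7B.Q4_0")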
