diff --git a/examples/mulmat-tune/mulmat-tune.c b/examples/mulmat-tune/mulmat-tune.c index bade95587e02c1..20a7faff1c1324 100644 --- a/examples/mulmat-tune/mulmat-tune.c +++ b/examples/mulmat-tune/mulmat-tune.c @@ -19,23 +19,28 @@ static void cmd_analyze(struct ggml_mulmat_tune *tune); static void usage(char *prog) { const char *usage_lines[] = { "usage: %s [bench ...] | [analyze FILE] [-h | --help]\n", - "\n", - "bench [-m MODEL] [-t TYPE] [-f FILE] [-y]\n", - "--model MODEL 7B | 13B | 30B | 65B\n", - " default 7B\n", - "--type TYPE Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | F32 | F16\n", - " default Q4_0\n", - "--m_num M_NUM number of M, the max M = 2^(M_NUM-1)\n", - " requires: in range [8, 12]\n", - " default 10\n", - "--backend BACKEND backend: CUDA | CL | BLAS\n", - " default: auto detect\n", - "--n_pass number of passes to run\n", - " default 3\n", - " requires: in range [1, 5]\n", - "--file FILE data file to write\n", - " default stdout\n", - "-y always answer \"yes\" to all prompts\n", + "", + "bench [-m MODEL] [-t TYPE] [-f FILE] [-y]", + "--model MODEL 3B | 7B | 13B | 30B | 65B", + " default 7B", + "--type TYPE Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | F32 | F16", + " default Q4_0", + "--m_num M_NUM number of M, the max M = 2^(M_NUM-1)", + " requires within [8, 12]", + " default 10", + "--backend BACKEND backend: CUDA | CL | BLAS", + " default: auto detect", + "--n_pass number of passes to run", + " default 3", + " requires: within [1, 5]", + "--file FILE data file to write", + " default stdout", + "--hint enable hint", + " run less bench for constant or linear stages.", + " CAUTION: hint is experimental and the resulting", + " data may be unreliable, enable it only", + " if you know what you are doing", + "-y always answer \"yes\" to all prompts", }; int len = (int)(sizeof(usage_lines) / sizeof(char *)); @@ -44,7 +49,7 @@ static void usage(char *prog) { if (i == 0) { fprintf(stderr, line, prog); } else { - fprintf(stderr, "%s", line); + fprintf(stderr, "%s\n", line); 
} } @@ -74,7 +79,7 @@ int main(int argc, char **argv) { if (strcmp(cmd, "bench") == 0) { struct ggml_mulmat_tune tune = { - .version = 1, + .version = GGML_MULMAT_TUNE_VERSION, .n_shapes = 0, }; @@ -124,6 +129,9 @@ int main(int argc, char **argv) { arg_file = argv[i + 1]; ++i; } + } else if (strcmp(argv[i], "--hint") == 0) { + fprintf(stderr, "The `hint` feature is not implemented\n"); + exit(1); } else if (strcmp(argv[i], "-y") == 0) { always_yes = true; } else { @@ -196,7 +204,7 @@ int main(int argc, char **argv) { m_num = v; } if (m_num < 8 || m_num > 12) { - fprintf(stderr, "invalid m_num: %d, expect in range [8, 12]\n", + fprintf(stderr, "invalid m_num: %d, expect within [8, 12]\n", m_num); usage(argv[0]); exit(1); @@ -209,8 +217,8 @@ int main(int argc, char **argv) { int v = atoi(arg_n_pass); n_pass = v; } - if (n_pass < 1 || n_pass > MAX_NUM_PASS) { - fprintf(stderr, "invalid n_pass: %d, expect in range [1, 5]\n", + if (n_pass < 1 || n_pass > GGML_MULMAT_MAX_PASS) { + fprintf(stderr, "invalid n_pass: %d, expect within [1, 5]\n", n_pass); usage(argv[0]); exit(1); @@ -350,7 +358,7 @@ int main(int argc, char **argv) { void cmd_bench(struct ggml_mulmat_tune *tune, int n_pass, bool verbose) { size_t wsize = 0; - void *q_buf = NULL; + int64_t hist[16]; // ggml_quantize_* fill 16 int64_t histogram buckets (128 bytes); char[64] was too small and misaligned void *wdata = NULL; // alloc q_buf and wdata with max size. @@ -364,16 +372,6 @@ void cmd_bench(struct ggml_mulmat_tune *tune, int n_pass, bool verbose) { } GGML_ASSERT(max_NxK > 0); - // NOTE: proximate. 
- size_t q_buf_size = max_NxK * sizeof(int64_t); - - q_buf = malloc(q_buf_size); - if (!q_buf) { - fprintf(stderr, - "failed to allocate memory for q_buf, size: %zu MiB\n", - q_buf_size / 1024 / 1024); - exit(1); - } wsize = max_NxK * sizeof(float); wdata = malloc(wsize); if (!wdata) { @@ -449,23 +447,23 @@ void cmd_bench(struct ggml_mulmat_tune *tune, int n_pass, bool verbose) { switch (shape->src0_type) { case GGML_TYPE_Q4_0: ggml_quantize_q4_0((const float *)src0_f32->data, - src0->data, N * K, K, (int64_t *)q_buf); + src0->data, N * K, K, (void *)hist); break; case GGML_TYPE_Q4_1: ggml_quantize_q4_1((const float *)src0_f32->data, - src0->data, N * K, K, (int64_t *)q_buf); + src0->data, N * K, K, (void *)hist); break; case GGML_TYPE_Q5_0: ggml_quantize_q5_0((const float *)src0_f32->data, - src0->data, N * K, K, (int64_t *)q_buf); + src0->data, N * K, K, (void *)hist); break; case GGML_TYPE_Q5_1: ggml_quantize_q5_1((const float *)src0_f32->data, - src0->data, N * K, K, (int64_t *)q_buf); + src0->data, N * K, K, (void *)hist); break; case GGML_TYPE_Q8_0: ggml_quantize_q8_0((const float *)src0_f32->data, - src0->data, N * K, K, (int64_t *)q_buf); + src0->data, N * K, K, (void *)hist); break; default: GGML_ASSERT(false); @@ -492,7 +490,7 @@ void cmd_bench(struct ggml_mulmat_tune *tune, int n_pass, bool verbose) { // without memset, the first run may be significant slow. 
memset(wdata, 0, wsize); - int stage_time[MAX_NUM_PASS]; + int stage_time[GGML_MULMAT_MAX_PASS]; for (int i_bench = 0; i_bench < n_pass; i_bench++) { int t0 = (int)ggml_time_us(); @@ -529,7 +527,6 @@ void cmd_bench(struct ggml_mulmat_tune *tune, int n_pass, bool verbose) { } free(wdata); - free(q_buf); } static void print_build_tips(void) { diff --git a/ggml-tune.c b/ggml-tune.c index 2f6c56a4dc3c2e..82ef3a5f9cfa6c 100644 --- a/ggml-tune.c +++ b/ggml-tune.c @@ -116,26 +116,39 @@ int ggml_mulmat_tune_validate(struct ggml_mulmat_tune *tune, const char *model, const char *backend_vendor = ggml_get_backend_vendor(); int rc = 0; + char err_buf[1024]; - if (strcmp(model, tune->model) != 0) { + if (tune->version != GGML_MULMAT_TUNE_VERSION) { + snprintf( + err_buf, sizeof(err_buf), + "version mismatch, please re-run bench. current: %d, incoming: %d", + GGML_MULMAT_TUNE_VERSION, tune->version); rc = -1; - } else if (type != tune->type) { + } else if (strcmp(model, tune->model) != 0) { + snprintf(err_buf, sizeof(err_buf), + "model mismatch. current: %s, incoming: %s", model, + tune->model); rc = -2; - } else if ((int)backend != tune->backend) { + } else if (type != tune->type) { + snprintf(err_buf, sizeof(err_buf), + "type mismatch. current: %d, incoming: %d", type, + tune->type); rc = -3; + } else if ((int)backend != tune->backend) { + snprintf(err_buf, sizeof(err_buf), + "backend mismatch. current: %d, incoming: %d", backend, + tune->backend); + rc = -4; } else if (backend_vendor == NULL || strcmp(backend_vendor, tune->backend_vendor) != 0) { - rc = -4; - } else { - // TODO + rc = -5; + snprintf(err_buf, sizeof(err_buf), + "backend vendor mismatch. 
current: %s, incoming: %s", + backend_vendor, tune->backend_vendor); } if (rc != 0) { - printf("model: %s, tune model: %s\n", model, tune->model); - printf("type: %d, tune type: %d\n", type, tune->type); - printf("backend: %d, tune backend: %d\n", backend, tune->backend); - printf("backend vendor: %s, tune backend vendor: %s\n", backend_vendor, - tune->backend_vendor); + fprintf(stderr, "mulmat-tune: error: %s\n", err_buf); } return rc; @@ -572,21 +585,25 @@ void ggml_mulmat_init_task_profiles(enum ggml_backend backend) { p[1].stages[1].backend = backend; p[1].stages[1].wait = true; - } else if (backend == GGML_BACKEND_BLAS) { - ggml_mulmat_task_profiles_qxx_n = 3; - - p[0].stages[0].backend = GGML_BACKEND_CPU; - p[0].stages[1].backend = GGML_BACKEND_CPU; - p[0].stages[1].parallel = true; - - p[1].stages[1].backend = backend; - p[1].stages[1].wait = true; - - p[2].stages[0].backend = GGML_BACKEND_CPU; - p[2].stages[0].parallel = true; - p[2].stages[1].backend = backend; - p[2].stages[1].wait = true; + int i = 0; + p[i].stages[0].backend = GGML_BACKEND_CPU; + p[i].stages[1].backend = GGML_BACKEND_CPU; + p[i].stages[1].parallel = true; + ++i; + + // p[i].stages[1].backend = backend; + // p[i].stages[1].wait = true; + // ++i; + + p[i].stages[0].backend = GGML_BACKEND_CPU; + p[i].stages[0].parallel = true; + // p[i].stages[1].tune_hint = GGML_TUNE_HINT_CONSTANT; + p[i].stages[1].backend = backend; + p[i].stages[1].wait = true; + ++i; + + ggml_mulmat_task_profiles_qxx_n = i; } else { fprintf(stderr, "invalid backend: %d\n", backend); GGML_ASSERT(false); diff --git a/ggml-tune.h b/ggml-tune.h index 76bfca2478a658..6ea3ca2cc6c246 100644 --- a/ggml-tune.h +++ b/ggml-tune.h @@ -10,15 +10,24 @@ extern "C" { #endif -#define MAX_NUM_PASS 5 - +#define GGML_MULMAT_TUNE_VERSION 2 #define GGML_MULMAT_N_SHAPES 6 + +#define GGML_MULMAT_MAX_PASS 5 #define GGML_MULMAT_MAX_PROFILES 8 +enum ggml_mulmat_tune_hint { + GGML_TUNE_HINT_UNKNOWN = 0, + GGML_TUNE_HINT_CONSTANT = 1, +}; + struct 
ggml_task_stage { enum ggml_backend backend; bool parallel; bool wait; + + // TODO: experimental, may be moved to other place. + // enum ggml_mulmat_tune_hint tune_hint; }; struct ggml_task_profile { diff --git a/llama.cpp b/llama.cpp index 8002b25adb96a7..8972591c6ad725 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2330,7 +2330,7 @@ struct llama_context * llama_init_from_file( ctx->mm_tune = (struct ggml_mulmat_tune *)malloc(sizeof(struct ggml_mulmat_tune)); if (ctx->mm_tune == nullptr) { - fprintf(stderr, "\nERROR: failed to allocate memory for struct ggml_mulmat_tune\n"); + fprintf(stderr, "ERROR: failed to allocate memory for struct ggml_mulmat_tune\n"); return nullptr; } @@ -2343,13 +2343,13 @@ struct llama_context * llama_init_from_file( char buf[128]; GGML_ASSERT(strlen(env_dir) < sizeof(buf) - 10); // TODO: take care the path separator for Windows. - snprintf(buf, sizeof(buf), "%s/%s.%s", env_dir, model_name, type_name); + snprintf(buf, sizeof(buf), "%s/%s.%s.txt", env_dir, model_name, type_name); file = buf; } FILE *fp = fopen(file, "r"); if (!fp) { - fprintf(stderr, "\nWARN: mulmat-tune: failed to open file %s, ignore.\n", file); + fprintf(stderr, "mulmat-tune: failed to open file %s, ignore.\n", file); free(ctx->mm_tune); ctx->mm_tune = NULL; } else { @@ -2358,24 +2358,16 @@ struct llama_context * llama_init_from_file( if (rc != 0) { free(ctx->mm_tune); - fprintf(stderr, "\nERROR: mulmat-tune: failed to load file %s, error code: %d\n", file, rc); + fprintf(stderr, "mulmat-tune: failed to load file %s, error code: %d\n", file, rc); return nullptr; } - fprintf(stderr, "\nINFO: mulmat-tune: loaded file %s\n", file); + fprintf(stderr, "mulmat-tune: loaded file %s\n", file); rc = ggml_mulmat_tune_validate(ctx->mm_tune, model_name, type); if (rc != 0) { free(ctx->mm_tune); - const char *err = "unknown"; - switch (rc) { - case -1: err = "model mismatch"; break; - case -2: err = "type mismatch"; break; - case -3: err = "backend mismatch"; break; - case -4: err = 
"backend vendor mismatch"; break; } - fprintf(stderr, "\nERROR: mulmat-tune: failed to validate file %s: %s\n", file, err); - return nullptr; + return nullptr; } } }