diff --git a/examples/mulmat-tune/mulmat-tune.c b/examples/mulmat-tune/mulmat-tune.c index bade95587e02c1..20a7faff1c1324 100644 --- a/examples/mulmat-tune/mulmat-tune.c +++ b/examples/mulmat-tune/mulmat-tune.c @@ -19,23 +19,28 @@ static void cmd_analyze(struct ggml_mulmat_tune *tune); static void usage(char *prog) { const char *usage_lines[] = { "usage: %s [bench ...] | [analyze FILE] [-h | --help]\n", - "\n", - "bench [-m MODEL] [-t TYPE] [-f FILE] [-y]\n", - "--model MODEL 7B | 13B | 30B | 65B\n", - " default 7B\n", - "--type TYPE Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | F32 | F16\n", - " default Q4_0\n", - "--m_num M_NUM number of M, the max M = 2^(M_NUM-1)\n", - " requires: in range [8, 12]\n", - " default 10\n", - "--backend BACKEND backend: CUDA | CL | BLAS\n", - " default: auto detect\n", - "--n_pass number of passes to run\n", - " default 3\n", - " requires: in range [1, 5]\n", - "--file FILE data file to write\n", - " default stdout\n", - "-y always answer \"yes\" to all prompts\n", + "", + "bench [-m MODEL] [-t TYPE] [-f FILE] [-y]", + "--model MODEL 3B | 7B | 13B | 30B | 65B", + " default 7B", + "--type TYPE Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | F32 | F16", + " default Q4_0", + "--m_num M_NUM number of M, the max M = 2^(M_NUM-1)", + " requires within [8, 12]", + " default 10", + "--backend BACKEND backend: CUDA | CL | BLAS", + " default: auto detect", + "--n_pass number of passes to run", + " default 3", + " requires: within [1, 5]", + "--file FILE data file to write", + " default stdout", + "--hint enable hint", + " run less bench for constant or linear stages.", + " CAUTION: hint is experimental and the resulting", + " data may be unreliable, enable it only", + " if you know what you are doing", + "-y always answer \"yes\" to all prompts", }; int len = (int)(sizeof(usage_lines) / sizeof(char *)); @@ -44,7 +49,7 @@ static void usage(char *prog) { if (i == 0) { fprintf(stderr, line, prog); } else { - fprintf(stderr, "%s", line); + fprintf(stderr, "%s\n", line); 
} } @@ -74,7 +79,7 @@ int main(int argc, char **argv) { if (strcmp(cmd, "bench") == 0) { struct ggml_mulmat_tune tune = { - .version = 1, + .version = GGML_MULMAT_TUNE_VERSION, .n_shapes = 0, }; @@ -124,6 +129,9 @@ int main(int argc, char **argv) { arg_file = argv[i + 1]; ++i; } + } else if (strcmp(argv[i], "--hint") == 0) { + fprintf(stderr, "The `hint` feature is not implemented\n"); + exit(1); } else if (strcmp(argv[i], "-y") == 0) { always_yes = true; } else { @@ -196,7 +204,7 @@ int main(int argc, char **argv) { m_num = v; } if (m_num < 8 || m_num > 12) { - fprintf(stderr, "invalid m_num: %d, expect in range [8, 12]\n", + fprintf(stderr, "invalid m_num: %d, expect within [8, 12]\n", m_num); usage(argv[0]); exit(1); @@ -209,8 +217,8 @@ int main(int argc, char **argv) { int v = atoi(arg_n_pass); n_pass = v; } - if (n_pass < 1 || n_pass > MAX_NUM_PASS) { - fprintf(stderr, "invalid n_pass: %d, expect in range [1, 5]\n", + if (n_pass < 1 || n_pass > GGML_MULMAT_MAX_PASS) { + fprintf(stderr, "invalid n_pass: %d, expect within [1, 5]\n", n_pass); usage(argv[0]); exit(1); @@ -350,7 +358,7 @@ int main(int argc, char **argv) { void cmd_bench(struct ggml_mulmat_tune *tune, int n_pass, bool verbose) { size_t wsize = 0; - void *q_buf = NULL; + int64_t hist[16]; // ggml_quantize_* fill 16 int64_t histogram buckets (128 bytes); char[64] was too small and misaligned void *wdata = NULL; // alloc q_buf and wdata with max size. @@ -364,16 +372,6 @@ void cmd_bench(struct ggml_mulmat_tune *tune, int n_pass, bool verbose) { } GGML_ASSERT(max_NxK > 0); - // NOTE: proximate. 
- size_t q_buf_size = max_NxK * sizeof(int64_t); - - q_buf = malloc(q_buf_size); - if (!q_buf) { - fprintf(stderr, - "failed to allocate memory for q_buf, size: %zu MiB\n", - q_buf_size / 1024 / 1024); - exit(1); - } wsize = max_NxK * sizeof(float); wdata = malloc(wsize); if (!wdata) { @@ -449,23 +447,23 @@ void cmd_bench(struct ggml_mulmat_tune *tune, int n_pass, bool verbose) { switch (shape->src0_type) { case GGML_TYPE_Q4_0: ggml_quantize_q4_0((const float *)src0_f32->data, - src0->data, N * K, K, (int64_t *)q_buf); + src0->data, N * K, K, (void *)hist); break; case GGML_TYPE_Q4_1: ggml_quantize_q4_1((const float *)src0_f32->data, - src0->data, N * K, K, (int64_t *)q_buf); + src0->data, N * K, K, (void *)hist); break; case GGML_TYPE_Q5_0: ggml_quantize_q5_0((const float *)src0_f32->data, - src0->data, N * K, K, (int64_t *)q_buf); + src0->data, N * K, K, (void *)hist); break; case GGML_TYPE_Q5_1: ggml_quantize_q5_1((const float *)src0_f32->data, - src0->data, N * K, K, (int64_t *)q_buf); + src0->data, N * K, K, (void *)hist); break; case GGML_TYPE_Q8_0: ggml_quantize_q8_0((const float *)src0_f32->data, - src0->data, N * K, K, (int64_t *)q_buf); + src0->data, N * K, K, (void *)hist); break; default: GGML_ASSERT(false); @@ -492,7 +490,7 @@ void cmd_bench(struct ggml_mulmat_tune *tune, int n_pass, bool verbose) { // without memset, the first run may be significant slow. 
memset(wdata, 0, wsize); - int stage_time[MAX_NUM_PASS]; + int stage_time[GGML_MULMAT_MAX_PASS]; for (int i_bench = 0; i_bench < n_pass; i_bench++) { int t0 = (int)ggml_time_us(); @@ -529,7 +527,6 @@ void cmd_bench(struct ggml_mulmat_tune *tune, int n_pass, bool verbose) { } free(wdata); - free(q_buf); } static void print_build_tips(void) { diff --git a/ggml-tune.c b/ggml-tune.c index 2f6c56a4dc3c2e..82ef3a5f9cfa6c 100644 --- a/ggml-tune.c +++ b/ggml-tune.c @@ -116,26 +116,39 @@ int ggml_mulmat_tune_validate(struct ggml_mulmat_tune *tune, const char *model, const char *backend_vendor = ggml_get_backend_vendor(); int rc = 0; + char err_buf[1024]; - if (strcmp(model, tune->model) != 0) { + if (tune->version != GGML_MULMAT_TUNE_VERSION) { + snprintf( + err_buf, sizeof(err_buf), + "version mismatch, please re-run bench. current: %d, incoming: %d", + GGML_MULMAT_TUNE_VERSION, tune->version); rc = -1; - } else if (type != tune->type) { + } else if (strcmp(model, tune->model) != 0) { + snprintf(err_buf, sizeof(err_buf), + "model mismatch. current: %s, incoming: %s", model, + tune->model); rc = -2; - } else if ((int)backend != tune->backend) { + } else if (type != tune->type) { + snprintf(err_buf, sizeof(err_buf), + "type mismatch. current: %d, incoming: %d", type, + tune->type); rc = -3; + } else if ((int)backend != tune->backend) { + snprintf(err_buf, sizeof(err_buf), + "backend mismatch. current: %d, incoming: %d", backend, + tune->backend); + rc = -4; } else if (backend_vendor == NULL || strcmp(backend_vendor, tune->backend_vendor) != 0) { - rc = -4; - } else { - // TODO + rc = -5; + snprintf(err_buf, sizeof(err_buf), + "backend vendor mismatch. 
current: %s, incoming: %s", + backend_vendor, tune->backend_vendor); } if (rc != 0) { - printf("model: %s, tune model: %s\n", model, tune->model); - printf("type: %d, tune type: %d\n", type, tune->type); - printf("backend: %d, tune backend: %d\n", backend, tune->backend); - printf("backend vendor: %s, tune backend vendor: %s\n", backend_vendor, - tune->backend_vendor); + fprintf(stderr, "mulmat-tune: error: %s\n", err_buf); } return rc; @@ -572,21 +585,25 @@ void ggml_mulmat_init_task_profiles(enum ggml_backend backend) { p[1].stages[1].backend = backend; p[1].stages[1].wait = true; - } else if (backend == GGML_BACKEND_BLAS) { - ggml_mulmat_task_profiles_qxx_n = 3; - - p[0].stages[0].backend = GGML_BACKEND_CPU; - p[0].stages[1].backend = GGML_BACKEND_CPU; - p[0].stages[1].parallel = true; - - p[1].stages[1].backend = backend; - p[1].stages[1].wait = true; - - p[2].stages[0].backend = GGML_BACKEND_CPU; - p[2].stages[0].parallel = true; - p[2].stages[1].backend = backend; - p[2].stages[1].wait = true; + int i = 0; + p[i].stages[0].backend = GGML_BACKEND_CPU; + p[i].stages[1].backend = GGML_BACKEND_CPU; + p[i].stages[1].parallel = true; + ++i; + + // p[i].stages[1].backend = backend; + // p[i].stages[1].wait = true; + // ++i; + + p[i].stages[0].backend = GGML_BACKEND_CPU; + p[i].stages[0].parallel = true; + // p[i].stages[1].tune_hint = GGML_TUNE_HINT_CONSTANT; + p[i].stages[1].backend = backend; + p[i].stages[1].wait = true; + ++i; + + ggml_mulmat_task_profiles_qxx_n = i; } else { fprintf(stderr, "invalid backend: %d\n", backend); GGML_ASSERT(false); diff --git a/ggml-tune.h b/ggml-tune.h index 76bfca2478a658..6ea3ca2cc6c246 100644 --- a/ggml-tune.h +++ b/ggml-tune.h @@ -10,15 +10,24 @@ extern "C" { #endif -#define MAX_NUM_PASS 5 - +#define GGML_MULMAT_TUNE_VERSION 2 #define GGML_MULMAT_N_SHAPES 6 + +#define GGML_MULMAT_MAX_PASS 5 #define GGML_MULMAT_MAX_PROFILES 8 +enum ggml_mulmat_tune_hint { + GGML_TUNE_HINT_UNKNOWN = 0, + GGML_TUNE_HINT_CONSTANT = 1, +}; + struct 
ggml_task_stage { enum ggml_backend backend; bool parallel; bool wait; + + // TODO: experimental, may be moved to other place. + // enum ggml_mulmat_tune_hint tune_hint; }; struct ggml_task_profile { diff --git a/llama.cpp b/llama.cpp index 8002b25adb96a7..8972591c6ad725 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2330,7 +2330,7 @@ struct llama_context * llama_init_from_file( ctx->mm_tune = (struct ggml_mulmat_tune *)malloc(sizeof(struct ggml_mulmat_tune)); if (ctx->mm_tune == nullptr) { - fprintf(stderr, "\nERROR: failed to allocate memory for struct ggml_mulmat_tune\n"); + fprintf(stderr, "ERROR: failed to allocate memory for struct ggml_mulmat_tune\n"); return nullptr; } @@ -2343,13 +2343,13 @@ struct llama_context * llama_init_from_file( char buf[128]; GGML_ASSERT(strlen(env_dir) < sizeof(buf) - 10); // TODO: take care the path separator for Windows. - snprintf(buf, sizeof(buf), "%s/%s.%s", env_dir, model_name, type_name); + snprintf(buf, sizeof(buf), "%s/%s.%s.txt", env_dir, model_name, type_name); file = buf; } FILE *fp = fopen(file, "r"); if (!fp) { - fprintf(stderr, "\nWARN: mulmat-tune: failed to open file %s, ignore.\n", file); + fprintf(stderr, "mulmat-tune: failed to open file %s, ignore.\n", file); free(ctx->mm_tune); ctx->mm_tune = NULL; } else { @@ -2358,24 +2358,16 @@ struct llama_context * llama_init_from_file( if (rc != 0) { free(ctx->mm_tune); - fprintf(stderr, "\nERROR: mulmat-tune: failed to load file %s, error code: %d\n", file, rc); + fprintf(stderr, "mulmat-tune: failed to load file %s, error code: %d\n", file, rc); return nullptr; } - fprintf(stderr, "\nINFO: mulmat-tune: loaded file %s\n", file); + fprintf(stderr, "mulmat-tune: loaded file %s\n", file); rc = ggml_mulmat_tune_validate(ctx->mm_tune, model_name, type); if (rc != 0) { free(ctx->mm_tune); - const char *err = "unknown"; - switch (rc) { - case -1: err = "model mismatch"; break; - case -2: err = "type mismatch"; break; - case -3: err = "backend mismatch"; break; - case -4: err = 
"backend vendor mismatch"; break; } - fprintf(stderr, "\nERROR: mulmat-tune: failed to validate file %s: %s\n", file, err); - return nullptr; + return nullptr; } } }