From 213f133701e947a94bdd070315cc0528429bd27d Mon Sep 17 00:00:00 2001 From: mqy Date: Wed, 14 Jun 2023 18:33:14 +0800 Subject: [PATCH 01/24] initial --- .gitignore | 1 + CMakeLists.txt | 9 + Makefile | 31 +- examples/CMakeLists.txt | 1 + examples/common.cpp | 14 + examples/common.h | 2 + examples/main/main.cpp | 10 + examples/mulmat-tune/CMakeLists.txt | 14 + examples/mulmat-tune/README.md | 272 ++++++++ examples/mulmat-tune/mulmat-tune.cpp | 277 ++++++++ examples/perplexity/perplexity.cpp | 10 + ggml-cuda.cu | 2 +- ggml-opencl.cpp | 2 +- ggml-threading.c | 620 +++++++++++++++++ ggml-threading.h | 68 ++ ggml-tune.c | 897 ++++++++++++++++++++++++ ggml-tune.h | 137 ++++ ggml.c | 996 ++++++++++++++------------- ggml.h | 71 +- llama.cpp | 160 ++++- llama.h | 3 + tests/.gitignore | 2 + tests/CMakeLists.txt | 2 + tests/test-ggml-threading.c | 345 ++++++++++ tests/test-ggml-tune.c | 200 ++++++ 25 files changed, 3646 insertions(+), 500 deletions(-) create mode 100644 examples/mulmat-tune/CMakeLists.txt create mode 100644 examples/mulmat-tune/README.md create mode 100644 examples/mulmat-tune/mulmat-tune.cpp create mode 100644 ggml-threading.c create mode 100644 ggml-threading.h create mode 100644 ggml-tune.c create mode 100644 ggml-tune.h create mode 100644 tests/.gitignore create mode 100644 tests/test-ggml-threading.c create mode 100644 tests/test-ggml-tune.c diff --git a/.gitignore b/.gitignore index e7bfd52e3d63c..c2e2a0ab0ca32 100644 --- a/.gitignore +++ b/.gitignore @@ -40,6 +40,7 @@ models/* /server /Pipfile /libllama.so +/mulmat-tune build-info.h arm_neon.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 73677195404e3..832c1e986a6eb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,6 +78,7 @@ option(LLAMA_K_QUANTS "llama: use k-quants" option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" OFF) +option(LLAMA_MULMAT_TUNE "llama: mulmat tune" OFF) # # Build info header @@ -214,6 +215,7 @@ if (LLAMA_BLAS) message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") add_compile_options(${BLAS_LINKER_FLAGS}) add_compile_definitions(GGML_USE_OPENBLAS) + add_compile_definitions(GGML_BLAS_VENDOR="${LLAMA_BLAS_VENDOR}") set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES}) set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) @@ -276,6 +278,11 @@ if (LLAMA_METAL) ) endif() +if (LLAMA_MULMAT_TUNE) + add_compile_definitions(GGML_USE_MULMAT_TUNE) + add_compile_definitions(GGML_MULMAT_TUNE_NDEBUG) +endif() + if (LLAMA_K_QUANTS) set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h) add_compile_definitions(GGML_USE_K_QUANTS) @@ -450,6 +457,8 @@ endif() add_library(ggml OBJECT ggml.c + ggml-threading.c + ggml-tune.c ggml.h ${GGML_SOURCES_CUDA} ${GGML_SOURCES_OPENCL} diff --git a/Makefile b/Makefile index afd06e0a60282..a8d1bdc0991ae 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple mulmat-tune ifdef LLAMA_BUILD_SERVER BUILD_TARGETS += server @@ -47,7 +47,8 @@ endif OPT = -O3 CFLAGS = -I. $(OPT) -std=c11 -fPIC CXXFLAGS = -I. 
-I./examples $(OPT) -std=c++11 -fPIC -LDFLAGS = +# -lm fixed error: ggml.o: undefined reference to symbol 'tanhf@@GLIBC_2.2.5' from ubuntu 22.04 +LDFLAGS = -lm ifdef LLAMA_DEBUG CFLAGS += -O0 -g @@ -134,8 +135,7 @@ ifndef LLAMA_NO_K_QUANTS endif ifndef LLAMA_NO_ACCELERATE - # Mac M1 - include Accelerate framework. - # `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time). + # Mac Intel & M1 - include Accelerate framework. ifeq ($(UNAME_S),Darwin) CFLAGS += -DGGML_USE_ACCELERATE LDFLAGS += -framework Accelerate @@ -145,10 +145,16 @@ endif # LLAMA_NO_ACCELERATE ifdef LLAMA_OPENBLAS CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas LDFLAGS += -lopenblas + ifeq ($(UNAME_S),Darwin) + # openblas installed with Homebew on macOS. + CFLAGS += -I/usr/local/opt/openblas/include + LDFLAGS += -L/usr/local/opt/openblas/lib + endif endif # LLAMA_OPENBLAS ifdef LLAMA_BLIS CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis + CFLAGS += -DGGML_BLAS_VENDOR="\"BLIS\"" LDFLAGS += -lblis -L/usr/local/lib endif # LLAMA_BLIS @@ -230,6 +236,11 @@ k_quants.o: k_quants.c k_quants.h $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_NO_K_QUANTS +ifdef LLAMA_MULMAT_TUNE + CFLAGS += -DGGML_USE_MULMAT_TUNE -DGGML_MULMAT_TUNE_NDEBUG + CXXFLAGS += -DGGML_USE_MULMAT_TUNE +endif + # # Print build information # @@ -245,6 +256,8 @@ $(info I CC: $(CCV)) $(info I CXX: $(CXXV)) $(info ) +OBJS += ggml-tune.o ggml-threading.o + # # Build library # @@ -253,7 +266,12 @@ ggml.o: ggml.c ggml.h ggml-cuda.h $(CC) $(CFLAGS) -c $< -o $@ llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h - $(CXX) $(CXXFLAGS) -c $< -o $@ + +ggml-threading.o: ggml-threading.c ggml.h + $(CC) $(CFLAGS) -c $< -o $@ + +ggml-tune.o: ggml-tune.c ggml.h + $(CC) $(CFLAGS) -c $< -o $@ common.o: examples/common.cpp examples/common.h $(CXX) $(CXXFLAGS) -c $< -o $@ @@ -298,6 +316,9 @@ server: examples/server/server.cpp examples/server/httplib.h examples/server/jso train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +mulmat-tune: examples/mulmat-tune/mulmat-tune.cpp build-info.h ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o mulmat-tune $(LDFLAGS) + build-info.h: $(wildcard .git/index) scripts/build-info.sh @sh scripts/build-info.sh > $@.tmp @if ! 
cmp -s $@.tmp $@; then \ diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index cf9c4a2231337..cf01b8a2adb90 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -39,6 +39,7 @@ else() add_subdirectory(baby-llama) add_subdirectory(train-text-from-scratch) add_subdirectory(simple) + add_subdirectory(mulmat-tune) if (LLAMA_METAL) add_subdirectory(metal) endif() diff --git a/examples/common.cpp b/examples/common.cpp index fed24e027d8a8..882e90c9c3649 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -345,6 +345,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.mem_test = true; } else if (arg == "--export") { params.export_cgraph = true; +#ifdef GGML_USE_MULMAT_TUNE + } else if (arg == "--tune") { + params.tune = true; + } else if (arg == "--tune-file") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.tune_file = argv[i]; +#endif // GGML_USE_MULMAT_TUNE } else if (arg == "--verbose-prompt") { params.verbose_prompt = true; } else if (arg == "-r" || arg == "--reverse-prompt") { @@ -498,6 +508,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { #endif fprintf(stderr, " --mtest compute maximum memory usage\n"); fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n"); +#ifdef GGML_USE_MULMAT_TUNE + fprintf(stderr, " --tune mulmat tune enable. If tune-file is set then exit after bench\n"); + fprintf(stderr, " --tune-file FILE mulmat tune data file. If tune is true, then write bench result to this file, else load the file and run\n"); +#endif fprintf(stderr, " --verbose-prompt print prompt before generation\n"); fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); diff --git a/examples/common.h b/examples/common.h index 6c2953cb2a7c6..5e394b2186c8d 100644 --- a/examples/common.h +++ b/examples/common.h @@ -77,6 +77,8 @@ struct gpt_params { bool mem_test = false; // compute maximum memory usage bool export_cgraph = false; // export the computation graph bool verbose_prompt = false; // print prompt tokens before generation + bool tune = false; // mulmat tune: enable + std::string tune_file = ""; // mulmat tune: data file }; bool gpt_params_parse(int argc, char ** argv, gpt_params & params); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 941312f9cc756..542e463bfe84e 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -117,6 +117,16 @@ int main(int argc, char ** argv) { return 1; } +#ifdef GGML_USE_MULMAT_TUNE + if (params.tune || !params.tune_file.empty()) { + bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str()); + if (!ok || (params.tune && !params.tune_file.empty())) { + llama_free(ctx); + return ok? 
0: 1;
+        }
+    }
+#endif
+
     // print system information
     {
         fprintf(stderr, "\n");
diff --git a/examples/mulmat-tune/CMakeLists.txt b/examples/mulmat-tune/CMakeLists.txt
new file mode 100644
index 0000000000000..51e1053e87e07
--- /dev/null
+++ b/examples/mulmat-tune/CMakeLists.txt
@@ -0,0 +1,14 @@
+set(TARGET mulmat-tune)
+add_executable(${TARGET} mulmat-tune.cpp)
+
+if (XCODE OR MSVC)
+    set(MULMAT_TUNE_LIBS ggml)
+else()
+    set(MULMAT_TUNE_LIBS ggml m)
+endif()
+
+target_link_libraries(${TARGET} PRIVATE ${MULMAT_TUNE_LIBS} ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
diff --git a/examples/mulmat-tune/README.md b/examples/mulmat-tune/README.md
new file mode 100644
index 0000000000000..cff8a3d6467ea
--- /dev/null
+++ b/examples/mulmat-tune/README.md
@@ -0,0 +1,272 @@
+# Mulmat Benchmark and Tuning
+
+Apart from the standalone tool `mulmat-tune`, mulmat tune is also integrated into
+`main` and `perplexity`. To avoid adding too many new cli options, I added just
+two. To make it run faster, `m_num` is set to 8 (so the max M is 128) and
+`n_pass` is set to 1.
+
+With the newly added cli options, we can use `main` and `perplexity` in the
+following three ways:
+
+* bench and run: --tune
+* bench and exit: --tune --tune-file
+* load and run: --tune-file
+
+The `load` mode reads an existing data file. This is fine because we can run the
+bench ahead of time (saving tens of seconds), but it has two shortcomings:
+- we have to re-run the bench whenever the file format changes; at least we are
+  aware of that when it happens.
+- more subtly, the algorithm may change silently while we keep loading an
+  outdated file. That is why I also integrated mulmat tune into `main` and
+  `perplexity` as a complementary solution.
+
+## Build into main and perplexity
+
+Makefile:
+```
+make clean && LLAMA_MULMAT_TUNE=1 make
+```
+
+CMake (with BLAS):
+```
+cmake --build . --target clean
+cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON
+cmake --build . --config Release
+```
+
+Run examples:
+
+```
+# bench and run:
+
+./main -m ./models/3B/open-llama-3b-q4-0.bin -c 512 -b 1024 -n 256 --keep 48 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt -t 4 --tune
+
+# bench then exit:
+./main -m ./models/3B/open-llama-3b-q4-0.bin --tune --tune-file
+
+# load and run
+
+./main -m ./models/3B/open-llama-3b-q4-0.bin -c 512 -b 1024 -n 256 --keep 48 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt -t 4 --tune-file
+```
+
+# Build the standalone `mulmat-tune`
+
+Makefile:
+```
+make clean && LLAMA_MULMAT_TUNE=1 make
+```
+
+CMake (with BLAS):
+```
+cmake --build . --target clean
+cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON
+cmake --build . --config Release
+```
+
+Run examples:
+
+```
+./mulmat-tune -h
+
+# run with default params (7B, Q4_0, ...)
+./mulmat-tune
+
+# set model
+./mulmat-tune --model 13B
+
+# set ggml ftype, 2 for Q4_0, 3 for Q4_1, run `mulmat-tune -h` for help.
+./mulmat-tune --ftype 3
+
+# customized m_num
+./mulmat-tune --m_num 8
+
+# customized n_pass: number of passes to run, between 1 and 3.
+./mulmat-tune --n_pass 1
+
+# customized n_threads instead of the default 1.
+./mulmat-tune --n_threads 4
+
+# save to file
+./mulmat-tune --file
+
+# save to file, overwrite without asking if the file exists (CAUTION!)
+./mulmat-tune --file -y
+
+```
+
+# End to End Test
+
+## Compare With Master
+
+You may want to run the following commands. Make sure the tune result file is
+set up properly.
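+For example, one way to prepare it ahead of time is shown below. The file name
+`mulmat-tune-7b-q4_0.txt` is only a placeholder; keep the model, ftype and
+thread count consistent with the later runs, since the loader validates them:
+
+```
+# bench with the standalone tool and write the result file (placeholder name)
+./mulmat-tune --model 7B --ftype 2 --n_threads 4 --file mulmat-tune-7b-q4_0.txt
+
+# or bench with main itself, write the file and exit
+./main -m models/7B/ggml-model-q4_0.bin -t 4 --tune --tune-file mulmat-tune-7b-q4_0.txt
+
+# later: load the file and run
+./main -m models/7B/ggml-model-q4_0.bin -t 4 --tune-file mulmat-tune-7b-q4_0.txt
+```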
+ +General steps: + +1. run `./mulmat-tune -h` to see how to build for misc vendors. + you can build with `GGML_MULMAT_TUNE_NDEBUG=` to enable the the debug, e.g: + ``` + make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1 LLAMA_NO_ACCELERATE=1 LLAMA_CLBLAST=1 make + ``` + On `macOS`, `ACCELERATE` is enabled by default. When `ACCELERATE` is built along + with `CUDA` or `CL`, you may not see `CUDA` or `CL` from debug because `CPU` + or `CPU_BLAS` is more faster (as of the estimation from mulmat tune). +2. create a small prompt file: + ``` + head -n 5 ./models/wikitext-2-raw/wiki.valid.raw > ./models/wiki.valid-5.raw + ``` +3. run any of the following example commands. + ``` + ./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 1 -b 32 + ./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 4 -b 64 + ``` + * `--mlock` is recommended for `macOS`, you may not want to use it. + * don't change `-c 128`: too large `context size` causes 0 perplexity trunk. + * `-t` is the number of threads, recommend `1`, `2`, `4` or `6`. + * you can change the batch size (`-b`) between `1` and `128`. + * you may want to add other cli options. + +The following results are generated with Accelerate compiled. + +### 1 thread + +**Master (2d43387d)** + +``` +| M | perplexity (seconds per pass) | prompt eval time (ms per token) | +| --- | --------------- | +| 8 | 43.53 | 339.95 | +| 16 | 44.31 | 346.12 | +| 24 | 43.14 | 336.90 | +| 32 | 33.59 | 262.25 | +| 40 | 27.64 | 215.77 | +| 48 | 24.52 | 191.42 | +``` + +**This branch (tune)** + +``` +| M | perplexity (seconds per pass) | prompt eval time (ms per token) | +| --- | --------------- | +| 8 | 43.78 | 341.96 | +| 16 | 42.88 | 334.93 | +| 24 | 42.06 | 328.42 | +| 32 | 33.07 | 258.25 | +| 40 | 28.69 | 223.98 | +| 48 | 25.65 | 200.19 | +``` + +### 4 threads + +**Master (2d43387d)** + +``` +| M | perplexity (seconds per pass) | prompt eval time (ms per token) | +| --- | --------------- | +| 8 | 12.43 | 96.99 | +| 16 | 12.10 | 94.44 | +| 24 | 12.81 | 99.95 | +| 32 | 31.64 | 247.04 | +| 48 | 24.55 | 191.63 | +| 64 | 17.56 | 137.09 | +| 96 | 17.59 | 137.25 | +| 128 | 10.73 | 83.74 | +``` + +**This branch (no tune)** + +``` +| M | perplexity (seconds per pass) | prompt eval time (ms per token) | +| --- | --------------- | +| 8 | 12.31 | 96.07 | +| 16 | 12.00 | 93.63 | +| 24 | 12.07 | 94.15 | +| 32 | 20.34 | 158.76 | +| 48 | 15.86 | 123.73 | +| 64 | 10.98 | 85.69 | +| 96 | 11.24 | 87.66 | +| 128 | 7.53 | 58.77 | +``` + +**This branch (tune)** + +``` +| M | perplexity (seconds per pass) | prompt eval time (ms per token) | +| --- | --------------- | +| 8 | 12.48 | 97.37 | +| 16 | 12.26 | 95.70 | +| 24 | 12.25 | 95.53 | +| 32 | 11.98 | 93.58 | +| 48 | 12.57 | 98.12 | +| 64 | 11.28 | 88.05 | +| 96 | 9.55 | 74.53 | +| 128 | 7.51 | 58.61 | +``` + +# Bench Data Format + +**Example** + +``` +5 3B 2 6 1 + +3200 3200 2 0 3 10 +16 0 0 0 16 1 0 1 0 0 0 0 +16 1 0 2 17 0 1 0 0 0 0 0 + 0 0 0 0 34 0 1 0 0 0 0 0 + 1 1 793 0 9103 2102 0 0 6014 0 + 2 2 1591 0 8034 2305 0 0 30982 0 + 4 4 2236 0 6476 2484 0 0 31388 0 + 8 7 4161 0 6623 2389 0 0 29204 0 + 16 15 8339 0 6434 2752 0 0 34303 0 + 32 32 16919 0 6915 3651 0 0 42511 0 + 64 200 34270 0 6574 4528 0 0 68212 0 + 128 188 69400 0 6325 6839 0 0 74437 0 + 256 303 134597 0 6168 11544 0 0 110180 0 + 512 687 279685 0 6337 29712 0 0 159728 0 + +3200 8640 2 0 2 10 + + ... 
+ + ``` + +**Informal Explanation** + +``` +head +groups+ + +head := version model ggml_ftype n_shapes n_threads +shape+ + +# head +version: 1 +model: "3B" | "7B" | "13B" | "30B" | "65B" +ggml_ftype: 0 - 4, 7 - 14 +n_shapes: number of shapes +n_threads: number of threads + +shape := N K m_num n_profiles +task_conf_profile+ +bench_item+ + +task_conf_profile: stage_conf(init) stage_conf(compute) stage_conf(finalize) +stage_conf: backend parallel wait +backend: 0 (NONE) | 16 (CPU) | 17 (CPU_BLAS) | 32 (GPU) | 33 (GPU_CUDA) | 34 (GPU_CL) +parallel: 0 (false) | 1 (true) +wait: 0 (false) | 1 (true) + +bench_item: M profile_time+ +profile_time := stage_time[3] +stage_time[3]: init_time, compute_time, finalize_time +``` + +A task stage is invalid if it's backend equals to `GGML_TASK_BACKEND_NONE`. +Time unit is `us`. A column is all zeros when that stage does not exist. + +# NOTE + +1. "3B" is [open-llama 3B](https://github.com/ggerganov/llama.cpp/pull/1588). +2. Model names are subject to change: we may support something like X-3B, Y-4B, ... +3. As of Jun 1, this tool is still in early stage, will be changed frequently in + recent couple of days (or weeks). diff --git a/examples/mulmat-tune/mulmat-tune.cpp b/examples/mulmat-tune/mulmat-tune.cpp new file mode 100644 index 0000000000000..62f1da27764b9 --- /dev/null +++ b/examples/mulmat-tune/mulmat-tune.cpp @@ -0,0 +1,277 @@ +#include +#include +#include +#include +#include + +#include "build-info.h" +#include "ggml-tune.h" +#include "ggml.h" +#include "llama.h" + +#define UNUSED(x) (void)(x) + +static void print_build_tips(void) { + const char *a = "LLAMA_NO_ACCELERATE"; + fprintf(stderr, "Tips on how to build with various backend vendors:\n\n"); + fprintf(stderr, "CUDA: make clean; LLAMA_CUBLAS=1 make\n"); + fprintf(stderr, "CL: make clean; LLAMA_CLBLAST=1 make\n"); + fprintf(stderr, "Accelerate: make clean; %s= make\n", a); + fprintf(stderr, "OpenBLAS: make clean; %s=1 LLAMA_OPENBLAS=1 make\n", a); + fprintf(stderr, "BLIS: make clean; %s=1 LLAMA_BLIS=1 make\n", a); + fprintf(stderr, "\n"); + fprintf(stderr, "NOTE: for CUDA/CL, use %s=1 to disable ACCELERATE\n", a); +} + +static bool prompt_yes_no(const char *prompt) { + char buf[2]; + while (true) { + fprintf(stderr, "%s (Y|n)\n", prompt); + buf[0] = 0; + buf[1] = 0; + int i = 0; + int c = 0; + + while (c != '\n') { + c = fgetc(stdin); + buf[i % 2] = c; + i++; + } + if (i == 1) { + if (buf[0] == '\n') { + return true; + } + } else if (i == 2) { + if (buf[0] == 'Y' || buf[0] == 'y') { + return true; + } + if (buf[0] == 'N' || buf[0] == 'n') { + return false; + } + } + } +} + +static void cmd_analyze(struct ggml_mulmat_tune *tune); + +static void usage(char *prog) { + const char *usage_lines[] = { + "usage: %s args", + "", + "bench [-m MODEL] [-t TYPE] [-f FILE] [-y]", + "--model MODEL 3B | 7B | 13B | 30B | 65B", + " default 7B", + "--ftype FTYPE ggml ftype:", + " 0: all F32", + " 1: mostly F16", + " 2: mostly Q4_0", + " 3: mostly Q4_1", + " 4: mostly Q4_1, some F16", + " 7: mostly Q8_0", + " 8: mostly Q5_0", + " 9: mostly Q5_1", + " 10: mostly Q2_K", + " 11: mostly Q3_K", + " 12: mostly Q4_K", + " 13: mostly Q5_K", + " 14: mostly Q6_K", + " default 2 (mostly Q4_0)", + "--m_num M_NUM number of M, the max M = 2^(M_NUM-1)", + " requires between [6, 12]", + " default 10", + "--n_pass PASS number of passes to run", + " default 1", + " requires: between [1, 3]", + "--n_threads NTH bench with this number of threads", + " requires: between [1, 16]", + " default 1", + "--file FILE data file to write", + " 
default stdout", + "-y always answer \"yes\" to all prompts", + }; + + int len = (int)(sizeof(usage_lines) / sizeof(char *)); + for (int i = 0; i < len; i++) { + const char *line = usage_lines[i]; + if (i == 0) { + fprintf(stderr, line, prog); + } else { + fprintf(stderr, "%s\n", line); + } + } + + printf("\n"); + print_build_tips(); + printf("\n"); +} + +int main(int argc, char **argv) { + if (argc == 2) { + if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) { + usage(argv[0]); + return 0; + } + } + + int arg_start = 1; + + const char *arg_model = NULL; + const char *arg_ftype = NULL; + const char *arg_m_num = NULL; + const char *arg_n_threads = NULL; + const char *arg_n_pass = NULL; + const char *arg_file = NULL; + bool always_yes = false; + + for (int i = arg_start; i < argc; i++) { + if (strcmp(argv[i], "--model") == 0) { + if (i + 1 < argc) { + arg_model = argv[i + 1]; + ++i; + } + } else if (strcmp(argv[i], "--ftype") == 0) { + if (i + 1 < argc) { + arg_ftype = argv[i + 1]; + ++i; + } + } else if (strcmp(argv[i], "--m_num") == 0) { + if (i + 1 < argc) { + arg_m_num = argv[i + 1]; + ++i; + } + } else if (strcmp(argv[i], "--n_pass") == 0) { + if (i + 1 < argc) { + arg_n_pass = argv[i + 1]; + ++i; + } + } else if (strcmp(argv[i], "--n_threads") == 0) { + if (i + 1 < argc) { + arg_n_threads = argv[i + 1]; + ++i; + } + } else if (strcmp(argv[i], "--file") == 0) { + if (i + 1 < argc) { + arg_file = argv[i + 1]; + ++i; + } + } else if (strcmp(argv[i], "-y") == 0) { + always_yes = true; + } else { + fprintf(stderr, "invalid arg: %s\n", argv[i]); + usage(argv[0]); + return 1; + } + } + + enum ggml_ftype ftype = GGML_FTYPE_MOSTLY_Q4_0; + { + if (arg_ftype != NULL) { + int v = atoi(arg_ftype); + ftype = (enum ggml_ftype)v; + } + + if (ftype > GGML_FTYPE_MOSTLY_Q5_1) { + fprintf(stderr, "k_quants type %d is not implemented\n", ftype); + return 1; + } + } + + if (arg_file != NULL && !always_yes) { + struct stat st; + int rc = stat(arg_file, &st); + UNUSED(st); + if (rc == 0) { // prompt + size_t len = strlen(arg_file) + 50; + char *prompt = (char *)malloc(len); + GGML_ASSERT(prompt); + snprintf(prompt, len, "data file '%s' exists, override?", arg_file); + + if (!prompt_yes_no(prompt)) { + printf("Aborted.\n"); + return 1; + } + free(prompt); + } + } + + int m_num = 10; + { + if (arg_m_num != NULL) { + int v = atoi(arg_m_num); + m_num = v; + } + + if (m_num < 6 || m_num > 12) { + fprintf(stderr, "invalid m_num: %d, expect between [6, 12]\n", + m_num); + usage(argv[0]); + return 1; + } + } + + int n_pass = 1; + { + if (arg_n_pass != NULL) { + int v = atoi(arg_n_pass); + n_pass = v; + } + if (n_pass < 1 || n_pass > GGML_MULMAT_MAX_PASS) { + fprintf(stderr, "invalid n_pass: %d, expect between [1, %d]\n", + n_pass, GGML_MULMAT_MAX_PASS); + usage(argv[0]); + return 1; + } + } + + int n_threads = 1; + { + if (arg_n_threads != NULL) { + int v = atoi(arg_n_threads); + n_threads = v; + if (n_threads < 1 || n_threads > 16) { + fprintf(stderr, + "invalid n_threads: %d, expect between [1, 16]\n", + n_threads); + usage(argv[0]); + return 1; + } + } + } + + const char *model_name = "7B"; + { + if (arg_model != NULL) { + model_name = arg_model; + } + } + + // Let init message print earlier. 
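+    // (The 1-byte context below is created and freed right away; it only
+    // forces ggml's one-time initialization so that any init-time output
+    // shows up before the bench progress lines.)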
+ { + struct ggml_init_params init_params = { + /*.mem_size =*/1, + /*.mem_buffer =*/NULL, + /*.no_alloc =*/0, + }; + struct ggml_context *ctx = ggml_init(init_params); + GGML_ASSERT(ctx); + ggml_free(ctx); + } + + struct ggml_mulmat_tune tune; + + struct ggml_mulmat_tune_params params; + memset(¶ms, 0, sizeof(struct ggml_mulmat_tune_params)); + + ggml_mulmat_init_task_profiles(); + + ggml_mulmat_tune_model_init(¶ms.model, model_name, ftype); + params.m_num = m_num; + params.n_pass = n_pass; + params.n_threads = n_threads; + params.progress = true; + params.output_console = true; + params.fname = arg_file; + + bool ok = ggml_mulmat_tune_bench(&tune, ¶ms); + return ok ? 0 : 1; +} diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index ae8cfe0afc0b7..1f14c18def3a3 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -158,6 +158,16 @@ int main(int argc, char ** argv) { return 1; } +#ifdef GGML_USE_MULMAT_TUNE + if (params.tune || !params.tune_file.empty()){ + bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str()); + if (!ok || (params.tune && !params.tune_file.empty())) { + llama_free(ctx); + return ok? 0: 1; + } + } +#endif + // print system information { fprintf(stderr, "\n"); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 16488b9f9067f..cf52109bce96e 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2571,7 +2571,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ func = ggml_cuda_rms_norm; break; case GGML_OP_MUL_MAT: - if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) { + if (!any_on_device/* && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)*/) { return false; } func = ggml_cuda_mul_mat; diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 95f4cec6dd59c..b2300a104ddb2 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1628,7 +1628,7 @@ bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_ } void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) { - GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst)); + // GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst)); if (src0->type == GGML_TYPE_F32) { ggml_cl_mul_mat_f32(src0, src1, dst); diff --git a/ggml-threading.c b/ggml-threading.c new file mode 100644 index 0000000000000..cf17793f6be61 --- /dev/null +++ b/ggml-threading.c @@ -0,0 +1,620 @@ + +#include +#include +#include + +#include "ggml-threading.h" +#include "ggml.h" + +#define UNUSED(x) (void)(x) + +// see https://github.com/ggerganov/llama.cpp/pull/1314 +#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) +#include +static inline void ggml_spin_pause(void) { _mm_pause(); } +#else +static inline void ggml_spin_pause(void) {} +#endif + +#if defined(_WIN32) + +#include + +typedef volatile LONG atomic_int; +typedef atomic_int atomic_bool; +typedef LONG atomic_flag; + +typedef CRITICAL_SECTION pthread_mutex_t; +typedef CONDITION_VARIABLE pthread_cond_t; +typedef void pthread_mutexattr_t; +typedef void pthread_condattr_t; + +typedef HANDLE pthread_t; + +static void atomic_store(atomic_int *ptr, LONG val) { + InterlockedExchange(ptr, val); +} + +static LONG atomic_load(atomic_int *ptr) { + return InterlockedCompareExchange(ptr, 0, 0); +} + +static LONG atomic_fetch_add(atomic_int *ptr, LONG inc) { + return InterlockedExchangeAdd(ptr, inc); +} + +static LONG atomic_fetch_sub(atomic_int 
*ptr, LONG dec) { + return atomic_fetch_add(ptr, -(dec)); +} + +static inline LONG atomic_flag_test_and_set(volatile atomic_flag *ptr) { + return InterlockedCompareExchange(ptr, 1, 0); +} +static inline LONG atomic_flag_clear(volatile atomic_flag *ptr) { + return InterlockedExchange(ptr, 0); +} +static int pthread_create(pthread_t *out, void *unused, + ggml_thread_ret_t (*func)(void *), void *arg) { + (void)unused; + HANDLE handle = + CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, arg, 0, NULL); + if (handle == NULL) { + return EAGAIN; + } + + *out = handle; + return 0; +} + +static int pthread_join(pthread_t thread, void *unused) { + (void)unused; + return (int)WaitForSingleObject(thread, INFINITE); +} + +static int pthread_mutex_init(pthread_mutex_t *mutex, + pthread_mutexattr_t *attr) { + (void)attr; + InitializeCriticalSection(mutex); + return 0; +} + +static int pthread_mutex_destroy(pthread_mutex_t *mutex) { + DeleteCriticalSection(mutex); + return 0; +} + +static int pthread_mutex_lock(pthread_mutex_t *mutex) { + EnterCriticalSection(mutex); + return 0; +} + +static int pthread_mutex_unlock(pthread_mutex_t *mutex) { + LeaveCriticalSection(mutex); + return 0; +} + +static int pthread_cond_init(pthread_cond_t *cond, pthread_condattr_t *attr) { + (void)attr; + InitializeConditionVariable(cond); + return 0; +} + +static int pthread_cond_destroy(pthread_cond_t *cond) { + (void)cond; + return 0; +} + +static int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex) { + SleepConditionVariableCS(cond, mutex, INFINITE); + return 0; +} + +static int pthread_cond_signal(pthread_cond_t *cond) { + WakeConditionVariable(cond); + return 0; +} + +static int pthread_cond_broadcast(pthread_cond_t *cond) { + WakeAllConditionVariable(cond); + return 0; +} + +static int sched_yield(void) { + // https://learn.microsoft.com/en-us/windows/win32/api/winnt/nf-winnt-yieldprocessor + YieldProcessor(); + return 0; +} + +#else // ! _WIN32 + +#include +#include +#include + +#endif + +// #define GGML_THREADING_DEBUG 1 + +#ifdef GGML_THREADING_DEBUG +#define PRINT_DEBUG(...) fprintf(stdout, __VA_ARGS__) +#else +#define PRINT_DEBUG(...) +#endif + +struct ggml_perf_stats { + int runs; + + // total cycles + atomic_int cycles; + + // total time in us. + atomic_int time_us; +}; + +struct ggml_compute_state_shared { + atomic_flag spin; + pthread_mutex_t mutex; + pthread_cond_t cond; + + // number of threads that has entered thread runner. + atomic_int n_ready; + + // number of assigned but unfinished tasks, workers decrease it. + atomic_int n_tasks; + + // number of waiting workers, workers increase it. + atomic_int n_waiting; + + // commands. + atomic_bool wait_now; + atomic_bool wait_on_done; + atomic_bool stop; + + ggml_threading_task_runner *task_runner; + + struct ggml_threading_context *ctx; +}; +struct ggml_compute_state { + pthread_t thrd; + + atomic_bool has_work; + struct ggml_compute_params params; + struct ggml_tensor *node; + + struct ggml_compute_state_shared *shared; +}; +struct ggml_threading_context { + int n_threads; + struct ggml_compute_state_shared shared; + struct ggml_compute_state *workers; + + enum ggml_threading_features features; + + struct ggml_perf_stats wait_perf; + struct ggml_perf_stats wakeup_perf; + + int64_t *stages_time; +}; + +// NOTE: ggml_spin_lock and ggml_spin_unlock may can be noop if +// feature wait_on_done is off. 
+static inline void ggml_spin_lock(volatile atomic_flag *obj) { + while (atomic_flag_test_and_set(obj)) { + ggml_spin_pause(); + } +} + +static inline void ggml_spin_unlock(volatile atomic_flag *obj) { + atomic_flag_clear(obj); +} + +static inline void ggml_perf_collect(struct ggml_perf_stats *st, int64_t c0, + int64_t t0) { + st->runs++; + st->cycles += (ggml_cycles() - c0); + st->time_us += (ggml_time_us() - t0); +} + +// A worker thread goes cond waiting. +// NOTE: must be protected by shared->spin +static void ggml_threading_cond_wait(struct ggml_compute_state *state) { + struct ggml_compute_state_shared *shared = state->shared; + + int64_t perf_cycles_0 = 0; + int64_t perf_time_0 = 0; + + if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) { + perf_cycles_0 = ggml_cycles(); + perf_time_0 = ggml_time_us(); + } + + GGML_ASSERT(pthread_mutex_lock(&shared->mutex) == 0); + + if (!shared->wait_now) { + GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0); + ggml_spin_unlock(&shared->spin); + return; + } + + shared->n_waiting++; + ggml_spin_unlock(&shared->spin); + + GGML_ASSERT(pthread_cond_wait(&shared->cond, &shared->mutex) == 0); + GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0); + + ggml_spin_lock(&shared->spin); + + shared->n_waiting--; + + if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) { + ggml_perf_collect(&shared->ctx->wait_perf, perf_cycles_0, perf_time_0); + } +} + +// Wakeup all workers. +// +// Workers takes some time to wakeup, and has to lock spin after wakeup. Yield +// is used to avoid signal frequently. Current implementation is highly +// experimental. See tests/test-ggml-threading.c for details. +// +// NOTE: must be protected by shared->spin +static void +ggml_threading_wakeup_workers(struct ggml_compute_state_shared *shared) { + int64_t perf_cycles_0 = 0; + int64_t perf_time_0 = 0; + + if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) { + perf_cycles_0 = ggml_cycles(); + perf_time_0 = ggml_time_us(); + } + + shared->wait_now = false; + + int loop_counter = 0; + int notify_counter = 0; + int64_t last_signal_time = 0; + + while (shared->n_waiting != 0) { + ggml_spin_unlock(&shared->spin); + + if (loop_counter > 0) { + ggml_spin_pause(); + if (loop_counter > 3) { + sched_yield(); + } + } + ++loop_counter; + + // TODO: should bench actual average wait/wakeup time. + if (last_signal_time > 0 && (ggml_time_us() - last_signal_time) < 10) { + continue; + } + + GGML_ASSERT(pthread_mutex_lock(&shared->mutex) == 0); + GGML_ASSERT(pthread_cond_broadcast(&shared->cond) == 0); + GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0); + ++notify_counter; + last_signal_time = ggml_time_us(); + + ggml_spin_lock(&shared->spin); + } + + if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) { + ggml_perf_collect(&shared->ctx->wakeup_perf, perf_cycles_0, + perf_time_0); + } + + // if (notify_counter > 1) { + // printf("%s: loop counter: %d, notify counter: %d\n", __func__, + // loop_counter, notify_counter); + // } + UNUSED(notify_counter); +} + +// Setup workers for a task stage. 
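+// For a parallel stage it wakes up any waiting workers (and, with the
+// wait-on-done feature, may ask them to wait again right after finishing);
+// for a serial stage marked `wait` it parks all workers in cond wait.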
+// NOTE: must be protected by shared->spin +static void ggml_threading_setup_workers(struct ggml_threading_context *ctx, + struct ggml_task_profile *profile, + enum ggml_task_type type) { + PRINT_DEBUG("[main] setup workers for task ...\n"); + +#ifdef GGML_THREADING_DEBUG + int64_t t0 = ggml_time_us(); +#endif + + const int n_worker_threads = ctx->n_threads - 1; + struct ggml_task_stage *current = &profile->stages[type]; + struct ggml_compute_state_shared *shared = &ctx->shared; + + if (current->parallel) { + if (shared->n_waiting > 0) { + ggml_threading_wakeup_workers(shared); + } + + if ((ctx->features & GGML_THREADING_FEATURE_WAIT_ON_DONE) > 0) { + // Optimize energy: wait_on_done. We MAY also check following nodes, + // but that's a bit complicated. + shared->wait_on_done = false; + for (int i = type + 1; i <= GGML_TASK_FINALIZE; i++) { + struct ggml_task_stage *next = &profile->stages[i]; + if (next->parallel) { + break; + } + if (next->wait) { + shared->wait_on_done = true; + PRINT_DEBUG("[main] wait_on_done is enabled for " + "current task stage\n"); + break; + } + } + } + } else if (current->wait) { + if (shared->n_waiting < n_worker_threads) { + shared->wait_now = true; + PRINT_DEBUG("[main] wait_now was set, expect %d workers wait\n", + n_worker_threads); + ggml_spin_unlock(&shared->spin); + + while (shared->n_waiting != n_worker_threads) { + ggml_spin_pause(); + } + + ggml_spin_lock(&shared->spin); + PRINT_DEBUG("[main] saw %d workers waiting\n", n_worker_threads); + } + } + + PRINT_DEBUG("[main] setup workers for task took %d us\n", + (int)(ggml_time_us() - t0)); +} + +ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) { + GGML_ASSERT(data); + struct ggml_compute_state *state = (struct ggml_compute_state *)data; + GGML_ASSERT(state); + + struct ggml_compute_state_shared *shared = state->shared; + GGML_ASSERT(shared); + GGML_ASSERT(shared->task_runner); + + shared->n_ready++; + + PRINT_DEBUG("[%d-th] running\n", state->params.ith); + + while (!shared->stop) { + if (shared->wait_now) { + ggml_spin_lock(&shared->spin); + if (!state->has_work) { + ggml_threading_cond_wait(state); + } + ggml_spin_unlock(&shared->spin); + } + + if (shared->n_tasks > 0 && state->has_work) { + enum ggml_compute_error err = + shared->task_runner(&state->params, state->node); + + GGML_ASSERT(err == GGML_COMPUTE_OK || err == GGML_COMPUTE_FALLBACK); + + ggml_spin_lock(&shared->spin); + + state->has_work = false; + shared->n_tasks--; + + bool wait = shared->wait_on_done && !state->has_work; + if (wait) { + ggml_threading_cond_wait(state); + } + + ggml_spin_unlock(&shared->spin); + + // no need to pause. + if (wait) { + continue; + } + } + + ggml_spin_pause(); + } + + PRINT_DEBUG("[%d-th] exited\n", state->params.ith); + return 0; +} + +enum ggml_compute_error +ggml_threading_compute_tensor(struct ggml_threading_context *ctx, + struct ggml_tensor *node, void *wdata, + size_t wsize) { + GGML_ASSERT(ctx); + GGML_ASSERT(node); + + GGML_ASSERT(ctx->shared.task_runner); + struct ggml_compute_state_shared *state_shared = &ctx->shared; + + // This is the params for main thread. 
+ struct ggml_compute_params params; + enum ggml_compute_error err; + + for (int type = GGML_TASK_INIT; type <= GGML_TASK_FINALIZE; type++) { + if (node->task_profile.stages[type].backend == GGML_TASK_BACKEND_NONE) { + continue; + } + + PRINT_DEBUG("[main] stage: %d\n", type); + + int64_t t_stage = 0; + if (ctx->stages_time) { + t_stage = ggml_time_us(); + } + + // n_tasks is the total number of parallel computing tasks + // (including main thread). + int n_tasks = + node->task_profile.stages[type].parallel ? ctx->n_threads : 1; + + ggml_spin_lock(&state_shared->spin); + + if (ctx->n_threads > 1) { + ggml_threading_setup_workers(ctx, &node->task_profile, type); + } + + if (n_tasks > 1) { + // setup compute task parameters. + for (int j = 0; j < n_tasks - 1; j++) { + ctx->workers[j].params = (struct ggml_compute_params){ + .type = type, + .ith = j + 1, + .nth = n_tasks, + .wsize = wsize, + .wdata = wdata, + }; + ctx->workers[j].node = node; + ctx->workers[j].has_work = true; + } + state_shared->n_tasks = n_tasks - 1; + PRINT_DEBUG("[main] assigned %d tasks\n", state_shared->n_tasks); + } + + ggml_spin_unlock(&state_shared->spin); + + // main thread always run the 0-th task. + // TODO: assert(params->nth == 1) instead of + // assert(params->ith == 0) + { + params.type = type; + params.ith = 0; + params.nth = n_tasks; + params.wsize = wsize; + params.wdata = wdata; + + err = state_shared->task_runner(¶ms, node); + } + + // wait for tasks done. + if (n_tasks > 1) { + while (state_shared->n_tasks != 0) { + ggml_spin_pause(); + } + } + + PRINT_DEBUG("[main] all tasks finished\n\n"); + + if (ctx->stages_time) { + ctx->stages_time[type] = ggml_time_us() - t_stage; + } + + if (err != GGML_COMPUTE_OK) { + return err; + } + } + + return GGML_COMPUTE_OK; +} + +struct ggml_threading_context * +ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread_runner, + ggml_threading_task_runner *task_stage_runner, + enum ggml_threading_features features, + int64_t stages_time[3]) { + GGML_ASSERT(n_threads > 0); + GGML_ASSERT(thread_runner); + GGML_ASSERT(task_stage_runner); + + size_t ctx_sz = sizeof(struct ggml_threading_context); + struct ggml_threading_context *ctx = malloc(ctx_sz); + GGML_ASSERT(ctx); + memset(ctx, 0, ctx_sz); + + ctx->shared = (struct ggml_compute_state_shared){ + .spin = {0}, + .n_ready = 0, + .n_tasks = 0, + .n_waiting = 0, + .wait_now = false, + .wait_on_done = false, + .stop = false, + .task_runner = task_stage_runner, + .ctx = ctx, + }; + + PRINT_DEBUG("[main] thread start, features: %d\n", features); + + ctx->n_threads = n_threads; + ctx->features = features; + ctx->stages_time = stages_time; + + int n_workers = n_threads - 1; + if (n_workers > 0) { + GGML_ASSERT(pthread_mutex_init(&ctx->shared.mutex, NULL) == 0); + GGML_ASSERT(pthread_cond_init(&ctx->shared.cond, NULL) == 0); + + size_t workers_sz = sizeof(struct ggml_compute_state) * n_workers; + struct ggml_compute_state *workers = malloc(workers_sz); + GGML_ASSERT(workers); + memset(workers, 0, workers_sz); + + for (int j = 0; j < n_workers; j++) { + workers[j].shared = &ctx->shared; + GGML_ASSERT(pthread_create(&workers[j].thrd, NULL, thread_runner, + &workers[j]) == 0); + } + + ctx->workers = workers; + + while (ctx->shared.n_ready != n_workers) { + ggml_spin_pause(); + } + } + + return ctx; +} + +static void +ggml_threading_print_perf_stats(struct ggml_threading_context *ctx) { + bool print_stats = ctx->features & GGML_THREADING_FEATURE_PERF; +#ifdef GGML_THREADING_DEBUG + print_stats = true; +#endif + + if 
(!print_stats) { + return; + } + + const char *prefix_arr[2] = {"[threading wait ]", "[threading wakeup]"}; + struct ggml_perf_stats *st_arr[2] = {&ctx->wait_perf, &ctx->wakeup_perf}; + for (int i = 0; i < 2; i++) { + struct ggml_perf_stats *st = st_arr[i]; + if (st->runs == 0) { + continue; + } + fprintf(stdout, + "%s runs: %4d, avg cycles: %8.3f ms, avg time: " + "%8.3f ms\n", + prefix_arr[i], st->runs, + 1.0 * st->cycles / (st->runs * ggml_cycles_per_ms()), + 1.0 * st->time_us / (st->runs * 1000)); + } +} + +void ggml_threading_stop(struct ggml_threading_context *ctx) { + GGML_ASSERT(ctx); + + if (ctx->workers) { + PRINT_DEBUG("[main] stopping thread pool ...\n"); + ctx->shared.stop = true; + + ggml_spin_lock(&ctx->shared.spin); + ggml_threading_wakeup_workers(&ctx->shared); + ggml_spin_unlock(&ctx->shared.spin); + + for (int j = 0; j < ctx->n_threads - 1; j++) { + GGML_ASSERT(pthread_join(ctx->workers[j].thrd, NULL) == 0); + } + free(ctx->workers); + PRINT_DEBUG("[main] thread pool stopped\n"); + } + + ggml_threading_print_perf_stats(ctx); + + free(ctx); +} diff --git a/ggml-threading.h b/ggml-threading.h new file mode 100644 index 0000000000000..f3214efc7cb7d --- /dev/null +++ b/ggml-threading.h @@ -0,0 +1,68 @@ +#pragma once + +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_WIN32) +typedef int ggml_thread_ret_t; +#else +typedef void *ggml_thread_ret_t; +#endif + +struct ggml_threading_context; + +// Optional (experimental) features. +enum ggml_threading_features { + GGML_THREADING_FEATURE_NONE = 0, + GGML_THREADING_FEATURE_WAIT_ON_DONE = 1 << 0, + GGML_THREADING_FEATURE_PERF = 1 << 1, +}; + +// Compute errors. +enum ggml_compute_error { + GGML_COMPUTE_OK = 0, + GGML_COMPUTE_FALLBACK = 1, +}; + +// The task runner to be called by main thread and workers. +typedef enum ggml_compute_error(ggml_threading_task_runner)( + struct ggml_compute_params *params, struct ggml_tensor *node); + +// The thread runner to feed into OS threads. +typedef ggml_thread_ret_t(ggml_threading_thread_runner)(void *data); + +// Init and start underlying workers if n_threads > 1. +// +// features: optional for configure threading additional features. +// see `ggml_threading_feature`, default 0. +// stages_time: optional for collecting per-stage wall clock time. +struct ggml_threading_context * +ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread, + ggml_threading_task_runner *task_stage_runner, + enum ggml_threading_features features, + int64_t stages_time[3]); + +// Stop workers (if exist), free memories (including the ctx). +void ggml_threading_stop(struct ggml_threading_context *ctx); + +// The default implementation of `ggml_threading_thread_runner` +ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data); + +// Compute a tensor. It computes the enabled task stages one by one. +// Caller should take care of the return error: retry for fallback error. +enum ggml_compute_error +ggml_threading_compute_tensor(struct ggml_threading_context *ctx, + struct ggml_tensor *node, void *wdata, + size_t wsize); + +// This is an experimental functionality for mulmat tune, as a thin wrapper. 
+enum ggml_compute_error +ggml_compute_forward_wrapper(struct ggml_compute_params *params, + struct ggml_tensor *tensor); + +#ifdef __cplusplus +} +#endif diff --git a/ggml-tune.c b/ggml-tune.c new file mode 100644 index 0000000000000..fbca953ed469e --- /dev/null +++ b/ggml-tune.c @@ -0,0 +1,897 @@ +#include + +#include "ggml-threading.h" +#include "ggml-tune.h" +#include "ggml.h" + +// MUL_MAT fine tunning for non-GPU-offloading cases. + +#define GGML_MULMAT_CACHE_LEN 16 +static struct mm_cache_element default_mm_cache[GGML_MULMAT_CACHE_LEN] = {0}; + +#define FNV_OFFSET 14695981039346656037UL +#define FNV_PRIME 1099511628211UL +static uint64_t ggml_mulmat_tune_cache_hash(int M, int N, int K) { + char buf[30]; + snprintf(buf, 30, "%d%d%d", M, N, K); + + uint64_t hash = FNV_OFFSET; + for (const char *p = buf; *p; p++) { + hash ^= (uint64_t)(unsigned char)(*p); + hash *= FNV_PRIME; + } + return hash; +} + +static const char * +ggml_mulmat_tune_task_backend_name(enum ggml_task_backend backend) { + switch (backend) { + case GGML_TASK_BACKEND_NONE: + return ""; + case GGML_TASK_BACKEND_CPU: + return "CPU"; + case GGML_TASK_BACKEND_CPU_BLAS: + return "BLAS"; + case GGML_TASK_BACKEND_GPU: + return "GPU"; + case GGML_TASK_BACKEND_GPU_CUDA: + return "CUDA"; + case GGML_TASK_BACKEND_GPU_CL: + return "CL"; + default: + GGML_ASSERT(false); + } +} + +const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile( + struct ggml_mulmat_tune *tune, int M, int N, int K, enum ggml_type src0_t, + enum ggml_type src1_t, int stages_time[3]) { + GGML_ASSERT(tune); + + // TODO: default_mm_cache is thread-unsafe. + struct mm_cache_element *mm_cache = default_mm_cache; + int slot = ggml_mulmat_tune_cache_hash(M, N, K) % GGML_MULMAT_CACHE_LEN; + struct mm_cache_element *e = &mm_cache[slot]; + + struct ggml_mulmat_tune_time profiles_time[GGML_MAX_TASK_PROFILES] = {0}; + + struct ggml_task_profile *prof = NULL; + + if (e->M == M && e->N == N && e->K == K) { + prof = e->profile; + if (stages_time) { + for (int i = 0; i < 3; i++) { + stages_time[i] = e->stages_time[i]; + } + } + } else { + const struct ggml_mulmat_tune_shape *shape = NULL; + shape = ggml_mulmat_tune_get_shape(tune, N, K, src0_t, src1_t); + if (shape) { + ggml_mulmat_tune_estimate_time(shape, M, profiles_time); + + int min = INT32_MAX; + int index = -1; + for (int i = 0; i < shape->n_profiles; i++) { + int total = profiles_time[i].total_time; + if (total < min) { + min = total; + index = i; + } + } + + if (index >= 0) { + prof = profiles_time[index].profile; + for (int i = 0; i < 3; i++) { + int t = profiles_time[index].stage_time[i]; + if (stages_time) { + stages_time[i] = t; + } + e->stages_time[i] = t; + } + + GGML_ASSERT(prof); + + e->profile = prof; + e->M = M; + e->N = N; + e->K = K; + + // to disable this, build with + // `make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1 + // make` +#if !defined(GGML_MULMAT_TUNE_NDEBUG) + const char *names[3]; + for (int i = 0; i < 3; i++) { + names[i] = ggml_mulmat_tune_task_backend_name( + prof->stages[i].backend); + } + printf( + "\n[mulmat tune] M: %3d, N: %5d, K: %5d, backends of the " + "fastest profile: %s %s %s\n", + M, N, K, names[0], names[1], names[2]); +#endif + } + } + } + + return prof; +} + +void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model, + const char *name, enum ggml_ftype ftype) { + const int n_vocab = 32000; + int n_embd; + // n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult + int n_ff; + // n_rot = n_embd/n_head; + int n_rot; + + if (strcmp(name, "3B") 
== 0) { + // n_head=32, n_mult=216, n_layer=26 + // https://github.com/ggerganov/llama.cpp/pull/1588 + n_embd = 3200; + n_ff = 8640; + n_rot = 100; + } else if (strcmp(name, "7B") == 0) { + n_embd = 4096; + n_ff = 11008; + n_rot = 128; + } else if (strcmp(name, "13B") == 0) { + n_embd = 5120; + n_ff = 13824; + n_rot = 128; + } else if (strcmp(name, "30B") == 0) { + n_embd = 6656; + n_ff = 17920; + n_rot = 128; + } else if (strcmp(name, "65B") == 0) { + n_embd = 8192; + n_ff = 22016; + n_rot = 128; + } else { + GGML_ASSERT(false); + } + + model->name = name; + model->ftype = ftype; + model->n_vocab = n_vocab; + model->n_embd = n_embd; + model->n_ff = n_ff; + model->n_rot = n_rot; +} + +bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune, + struct ggml_mulmat_tune_params *params, + struct ggml_task_profile_factory *pf) { + + struct ggml_mulmat_tune_model *model = ¶ms->model; + + memset(tune, 0, sizeof(struct ggml_mulmat_tune)); + + tune->version = GGML_MULMAT_TUNE_VERSION; + tune->n_threads = params->n_threads; + tune->ftype = model->ftype; + + size_t name_len = strlen(model->name); + GGML_ASSERT(name_len > 0); + strncpy(tune->model, model->name, sizeof(tune->model) - 1); + + const enum ggml_type rot_src0_type = GGML_TYPE_F16; + const enum ggml_type src1_type = GGML_TYPE_F32; + + int n_vocab = model->n_vocab; + int n_embd = model->n_embd; + int n_ff = model->n_ff; + int n_rot = model->n_rot; + + enum ggml_type type = ggml_ftype_to_ggml_type(model->ftype); + + GGML_ASSERT(GGML_MULMAT_N_SHAPES >= 6); + tune->n_shapes = GGML_MULMAT_N_SHAPES; + + // Attention layers + tune->shapes[0] = (struct ggml_mulmat_tune_shape){ + .N = n_embd, .K = n_embd, .src0_type = type, .src1_type = src1_type}; + // Feed forward layers + tune->shapes[1] = (struct ggml_mulmat_tune_shape){ + .N = n_embd, .K = n_ff, .src0_type = type, .src1_type = src1_type}; + tune->shapes[2] = (struct ggml_mulmat_tune_shape){ + .N = n_ff, .K = n_embd, .src0_type = type, .src1_type = src1_type}; + tune->shapes[3] = (struct ggml_mulmat_tune_shape){ + .N = n_vocab, .K = n_embd, .src0_type = type, .src1_type = src1_type}; + // RoPE + tune->shapes[4] = (struct ggml_mulmat_tune_shape){ + .N = n_rot, .K = 0, .src0_type = rot_src0_type, .src1_type = src1_type}; + tune->shapes[5] = (struct ggml_mulmat_tune_shape){ + .N = 0, .K = n_rot, .src0_type = rot_src0_type, .src1_type = src1_type}; + + for (int i = 0; i < tune->n_shapes; i++) { + struct ggml_mulmat_tune_shape *shape = &tune->shapes[i]; + shape->n_profiles = ggml_mulmat_get_task_profiles( + pf, shape->src0_type, shape->src1_type, &shape->profiles); + if (shape->n_profiles == 0) { + // allowed for testing. + continue; + } + + shape->m_num = params->m_num; + shape->arr_m = malloc(shape->m_num * sizeof(int)); + for (int j = 0; j < shape->m_num; j++) { + shape->arr_m[j] = 1 << j; + } + + size_t sz = sizeof(struct ggml_mulmat_tune_m) * + (shape->n_profiles * shape->m_num); + shape->items = malloc(sz); + GGML_ASSERT(shape->items); + memset(shape->items, 0, sz); + } + + return true; +} + +void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune) { + for (int i = 0; i < tune->n_shapes; i++) { + struct ggml_mulmat_tune_shape *shape = &tune->shapes[i]; + GGML_ASSERT(shape); + + // arr_m and items can be NULL only when testing. 
+ if (shape->arr_m) { + free(shape->arr_m); + } + if (shape->items) { + free(shape->items); + } + } +} + +static bool ggml_mulmat_tune_write_profiles( + FILE *fp, const struct ggml_task_profile *profiles, int n_profiles) { + int rc; + for (int i = 0; i < n_profiles; i++) { + const struct ggml_task_profile *profile = &profiles[i]; + for (int j = 0; j < 3; j++) { + const struct ggml_task_stage *ts = &profile->stages[j]; + rc = fprintf(fp, "%2d %d %d", ts->backend, ts->parallel ? 1 : 0, + ts->wait ? 1 : 0); + if (rc <= 0) { + return false; + } + if (j < 2) { + rc = fprintf(fp, " "); + if (rc <= 0) { + return false; + } + } + } + rc = fprintf(fp, "\n"); + if (rc <= 0) { + return false; + } + } + + return true; +} + +static bool +ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune, + const char *model, int ftype, int n_threads, + char *errbuf, int errbuf_len) { + + if (tune->version != GGML_MULMAT_TUNE_VERSION) { + snprintf(errbuf, errbuf_len - 1, + "version mismatch, built-in: %d, " + "yours: %d", + GGML_MULMAT_TUNE_VERSION, tune->version); + return false; + } else if (strcmp(model, tune->model) != 0) { + snprintf(errbuf, errbuf_len - 1, + "model mismatch. built-in: %s, yours: %s", model, tune->model); + return false; + } else if (ftype != tune->ftype) { + snprintf(errbuf, errbuf_len - 1, + "ftype mismatch. built-in: %d, yours: %d\n", ftype, + tune->ftype); + return false; + } else if (n_threads != tune->n_threads) { + snprintf(errbuf, errbuf_len - 1, + "n_threads mismatch. run-time: %d, yours: %d\n", n_threads, + tune->n_threads); + return false; + } + + for (int i = 0; i < tune->n_shapes; i++) { + const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i]; + + struct ggml_task_profile *builtin_profiles = NULL; + int n_profiles = ggml_mulmat_get_task_profiles( + NULL, shape->src0_type, shape->src1_type, &builtin_profiles); + + if (n_profiles != shape->n_profiles) { + snprintf(errbuf, errbuf_len - 1, "task profiles mismatch"); + return false; + } + + // TODO: profiles order is relevant, too strict. + size_t sz = sizeof(struct ggml_task_profile) * n_profiles; + if (memcmp(builtin_profiles, shape->profiles, sz) != 0) { + snprintf(errbuf, errbuf_len - 1, "task profiles mismatch"); + + printf("=== built-in profiles:\n"); + ggml_mulmat_tune_write_profiles(stderr, builtin_profiles, + n_profiles); + + printf("=== incoming profiles:\n"); + ggml_mulmat_tune_write_profiles(stderr, shape->profiles, + shape->n_profiles); + return false; + } + } + + return true; +} + +bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune, + const char *model, int ftype, int n_threads) { + char errbuf[128]; + bool ok = ggml_mulmat_tune_validate_internal(tune, model, ftype, n_threads, + errbuf, sizeof(errbuf)); + if (!ok) { + fprintf(stderr, "[mulmat tune] error: %s. 
run bench again.\n", errbuf); + } + + return ok; +} + +bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { + int rc = fscanf(fp, "%d", &tune->version); + if (rc <= 0) { + return false; + } + + if (tune->version != GGML_MULMAT_TUNE_VERSION) { + fprintf(stderr, "[mulmat tune] version mismatch, run bench again\n"); + return false; + } + + rc = fscanf(fp, "%s %d %d %d", tune->model, (int *)&tune->ftype, + &tune->n_shapes, &tune->n_threads); + if (rc <= 0) { + return false; + } + + for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { + struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape]; + + rc = fscanf(fp, "%d %d %d %d %d %d", &shape->N, &shape->K, + (int *)&shape->src0_type, (int *)&shape->src1_type, + &shape->n_profiles, &shape->m_num); + if (rc <= 0) { + return false; + } + + { + size_t item_size = sizeof(struct ggml_mulmat_tune_m) * + (shape->n_profiles * shape->m_num); + shape->items = malloc(item_size); + if (shape->items == NULL) { + fprintf(stderr, "[mulmat tune] failed to allocate memory\n"); + return false; + } + memset(shape->items, 0, item_size); + } + + { + size_t sz = sizeof(struct ggml_task_profile) * shape->n_profiles; + shape->profiles = malloc(sz); + GGML_ASSERT(shape->profiles); + memset(shape->profiles, 0, sz); + } + + for (int ip = 0; ip < shape->n_profiles; ip++) { + struct ggml_task_profile *profile = &shape->profiles[ip]; + for (int j = 0; j < 3; j++) { + struct ggml_task_stage *ts = &profile->stages[j]; + int backend; + int parallel; + int wait; + rc = fscanf(fp, "%d %d %d", &backend, ¶llel, &wait); + if (rc <= 0) { + return false; + } + ts->backend = (enum ggml_task_backend)backend; + ts->parallel = parallel ? true : false; + ts->wait = wait ? true : false; + } + } + + for (int i_m = 0; i_m < shape->m_num; i_m++) { + int M; + for (int ip = 0; ip < shape->n_profiles; ip++) { + if (ip == 0) { + rc = fscanf(fp, "%d", &M); + if (rc <= 0) { + return false; + } + } + struct ggml_mulmat_tune_m *item = + &shape->items[ip * shape->m_num + i_m]; + item->M = M; + rc = fscanf(fp, "%d %d %d", &item->stages_time[0], + &item->stages_time[1], &item->stages_time[2]); + if (rc <= 0) { + return false; + } + } + } + } + + return true; +} + +bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, + FILE *fp) { + int rc; + rc = fprintf(fp, "%d %s %d %d %d\n\n", tune->version, tune->model, + tune->ftype, tune->n_shapes, tune->n_threads); + if (rc <= 0) { + return false; + } + + for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { + if (i_shape > 0) { + fprintf(fp, "\n"); + } + const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape]; + rc = fprintf(fp, "%d %d %d %d %d %d\n", shape->N, shape->K, + shape->src0_type, shape->src1_type, shape->n_profiles, + shape->m_num); + if (rc <= 0) { + return false; + } + + rc = ggml_mulmat_tune_write_profiles(fp, shape->profiles, + shape->n_profiles); + if (rc <= 0) { + return false; + } + + for (int i_m = 0; i_m < shape->m_num; i_m++) { + for (int ip = 0; ip < shape->n_profiles; ip++) { + struct ggml_mulmat_tune_m *item = + &shape->items[ip * shape->m_num + i_m]; + if (ip == 0) { + rc = fprintf(fp, "%4d", item->M); + if (rc <= 0) { + return false; + } + } + + struct ggml_task_profile *profile = &shape->profiles[ip]; + for (int k = 0; k < 3; k++) { + if (profile->stages[k].backend != GGML_TASK_BACKEND_NONE) { + rc = fprintf(fp, "%9d", item->stages_time[k]); + if (rc <= 0) { + return false; + } + } else { + rc = fprintf(fp, " 0"); + if (rc <= 0) { + return false; + } + } + } + } + rc 
= fprintf(fp, "\n"); + if (rc <= 0) { + return false; + } + } + } + + return true; +} + +const struct ggml_mulmat_tune_shape * +ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, const int N, + const int K, enum ggml_type src0_type, + enum ggml_type src1_type) { + GGML_ASSERT(N > 0 && K > 0); + + for (int i = 0; i < tune->n_shapes; i++) { + const struct ggml_mulmat_tune_shape *s = &tune->shapes[i]; + if (s->src0_type != src0_type || s->src1_type != src1_type) { + continue; + } + + if (s->N > 0 && s->K > 0) { + if (s->N == N && s->K == K) { + return s; + } + } else if (s->N > 0 && s->K == 0) { + if (s->N == N) { + return s; + } + } else if (s->N == 0 && s->K > 0) { + if (s->K == K) { + return s; + } + } + } + + return NULL; +} + +// This is the experimental reference implementation. +// Requires both n_threads are same at bench time and runtime. +void ggml_mulmat_tune_estimate_time( + const struct ggml_mulmat_tune_shape *shape, int M, + struct ggml_mulmat_tune_time *profile_time) { + + GGML_ASSERT(shape); + GGML_ASSERT(profile_time); + + const int m_num = shape->m_num; + const int min_m = shape->items[0].M; + const int max_m = shape->items[m_num - 1].M; + + for (int ip = 0; ip < shape->n_profiles; ip++) { + struct ggml_task_profile *profile = &shape->profiles[ip]; + profile_time[ip].total_time = 0; + profile_time[ip].profile = profile; + + const int items_offset = ip * m_num; + + struct ggml_mulmat_tune_m *p0 = NULL; + struct ggml_mulmat_tune_m *p1 = NULL; + if (M < min_m) { + // first two. + p0 = &shape->items[items_offset]; + p1 = &shape->items[items_offset + 1]; + } else if (M > max_m) { + // last two + p0 = &shape->items[items_offset + m_num - 2]; + p1 = &shape->items[items_offset + m_num - 1]; + } else { + for (int i = 0; i < m_num; i++) { + p1 = &shape->items[items_offset + i]; + if (p1->M == M) { + p0 = p1; + break; + } + + if (i > 0) { + p0 = (struct ggml_mulmat_tune_m *)(p1 - 1); + if (M > p0->M && M < p1->M) { + break; + } + } + } + } + + GGML_ASSERT(p0 && p1); + + for (int i_stage = 0; i_stage < 3; i_stage++) { + struct ggml_task_stage *stage = &profile->stages[i_stage]; + if (stage->backend == GGML_TASK_BACKEND_NONE) { + continue; + } + + int p0_v = p0->stages_time[i_stage]; + int p1_v = p1->stages_time[i_stage]; + + GGML_ASSERT(p0_v >= 0); + GGML_ASSERT(p1_v >= 0); + + // t = aM + b + double a; + double b; + + if (p0 == p1) { + a = 0.0; + b = p1_v; + } else { + a = 1.0 * (p1_v - p0_v) / (p1->M - p0->M); + b = p1_v - a * p1->M; + } + int t = (int)(a * M + b); + + profile_time[ip].stage_time[i_stage] = t; + profile_time[ip].total_time += t; + } + } +} + +// Experimental: create mul_mat tensor. +static struct ggml_tensor *ggml_mulmat_new_tensor(int M, int N, int K, + enum ggml_type src0_type, + struct ggml_context **ctx) { + // At most 256, because in `ggml_quantize_qx_x`, the index type of hist is + // either int8_t or uint8_t. + // Use 1024 to avoid suddenly broken. 
+ int64_t hist[1024]; + + bool src0_is_quantized = ggml_is_quantized(src0_type); + + size_t ctx_size = 0; + ctx_size += (size_t)(M * N * ggml_type_sizef(GGML_TYPE_F32)); // src1 + ctx_size += (size_t)(N * K * ggml_type_sizef(src0_type)); // src0 + ctx_size += (size_t)(1024 * 1024 * 64); // experimental + + if (src0_is_quantized) { + // quantize F32 to Qx_x + ctx_size += (size_t)(N * K * ggml_type_sizef(GGML_TYPE_F32)); + } + + struct ggml_init_params init_params = { + .mem_size = ctx_size, + .mem_buffer = NULL, + .no_alloc = 0, + }; + + *ctx = ggml_init(init_params); + GGML_ASSERT(*ctx); + + // src0: N x K + struct ggml_tensor *src0 = + ggml_new_tensor_2d(*ctx, src0_type, (int64_t)K, (int64_t)N); + + // src1: M x K + struct ggml_tensor *src1 = + ggml_new_tensor_2d(*ctx, GGML_TYPE_F32, (int64_t)K, (int64_t)M); + ggml_set_f32(src1, 0.5f); + + if (src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_F16) { + ggml_set_f32(src0, 0.1f); + } else if (src0_is_quantized) { + struct ggml_tensor *src0_f32 = + ggml_new_tensor_2d(*ctx, GGML_TYPE_F32, (int64_t)K, (int64_t)N); + ggml_set_f32(src0_f32, 0.1f); + + switch (src0_type) { + case GGML_TYPE_Q4_0: + ggml_quantize_q4_0((const float *)src0_f32->data, src0->data, N * K, + K, hist); + break; + case GGML_TYPE_Q4_1: + ggml_quantize_q4_1((const float *)src0_f32->data, src0->data, N * K, + K, hist); + break; + case GGML_TYPE_Q5_0: + ggml_quantize_q5_0((const float *)src0_f32->data, src0->data, N * K, + K, hist); + break; + case GGML_TYPE_Q5_1: + ggml_quantize_q5_1((const float *)src0_f32->data, src0->data, N * K, + K, hist); + break; + case GGML_TYPE_Q8_0: + ggml_quantize_q8_0((const float *)src0_f32->data, src0->data, N * K, + K, hist); + break; + default: + GGML_ASSERT(false); + } + } else { + GGML_ASSERT(false); + } + + // node: M x N + // Will compute z = y * xT, z: node, y: src1, x: src0 + return ggml_mul_mat(*ctx, src0, src1); +} + +// Experimental: allocate memory for wdata with max possible size. +// This part of code is actually belongs to ggml compute graph. +static size_t ggml_mulmat_allocate_wdata(int N, int K, char **wdata) { + // The size is actually determined by cgraph before computing. + // Apart from the src0_type, wsize is affected by backend, cache line size, + // n_threads etc. 
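+    // We simply over-allocate here: room for a full f32 dequantization of
+    // src0 (N*K floats) plus 1 MiB of slack.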
+ + const size_t extra = 1024 * 1024; + size_t sz = (size_t)(N * K * ggml_type_sizef(GGML_TYPE_F32)) + extra; + void *buf = malloc(sz); + + if (!buf) { + fprintf(stderr, + "[mulmat tune] error: failed to allocate %zu MiB memory", + sz / 1024 / 1024); + return 0; + } + + memset(buf, 0, sz); + *wdata = buf; + return sz; +} + +int ggml_mulmat_tune_get_builtin_task_backends( + enum ggml_task_backend *backends) { + int i = 0; + backends[i++] = GGML_TASK_BACKEND_CPU; + + if (ggml_cpu_has_cpublas()) { + backends[i++] = GGML_TASK_BACKEND_CPU_BLAS; + } + + if (ggml_cpu_has_cublas()) { + backends[i++] = GGML_TASK_BACKEND_GPU_CUDA; + } else if (ggml_cpu_has_clblast()) { + backends[i++] = GGML_TASK_BACKEND_GPU_CL; + } + return i; +} + +bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, + struct ggml_mulmat_tune_params *params) { + GGML_ASSERT(tune); + GGML_ASSERT(params); + GGML_ASSERT(params->model.name); + + enum ggml_task_backend backends[16]; + int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends); + if (n_backends < 2) { + fprintf(stderr, + "[mulmat tune] error: this program was not built with BLAS.\n"); + return false; + } + + bool ok = ggml_mulmat_tune_init(tune, params, NULL); + if (!ok) { + return false; + } + + { + char buf[128] = {0}; + int offset = 0; + + for (int i = 0; i < n_backends; i++) { + if (i > 0) { + buf[offset++] = ','; + buf[offset++] = ' '; + } + const char *name = ggml_mulmat_tune_task_backend_name(backends[i]); + size_t len = strlen(name); + memcpy(&buf[offset], name, len); + offset += (int)len; + } + + fprintf(stdout, + "[mulmat tune] model: %s, ggml ftype: %d, " + "n_pass: %d, n_threads: %d, n_shapes: %d, backends: %s\n", + params->model.name, params->model.ftype, params->n_pass, + params->n_threads, tune->n_shapes, buf); + } + + int64_t stages_time[3]; + int64_t t0 = ggml_time_ms(); + + struct ggml_threading_context *thrd_ctx = ggml_threading_start( + tune->n_threads, ggml_threading_graph_compute_thread, + ggml_compute_forward_wrapper, GGML_THREADING_FEATURE_WAIT_ON_DONE, + stages_time); + + for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { + const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape]; + int M; + int N = shape->N; + int K = shape->K; + + char buf[20] = {0}; + int buf_len = sizeof(buf) - 1; + int line_len = 0; + + for (int i_m = 0; i_m < shape->m_num; i_m++) { + M = shape->arr_m[i_m]; + if (shape->N == 0) { + N = M; + } else if (shape->K == 0) { + K = M; + } + + if (params->progress) { + line_len = snprintf(buf, buf_len, "%d %d %d ", N, K, M); + fprintf(stdout, "%s", buf); + fflush(stdout); + } + + char *wdata = NULL; + size_t wsize = ggml_mulmat_allocate_wdata(N, K, &wdata); + if (wsize == 0) { + return false; + } + + struct ggml_context *ctx = NULL; + struct ggml_tensor *node = + ggml_mulmat_new_tensor(M, N, K, shape->src0_type, &ctx); + + for (int ip = 0; ip < shape->n_profiles; ip++) { + const struct ggml_task_profile *profile = &shape->profiles[ip]; + + memcpy(&node->task_profile, profile, + sizeof(struct ggml_task_profile)); + + struct ggml_mulmat_tune_m *item = + &shape->items[ip * shape->m_num + i_m]; + item->M = M; + + int min[3] = {INT32_MAX, INT32_MAX, INT32_MAX}; + + for (int k = 0; k < params->n_pass; k++) { + for (int j = 0; j < 3; j++) { + stages_time[j] = 0; + } + + /*enum ggml_compute_error err = */ + ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize); + + for (int i = 0; i < 3; i++) { + int v = (int)stages_time[i]; + if (v < min[i]) { + min[i] = v; + } + } + + if (params->progress) { + 
fprintf(stdout, "."); + fflush(stdout); + line_len++; + } + } + for (int i = 0; i < 3; i++) { + item->stages_time[i] = min[i]; + } + } + + ggml_free(ctx); + free(wdata); + + if (params->progress) { + line_len += 10; + for (int j = 0; j < line_len; j++) { + fprintf(stdout, "\b \b"); + } + fflush(stdout); + } + } + } + + ggml_threading_stop(thrd_ctx); + + fprintf(stdout, "[mulmat tune] done, elapsed time: %d seconds.\n", + (int)(ggml_time_ms() - t0) / 1000); + + // output + + if (params->fname && strcmp(params->fname, "") != 0) { + FILE *fp = fopen(params->fname, "w"); + if (!fp) { + fprintf(stderr, + "[mulmat tune] warn: failed to open file `%s`, print to " + "console instead\n\n", + params->fname); + params->output_console = 1; + } else { + ok = ggml_mulmat_tune_write_data(tune, fp); + fclose(fp); + + if (ok) { + fprintf(stdout, "[mulmat tune] data was written to `%s`\n", + params->fname); + } else { + fprintf( + stderr, + "[mulmat tune] warn: failed to write file `%s`, print to " + "console instead\n\n", + params->fname); + params->output_console = 1; + } + } + } + + if (params->output_console) { + return ggml_mulmat_tune_write_data(tune, stdout); + } + + return true; +} diff --git a/ggml-tune.h b/ggml-tune.h new file mode 100644 index 0000000000000..404f1f1c4a53f --- /dev/null +++ b/ggml-tune.h @@ -0,0 +1,137 @@ +#pragma once + +#include +#include +#include + +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_MULMAT_TUNE_VERSION 8 +#define GGML_MULMAT_N_SHAPES 6 + +#define GGML_MULMAT_MAX_PASS 3 + +struct ggml_mulmat_tune_m { + int M; + + int stages_time[3]; +}; + +struct ggml_mulmat_tune_model { + const char *name; + + enum ggml_ftype ftype; + + int n_vocab; + + int n_embd; + + // n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult + int n_ff; + + // n_rot = n_embd/n_head; + int n_rot; +}; + +struct ggml_mulmat_tune_shape { + // For RoPE, one of N / K is 0. + int N; + int K; + + enum ggml_type src0_type; + enum ggml_type src1_type; + + int n_profiles; + struct ggml_task_profile *profiles; + + int m_num; + int *arr_m; + + struct ggml_mulmat_tune_m *items; +}; + +struct ggml_mulmat_tune { + int version; + + char model[16]; + + enum ggml_ftype ftype; + + int n_shapes; + // Given N/K, we bench for mul_mat [M,K] x [K,N]. + struct ggml_mulmat_tune_shape shapes[GGML_MULMAT_N_SHAPES]; + + int n_threads; +}; + +struct ggml_mulmat_tune_time { + struct ggml_task_profile *profile; + int stage_time[3]; + int total_time; +}; + +struct mm_cache_element { + int M; + int N; + int K; + struct ggml_task_profile *profile; + int stages_time[3]; +}; + +// params for tune/bench. +struct ggml_mulmat_tune_params { + struct ggml_mulmat_tune_model model; + int m_num; + int n_pass; + int n_threads; + bool progress; // print and clear '.' + bool output_console; // also print result to console + const char *fname; +}; + +// NOTE: stages_time is filled if not null. 
+const struct ggml_task_profile * +ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M, + int N, int K, enum ggml_type src0_t, + enum ggml_type src1_t, int stages_time[3]); + +bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune, + const char *model_name, int ftype, + int n_threads); + +void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model, + const char *name, enum ggml_ftype ftype); + +bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune, + struct ggml_mulmat_tune_params *params, + struct ggml_task_profile_factory *profile_factory); + +void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune); + +bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, FILE *fp); + +bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp); + +const struct ggml_mulmat_tune_shape * +ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, int N, int K, + enum ggml_type src0_type, enum ggml_type src1_type); + +void ggml_mulmat_tune_estimate_time(const struct ggml_mulmat_tune_shape *shape, + int M, + struct ggml_mulmat_tune_time *profile_time); + +const char *ggml_task_backend_name(enum ggml_task_backend backend); + +int ggml_mulmat_tune_get_builtin_task_backends( + enum ggml_task_backend *backends); + +bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, + struct ggml_mulmat_tune_params *params); + +#ifdef __cplusplus +} +#endif diff --git a/ggml.c b/ggml.c index 78c3653543c88..5d0b83b1de198 100644 --- a/ggml.c +++ b/ggml.c @@ -61,26 +61,6 @@ static LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) { return atomic_fetch_add(ptr, -(dec)); } -typedef HANDLE pthread_t; - -typedef DWORD thread_ret_t; -static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) { - (void) unused; - HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); - if (handle == NULL) - { - return EAGAIN; - } - - *out = handle; - return 0; -} - -static int pthread_join(pthread_t thread, void* unused) { - (void) unused; - return (int) WaitForSingleObject(thread, INFINITE); -} - static int sched_yield (void) { Sleep (0); return 0; @@ -88,8 +68,6 @@ static int sched_yield (void) { #else #include #include - -typedef void* thread_ret_t; #endif // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 @@ -166,6 +144,12 @@ inline static void* ggml_aligned_malloc(size_t size) { #include "ggml-opencl.h" #endif +#if defined(GGML_USE_MULMAT_TUNE) + #include "ggml-tune.h" +#endif + +#include "ggml-threading.h" + #undef MIN #undef MAX #define MIN(a, b) ((a) < (b) ? 
(a) : (b)) @@ -4059,6 +4043,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { ggml_cl_init(); #endif + ggml_mulmat_init_task_profiles(); + is_first_call = false; } @@ -4302,7 +4288,7 @@ struct ggml_tensor * ggml_new_tensor_impl( /*.src0 =*/ NULL, /*.src1 =*/ NULL, /*.opt =*/ { NULL }, - /*.n_tasks =*/ 0, + /*.task_profile =*/ { 0 }, /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, /*.perf_time_us =*/ 0, @@ -8516,14 +8502,19 @@ static void ggml_compute_forward_mul_f32( const int ith = params->ith; const int nth = params->nth; + enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; + if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { #ifdef GGML_USE_CLBLAST - if (src1->backend == GGML_BACKEND_GPU) { - if (ith == 0) { - ggml_cl_mul(src0, src1, dst); + if (src1->backend == GGML_BACKEND_GPU) { + if (ith == 0) { + ggml_cl_mul(src0, src1, dst); + } + return; } - return; - } +#else + GGML_ASSERT(false); #endif + }; const int64_t nr = ggml_nrows(src0); @@ -9950,36 +9941,6 @@ static void ggml_compute_forward_rms_norm_back( } -// ggml_compute_forward_mul_mat - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) -// helper function to determine if it is better to use BLAS or not -// for large matrices, BLAS is faster -static bool ggml_compute_forward_mul_mat_use_blas( - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - //const int64_t ne00 = src0->ne[0]; - //const int64_t ne01 = src0->ne[1]; - - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - - // TODO: find the optimal values for these - if (ggml_is_contiguous(src0) && - ggml_is_contiguous(src1) && - (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { - - /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ - return true; - } - - return false; -} -#endif - static void ggml_compute_forward_mul_mat_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -10050,28 +10011,25 @@ static void ggml_compute_forward_mul_mat_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows + enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; + + if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { #if defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - } + GGML_ASSERT(params->nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); return; - } +#else + GGML_ASSERT(false); #endif + } -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { - if (params->ith != 0) { - return; - } - - if (params->type == GGML_TASK_INIT) { - return; - } + GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); - if (params->type == GGML_TASK_FINALIZE) { - return; - } + if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + GGML_ASSERT(params->nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { @@ -10089,16 +10047,13 @@ static void ggml_compute_forward_mul_mat_f32( //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); return; - } +#else + GGML_ASSERT(false); 
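+        // a CPU_BLAS task profile must never be selected in a build without BLAS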
#endif - - if (params->type == GGML_TASK_INIT) { - return; } - if (params->type == GGML_TASK_FINALIZE) { - return; - } + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + GGML_ASSERT(comp_backend == GGML_TASK_BACKEND_CPU); // parallelize by src0 rows using ggml_vec_dot_f32 @@ -10215,30 +10170,26 @@ static void ggml_compute_forward_mul_mat_f16_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows + enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; + + if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { #if defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - } + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); return; - } +#else + GGML_ASSERT(false); #endif + } + + enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend; + GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); + if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { GGML_ASSERT(nb10 == sizeof(float)); - - if (params->ith != 0) { - return; - } - - if (params->type == GGML_TASK_INIT) { - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } + GGML_ASSERT(params->nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { @@ -10271,8 +10222,14 @@ static void ggml_compute_forward_mul_mat_f16_f32( /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/ return; - } +#else + GGML_ASSERT(false); #endif + } + + GGML_ASSERT(params->type == GGML_TASK_INIT || params->type == GGML_TASK_COMPUTE); + GGML_ASSERT(init_backend == GGML_TASK_BACKEND_CPU); + GGML_ASSERT(comp_backend == GGML_TASK_BACKEND_CPU); if (params->type == GGML_TASK_INIT) { ggml_fp16_t * const wdata = params->wdata; @@ -10293,9 +10250,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( return; } - if (params->type == GGML_TASK_FINALIZE) { - return; - } + GGML_ASSERT (params->type == GGML_TASK_COMPUTE); // fp16 -> half the size, so divide by 2 // TODO: do not support transposed src1 @@ -10420,50 +10375,62 @@ static void ggml_compute_forward_mul_mat_q_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows + enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; + + if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { #if defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - } + GGML_ASSERT(params->nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); return; - } +#else + GGML_ASSERT(false); #endif + } + + enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend; + GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); + if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { - if (params->ith != 0) { - return; - } + GGML_ASSERT (init_backend == GGML_TASK_BACKEND_CPU); + GGML_ASSERT(params->type == 
GGML_TASK_INIT || params->type == GGML_TASK_COMPUTE); + GGML_ASSERT(src0->data); + GGML_ASSERT(params->wdata); - if (params->type == GGML_TASK_INIT) { - return; - } + float * const wdata = params->wdata; + dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT) { + // rows per thread + const int dr = (ne01 + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + int ir1 = MIN(ir0 + dr, ne01); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + char * data0_offset = (char *) src0->data + i03*nb03 + i02*nb02; + float * wdata_offset = wdata + i03*ne03 + i02*ne02; + for (int64_t i = ir0; i < ir1; ++i) { + dequantize_row_q(data0_offset + i*nb01, wdata_offset + i*ne00, ne00); + } + } + } return; } - float * const wdata = params->wdata; - dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; + GGML_ASSERT(nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - { - size_t id = 0; - for (int64_t i01 = 0; i01 < ne01; ++i01) { - dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00); - id += ne00; - } - - assert(id*sizeof(float) <= params->wsize); - } - + // zT = y * xT const float * x = wdata; - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, @@ -10472,13 +10439,19 @@ static void ggml_compute_forward_mul_mat_q_f32( } } - //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); - return; - } +#else + GGML_ASSERT(false); #endif + } + + GGML_ASSERT(params->type == GGML_TASK_INIT || params->type == GGML_TASK_COMPUTE); + GGML_ASSERT(init_backend == GGML_TASK_BACKEND_CPU); + GGML_ASSERT(comp_backend == GGML_TASK_BACKEND_CPU); if (params->type == GGML_TASK_INIT) { + GGML_ASSERT(params->nth == 1); + char * wdata = params->wdata; const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; @@ -10490,13 +10463,10 @@ static void ggml_compute_forward_mul_mat_q_f32( } } } - return; } - if (params->type == GGML_TASK_FINALIZE) { - return; - } + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); // parallelize by src0 rows using ggml_vec_dot_q @@ -14324,20 +14294,31 @@ static void ggml_compute_forward_cross_entropy_loss_back( } } - ///////////////////////////////// -static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { +static enum ggml_compute_error ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { GGML_ASSERT(params); -#ifdef GGML_USE_CUBLAS - bool skip_cpu = ggml_cuda_compute_forward(params, tensor); - if (skip_cpu) { - return; + enum ggml_task_backend comp_backend = tensor->task_profile.stages[GGML_TASK_COMPUTE].backend; + + if (comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { +#if defined(GGML_USE_CUBLAS) + bool skip_cpu = ggml_cuda_compute_forward(params, tensor); + if (skip_cpu) { + return GGML_COMPUTE_OK; + } + GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU); + return GGML_COMPUTE_FALLBACK; +#else + GGML_ASSERT(false); +#endif } - GGML_ASSERT(tensor->src0->backend == 
GGML_BACKEND_CPU); - GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU); -#endif // GGML_USE_CUBLAS + + // if (tensor->task_profile.stages[params->type].backend > GGML_TASK_BACKEND_CPU) { + // printf("mulmat: test fallback\n"); + // return GGML_COMPUTE_FALLBACK; + // } switch (tensor->op) { case GGML_OP_DUP: @@ -14585,6 +14566,15 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm GGML_ASSERT(false); } break; } + + return GGML_COMPUTE_OK; +} + +enum ggml_compute_error ggml_compute_forward_wrapper(struct ggml_compute_params *params, + struct ggml_tensor *tensor) { + // We call ggml_compute_forward because the CUDA mul_mat entry point + // was moved out of `ggml_compute_forward_mul_mat`. + return ggml_compute_forward(params, tensor); } //////////////////////////////////////////////////////////////////////////////// @@ -15480,6 +15470,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { /*.n_nodes =*/ 0, /*.n_leafs =*/ 0, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, + /*.tune =*/ NULL, /*.work_size =*/ 0, /*.work =*/ NULL, /*.nodes =*/ { NULL }, @@ -15533,175 +15524,288 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg return result; } -// -// thread data -// -// synchronization is done via busy loops -// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops -// +// ---- mulmat task profiles ---- -#ifdef __APPLE__ +static struct ggml_task_profile_factory default_task_profile_factory = {0}; -//#include -// -//typedef os_unfair_lock ggml_lock_t; -// -//#define ggml_lock_init(x) UNUSED(x) -//#define ggml_lock_destroy(x) UNUSED(x) -//#define ggml_lock_lock os_unfair_lock_lock -//#define ggml_lock_unlock os_unfair_lock_unlock -// -//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT +// TODO: thread unsafe. Should be initialized once. 
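+// For each src0/src1 type class, the entry at index 0 is the plain CPU
+// profile; BLAS and GPU profiles are appended only when the corresponding
+// backend is compiled in, so the number of profiles varies per build.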
+void ggml_mulmat_init_task_profiles(void) { + const size_t sz = sizeof(struct ggml_task_profile_factory); + memset(&default_task_profile_factory, 0, sz); -typedef int ggml_lock_t; + // f32 + { + struct ggml_task_profile *p = default_task_profile_factory.f32_f32; + int i = 0; -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) -#define ggml_lock_lock(x) UNUSED(x) -#define ggml_lock_unlock(x) UNUSED(x) + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].parallel = true; + i++; -#define GGML_LOCK_INITIALIZER 0 +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; + p[i].stages[1].wait = true; + i++; +#endif -typedef pthread_t ggml_thread_t; +#if defined(GGML_USE_CUBLAS) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; + p[i].stages[1].wait = true; + i++; +#elif defined(GGML_USE_CLBLAST) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; + p[i].stages[1].wait = true; + i++; +#endif + default_task_profile_factory.n_f32_f32 = i; + } -#define ggml_thread_create pthread_create -#define ggml_thread_join pthread_join + // f16 + { + struct ggml_task_profile *p = default_task_profile_factory.f16_f32; + int i = 0; -#else + p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].parallel = true; + i++; -//typedef pthread_spinlock_t ggml_lock_t; +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; + p[i].stages[1].wait = true; + i++; +#endif + +#if defined(GGML_USE_CUBLAS) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; + p[i].stages[1].wait = true; + i++; +#elif defined(GGML_USE_CLBLAST) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; + p[i].stages[1].wait = true; + i++; +#endif + default_task_profile_factory.n_f16_f32 = i; + } -//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE) -//#define ggml_lock_destroy pthread_spin_destroy -//#define ggml_lock_lock pthread_spin_lock -//#define ggml_lock_unlock pthread_spin_unlock + // qxx + { + struct ggml_task_profile *p = default_task_profile_factory.qxx_f32; + int i = 0; -typedef int ggml_lock_t; + p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].parallel = true; + i++; -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) -#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) -#define ggml_lock_lock(x) _mm_pause() -#else -#define ggml_lock_lock(x) UNUSED(x) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[0].parallel = true; + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; + p[i].stages[1].wait = true; + i++; +#endif + +#if defined(GGML_USE_CUBLAS) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; + p[i].stages[1].wait = true; + i++; +#elif defined(GGML_USE_CLBLAST) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; + p[i].stages[1].wait = true; + i++; #endif -#define ggml_lock_unlock(x) UNUSED(x) + default_task_profile_factory.n_qxx_f32 = i; + } +} -#define GGML_LOCK_INITIALIZER 0 +int ggml_mulmat_get_task_profiles(struct ggml_task_profile_factory *pf, + enum ggml_type src0_t, enum ggml_type src1_t, + struct ggml_task_profile **profiles) { + GGML_ASSERT(profiles); -typedef pthread_t ggml_thread_t; + if (pf == NULL) { + pf = &default_task_profile_factory; + } -#define ggml_thread_create pthread_create 
-#define ggml_thread_join pthread_join + GGML_ASSERT(src1_t == GGML_TYPE_F32); -#endif + if (src0_t == GGML_TYPE_F32) { + *profiles = pf->f32_f32; + return pf->n_f32_f32; + } -struct ggml_compute_state_shared { - ggml_lock_t spin; + if (src0_t == GGML_TYPE_F16) { + *profiles = pf->f16_f32; + return pf->n_f16_f32; + } - int n_threads; + if (ggml_is_quantized(src0_t)) { + *profiles = pf->qxx_f32; + return pf->n_qxx_f32; + } - // synchronization primitives - atomic_int n_ready; - atomic_bool has_work; - atomic_bool stop; // stop all threads -}; + GGML_ASSERT(false); +} + +static const struct ggml_task_profile * +ggml_mulmat_get_default_task_profile(struct ggml_task_profile_factory *pf, + enum ggml_type src0_type, + enum ggml_type src1_type) { + GGML_ASSERT(src1_type == GGML_TYPE_F32); + if (pf == NULL) { + pf = &default_task_profile_factory; + } -struct ggml_compute_state { - ggml_thread_t thrd; + struct ggml_task_profile *p = NULL; - struct ggml_compute_params params; - struct ggml_tensor * node; + if (src0_type == GGML_TYPE_F32) { + p = &pf->f32_f32[0]; + } else if (src0_type == GGML_TYPE_F16) { + p = &pf->f16_f32[0]; + } else if (ggml_is_quantized(src0_type)) { + p = &pf->qxx_f32[0]; + } else { + GGML_ASSERT(false); + } - struct ggml_compute_state_shared * shared; -}; + for (int i = 0; i < 3; i++) { + GGML_ASSERT(p->stages[i].backend == GGML_TASK_BACKEND_CPU || + p->stages[i].backend == GGML_TASK_BACKEND_NONE); + } -static thread_ret_t ggml_graph_compute_thread(void * data) { - struct ggml_compute_state * state = (struct ggml_compute_state *) data; + return p; +} - const int n_threads = state->shared->n_threads; +// Set task profile for GGML_OP_MUL_MAT or GGML_OP_OUT_PROD. +static void ggml_mulmat_set_tensor_task_profile(struct ggml_tensor *node, + struct ggml_mulmat_tune *tune) { + GGML_ASSERT(node); + GGML_ASSERT(node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_OUT_PROD); - while (true) { - if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) { - atomic_store(&state->shared->has_work, false); - } else { - while (atomic_load(&state->shared->has_work)) { - if (atomic_load(&state->shared->stop)) { - return 0; - } - ggml_lock_lock (&state->shared->spin); - ggml_lock_unlock(&state->shared->spin); - } - } + enum ggml_type src0_t = node->src0->type; + enum ggml_type src1_t = node->src1->type; + + // Type and memory layout requirements for computing mul_mat with BLAS. + bool cond_match = (src0_t == GGML_TYPE_F32 || src0_t == GGML_TYPE_F16 || + ggml_is_quantized(src0_t)) && + src1_t == GGML_TYPE_F32 && node->type == GGML_TYPE_F32 && + ggml_is_contiguous(node->src0) && + ggml_is_contiguous(node->src1); + + int M = (int)node->ne[1]; + int N = (int)node->ne[0]; + int K = (int)node->src1->ne[0]; + + struct ggml_task_profile *profiles = NULL; + int n_profiles = ggml_mulmat_get_task_profiles(NULL, src0_t, src1_t, &profiles); + GGML_ASSERT(n_profiles >= 2); + GGML_ASSERT(profiles); - atomic_fetch_sub(&state->shared->n_ready, 1); + const struct ggml_task_profile *prof = NULL; - // wait for work - while (!atomic_load(&state->shared->has_work)) { - if (atomic_load(&state->shared->stop)) { - return 0; + if (cond_match) { +#if defined(GGML_USE_MULMAT_TUNE) + if (tune != NULL) { + int stages_time_us[3]; + prof = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, src1_t, stages_time_us); + if (prof != NULL) { + GGML_ASSERT(prof); + memcpy(&node->task_profile, prof, sizeof(struct ggml_task_profile)); + // Do not wait if the estimated execution time is too small (e.g. 
less than 0.1 ms) + // TODO: need bench actual wait/notify time, see ggml-threading.c + for (int i = 0; i < 3; i++) { + if (node->task_profile.stages[i].wait) { + if (stages_time_us[i] < 100) { + node->task_profile.stages[i].wait = false; + } + } + } + return; } - ggml_lock_lock (&state->shared->spin); - ggml_lock_unlock(&state->shared->spin); } +#else + UNUSED(tune); +#endif - // check if we should stop - if (atomic_load(&state->shared->stop)) { - break; - } + if (prof == NULL && M >= 32 && N >= 32 && K >= 32) { + for (int j = 0; j < n_profiles; j++) { + enum ggml_task_backend comp_be = + profiles[j].stages[GGML_TASK_COMPUTE].backend; - if (state->node) { - if (state->params.ith < state->params.nth) { - ggml_compute_forward(&state->params, state->node); + switch (comp_be) { + case GGML_TASK_BACKEND_GPU_CUDA: { + GGML_ASSERT(ggml_cpu_has_cublas()); + prof = &profiles[j]; + break; + } + case GGML_TASK_BACKEND_GPU_CL: { + GGML_ASSERT(ggml_cpu_has_clblast()); + prof = &profiles[j]; + break; + } + case GGML_TASK_BACKEND_CPU_BLAS: { + GGML_ASSERT(ggml_cpu_has_cpublas()); + prof = &profiles[j]; + break; + } + default: { + break; + } + } } - - state->node = NULL; - } else { - break; } } - return 0; + if (prof == NULL) { + prof = ggml_mulmat_get_default_task_profile(NULL, src0_t, src1_t); + } + + GGML_ASSERT(prof); + memcpy(&node->task_profile, prof, sizeof(struct ggml_task_profile)); } void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { - const int n_threads = cgraph->n_threads; - - struct ggml_compute_state_shared state_shared = { - /*.spin =*/ GGML_LOCK_INITIALIZER, - /*.n_threads =*/ n_threads, - /*.n_ready =*/ 0, - /*.has_work =*/ false, - /*.stop =*/ false, - }; - struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL; - - // create thread pool - if (n_threads > 1) { - ggml_lock_init(&state_shared.spin); - - atomic_store(&state_shared.has_work, true); - - for (int j = 0; j < n_threads - 1; j++) { - workers[j] = (struct ggml_compute_state) { - .thrd = 0, - .params = { - .type = GGML_TASK_COMPUTE, - .ith = j + 1, - .nth = n_threads, - .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0, - .wdata = cgraph->work ? cgraph->work->data : NULL, - }, - .node = NULL, - .shared = &state_shared, - }; + int n_threads = cgraph->n_threads; - int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); - GGML_ASSERT(rc == 0); - UNUSED(rc); + if (ggml_cpu_has_blas()) { + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor *node = cgraph->nodes[i]; + + memset(&node->task_profile, 0, sizeof(struct ggml_task_profile)); + struct ggml_task_stage *stages = node->task_profile.stages; + + // Adapt node->backend: assume GPU at COMPUTE stage. 
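+            // Offloaded tensors get a single, non-parallel GPU COMPUTE stage;
+            // INIT/FINALIZE are left empty, and idle workers are hinted to
+            // wait only for MUL and MUL_MAT.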
+ if (node->backend > GGML_BACKEND_CPU) { + stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_NONE; + stages[GGML_TASK_FINALIZE].backend = GGML_TASK_BACKEND_NONE; + + stages[GGML_TASK_COMPUTE].parallel = false; + bool wait = (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL); + stages[GGML_TASK_COMPUTE].wait = wait; + if (ggml_cpu_has_cublas()) { + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_GPU_CUDA; + } else if (ggml_cpu_has_clblast()) { + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_GPU_CL; + } else { + GGML_ASSERT(false); + } + } else if (node->op == GGML_OP_MUL_MAT) { + struct ggml_mulmat_tune * tune = NULL; +#if defined(GGML_USE_MULMAT_TUNE) + tune = cgraph->tune; +#endif + ggml_mulmat_set_tensor_task_profile(node, tune); + } else if (node->op == GGML_OP_OUT_PROD) { + ggml_mulmat_set_tensor_task_profile(node, NULL); + } } } + struct ggml_threading_context *thrd_ctx = ggml_threading_start( + n_threads, ggml_threading_graph_compute_thread, ggml_compute_forward, + GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL); + // initialize tasks + work buffer { size_t work_size = 0; @@ -15709,13 +15813,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) // thread scheduling for the different operations for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; + struct ggml_task_stage *stages = node->task_profile.stages; switch (node->op) { case GGML_OP_CPY: case GGML_OP_DUP: { - node->n_tasks = n_threads; - + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; size_t cur = 0; if (ggml_is_quantized(node->type)) { cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads; @@ -15726,7 +15830,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_ADD: case GGML_OP_ADD1: { - node->n_tasks = n_threads; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; size_t cur = 0; @@ -15738,7 +15843,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_ACC: { - node->n_tasks = n_threads; + stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; size_t cur = 0; @@ -15764,9 +15871,15 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_STEP: case GGML_OP_RELU: { - node->n_tasks = 1; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_MUL: + { + if (stages[GGML_TASK_COMPUTE].backend == GGML_TASK_BACKEND_NONE) { + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; + } + } break; case GGML_OP_GELU: case GGML_OP_SILU: case GGML_OP_SILU_BACK: @@ -15774,66 +15887,65 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM_BACK: { - node->n_tasks = n_threads; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; } break; case GGML_OP_MUL_MAT: case GGML_OP_OUT_PROD: { - node->n_tasks = n_threads; - - // TODO: use different scheduling for different matrix sizes - //const int nr0 = ggml_nrows(node->src0); - //const int nr1 = ggml_nrows(node->src1); - - //node->n_tasks = MIN(n_threads, MAX(1, nr0/128)); - //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, node->n_tasks); - size_t cur = 0; - -#if 
defined(GGML_USE_CUBLAS) - if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning + enum ggml_task_backend comp_backend = stages[GGML_TASK_COMPUTE].backend; + GGML_ASSERT(comp_backend != GGML_TASK_BACKEND_NONE); + + // TODO: remove this check once we are sure `ggml_mulmat_set_tensor_task_profile()` is correct. + if ((comp_backend & GGML_TASK_BACKEND_GPU) || comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { + enum ggml_type src0_t = node->src0->type; + enum ggml_type src1_t = node->src1->type; + bool cond_match = (src0_t == GGML_TYPE_F32 || src0_t == GGML_TYPE_F16 || + ggml_is_quantized(src0_t)) && + src1_t == GGML_TYPE_F32 && node->type == GGML_TYPE_F32 && + ggml_is_contiguous(node->src0) && + ggml_is_contiguous(node->src1); + GGML_ASSERT(cond_match); } - else -#elif defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning + + if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { +#if defined(GGML_USE_CLBLAST) cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); - } - else +#else + GGML_ASSERT(false); #endif - if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning + } else if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { + GGML_ASSERT(ggml_cpu_has_cpublas()); + GGML_ASSERT(node->src1->type == GGML_TYPE_F32); + + if (node->src0->type == GGML_TYPE_F32) { + cur = 0; + } else if (node->src0->type == GGML_TYPE_F16) { // here we need memory just for single 2D matrix from src0 cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); + } else if (ggml_is_quantized(node->src0->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); } else { - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); + GGML_ASSERT(false); } -#else - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); -#endif - } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) { - cur = 0; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; + } else if (comp_backend == GGML_TASK_BACKEND_CPU || comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { + // We have to reseve buffer for CUDA because it may fallback to CPU. 
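+                        // (the fallback path re-runs the node with the default
+                        // CPU profile; see the GGML_COMPUTE_FALLBACK handling
+                        // later in this function)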
+ if (comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { + GGML_ASSERT(ggml_cpu_has_cublas()); } -#endif - } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } else -#endif - { + + GGML_ASSERT(node->src1->type == GGML_TYPE_F32); + + if (node->src0->type == GGML_TYPE_F32) { + cur = 0; + } else if (node->src0->type == GGML_TYPE_F16) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); + } else if (ggml_is_quantized(node->src0->type)) { const enum ggml_type type_q = quantize_fns[node->src0->type].vec_dot_type; cur = GGML_TYPE_SIZE[type_q]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[type_q]; + } else { + GGML_ASSERT(false); } } else { GGML_ASSERT(false); @@ -15843,9 +15955,14 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_SCALE: { - node->n_tasks = n_threads; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; } break; case GGML_OP_SET: + { + stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + } break; case GGML_OP_CONT: case GGML_OP_RESHAPE: case GGML_OP_VIEW: @@ -15856,7 +15973,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_DIAG: case GGML_OP_DIAG_MASK_ZERO: { - node->n_tasks = 1; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: @@ -15864,20 +15981,23 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: { - node->n_tasks = n_threads; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; } break; case GGML_OP_ALIBI: { - node->n_tasks = 1; //TODO + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_CLAMP: { - node->n_tasks = 1; //TODO + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_CONV_1D_1S: case GGML_OP_CONV_1D_2S: { - node->n_tasks = n_threads; + stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; GGML_ASSERT(node->src0->ne[3] == 1); GGML_ASSERT(node->src1->ne[2] == 1); @@ -15906,45 +16026,48 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_FLASH_ATTN: { - node->n_tasks = n_threads; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; size_t cur = 0; const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*ne11*n_threads; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_threads; // this is overestimated by x2 } if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*ne11*n_threads; // TODO: this can become 
(n_tasks-1) + cur += sizeof(float)*ne11*n_threads; // this is overestimated by x2 } work_size = MAX(work_size, cur); } break; case GGML_OP_FLASH_FF: { - node->n_tasks = n_threads; - + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; size_t cur = 0; if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*node->src1->ne[1]*n_threads; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*n_threads; // this is overestimated by x2 } if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*node->src1->ne[1]*n_threads; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*n_threads; // this is overestimated by x2 } work_size = MAX(work_size, cur); } break; case GGML_OP_FLASH_ATTN_BACK: { - node->n_tasks = n_threads; + stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; size_t cur = 0; @@ -15952,13 +16075,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*mxDn*n_threads; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_threads; // this is overestimated by x2 } if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*mxDn*n_threads; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_threads; // this is overestimated by x2 } work_size = MAX(work_size, cur); @@ -15966,32 +16089,38 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: { - node->n_tasks = 1; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_CROSS_ENTROPY_LOSS: { - node->n_tasks = n_threads; + stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; + stages[GGML_TASK_FINALIZE].backend = GGML_TASK_BACKEND_CPU; - size_t cur = ggml_type_size(node->type)*(node->n_tasks + node->src0->ne[0]*node->n_tasks); + size_t cur = ggml_type_size(node->type)*(n_threads + node->src0->ne[0]*n_threads); work_size = MAX(work_size, cur); } break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { - node->n_tasks = n_threads; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; - size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*node->n_tasks; + size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_threads; work_size = MAX(work_size, cur); } break; case GGML_OP_NONE: { - node->n_tasks = 1; + 
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_COUNT: { GGML_ASSERT(false); } break; + default: + GGML_ASSERT(false); } } @@ -16023,126 +16152,27 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) const int64_t perf_node_start_cycles = ggml_perf_cycles(); const int64_t perf_node_start_time_us = ggml_perf_time_us(); - // INIT - struct ggml_compute_params params = { - /*.type =*/ GGML_TASK_INIT, - /*.ith =*/ 0, - /*.nth =*/ node->n_tasks, - /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, - /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, - }; - - ggml_compute_forward(¶ms, node); - - // COMPUTE - if (node->n_tasks > 1) { - if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { - atomic_store(&state_shared.has_work, false); - } - - while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - // launch thread pool - for (int j = 0; j < n_threads - 1; j++) { - workers[j].params = (struct ggml_compute_params) { - .type = GGML_TASK_COMPUTE, - .ith = j + 1, - .nth = node->n_tasks, - .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0, - .wdata = cgraph->work ? cgraph->work->data : NULL, - }; - workers[j].node = node; - } - - atomic_fetch_sub(&state_shared.n_ready, 1); - - while (atomic_load(&state_shared.n_ready) > 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - atomic_store(&state_shared.has_work, true); + // TODO: can be moved out of loop? + void *wdata = NULL; + size_t wsize = 0; + if (cgraph->work) { + wdata = cgraph->work->data; + wsize = ggml_nbytes(cgraph->work); } - params.type = GGML_TASK_COMPUTE; - ggml_compute_forward(¶ms, node); - - // wait for thread pool - if (node->n_tasks > 1) { - if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { - atomic_store(&state_shared.has_work, false); - } - - while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - atomic_fetch_sub(&state_shared.n_ready, 1); - - while (atomic_load(&state_shared.n_ready) != 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - } - - // FINALIZE - if (node->n_tasks > 1) { - if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { - atomic_store(&state_shared.has_work, false); - } - - while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - // launch thread pool - for (int j = 0; j < n_threads - 1; j++) { - workers[j].params = (struct ggml_compute_params) { - .type = GGML_TASK_FINALIZE, - .ith = j + 1, - .nth = node->n_tasks, - .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0, - .wdata = cgraph->work ? 
cgraph->work->data : NULL, - }; - workers[j].node = node; - } - - atomic_fetch_sub(&state_shared.n_ready, 1); - - while (atomic_load(&state_shared.n_ready) > 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - atomic_store(&state_shared.has_work, true); - } - - params.type = GGML_TASK_FINALIZE; - ggml_compute_forward(¶ms, node); - - // wait for thread pool - if (node->n_tasks > 1) { - if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { - atomic_store(&state_shared.has_work, false); - } - - while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - atomic_fetch_sub(&state_shared.n_ready, 1); - - while (atomic_load(&state_shared.n_ready) != 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); + enum ggml_compute_error err = + ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize); + if (err == GGML_COMPUTE_FALLBACK) { + if (node->op == GGML_OP_MUL_MAT) { + const struct ggml_task_profile *p = + ggml_mulmat_get_default_task_profile( + NULL, node->src0->type, node->src1->type); + memcpy(&node->task_profile, p, + sizeof(struct ggml_task_profile)); } + err = ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize); } + GGML_ASSERT(err == GGML_COMPUTE_OK); // performance stats (node) { @@ -16155,19 +16185,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } } - // join thread pool - if (n_threads > 1) { - atomic_store(&state_shared.stop, true); - atomic_store(&state_shared.has_work, true); - - for (int j = 0; j < n_threads - 1; j++) { - int rc = ggml_thread_join(workers[j].thrd, NULL); - GGML_ASSERT(rc == 0); - UNUSED(rc); - } - - ggml_lock_destroy(&state_shared.spin); - } + ggml_threading_stop(thrd_ctx); // performance stats (graph) { @@ -16242,7 +16260,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char tensor->n_dims, ne[0], ne[1], ne[2], ne[3], nb[0], nb[1], nb[2], nb[3], - tensor->n_tasks, + tensor->task_profile.stages[0].parallel, // replaceed n_tasks. 
tensor->data, tensor->name); } @@ -18024,24 +18042,24 @@ int ggml_cpu_has_wasm_simd(void) { #endif } -int ggml_cpu_has_blas(void) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +int ggml_cpu_has_cublas(void) { +#if defined(GGML_USE_CUBLAS) return 1; #else return 0; #endif } -int ggml_cpu_has_cublas(void) { -#if defined(GGML_USE_CUBLAS) +int ggml_cpu_has_clblast(void) { +#if defined(GGML_USE_CLBLAST) return 1; #else return 0; #endif } -int ggml_cpu_has_clblast(void) { -#if defined(GGML_USE_CLBLAST) +int ggml_cpu_has_cpublas(void) { +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) return 1; #else return 0; @@ -18052,6 +18070,10 @@ int ggml_cpu_has_gpublas(void) { return ggml_cpu_has_cublas() || ggml_cpu_has_clblast(); } +int ggml_cpu_has_blas(void) { + return ggml_cpu_has_cpublas() || ggml_cpu_has_gpublas(); +} + int ggml_cpu_has_sse3(void) { #if defined(__SSE3__) return 1; diff --git a/ggml.h b/ggml.h index 1380c530fdae8..f51b658fd3abe 100644 --- a/ggml.h +++ b/ggml.h @@ -1,5 +1,7 @@ #pragma once + + // // GGML Tensor Library // @@ -200,6 +202,7 @@ #define GGML_MAX_OPT 4 #define GGML_MAX_NAME 32 #define GGML_DEFAULT_N_THREADS 4 +#define GGML_MAX_TASK_PROFILES 8 #define GGML_ASSERT(x) \ do { \ @@ -347,7 +350,6 @@ extern "C" { GGML_OP_COUNT, }; - // ggml object struct ggml_object { size_t offs; @@ -360,6 +362,54 @@ extern "C" { static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); + // As part of task config profile solution, `ggml_task_backend` defines + // backends for each task stage. Similar to `ggml_tensor.backend`, + // `ggml_tensor.task_profile` generalizes how to configure tensor computing + // at per task-stage level. + // + // The following enum values are designed as combination of hardware and + // optional software interface. + enum ggml_task_backend { + GGML_TASK_BACKEND_NONE = 0, + + // [0x10, 0x1F]: CPU + GGML_TASK_BACKEND_CPU = 0x10, + GGML_TASK_BACKEND_CPU_BLAS = 0x11, + + // [0x20 - 0x2F]: GPU + GGML_TASK_BACKEND_GPU = 0x20, + GGML_TASK_BACKEND_GPU_CUDA = 0x21, + GGML_TASK_BACKEND_GPU_CL = 0x22, + }; + + // config for computing one of the 3 task stages of a tensor. + struct ggml_task_stage { + enum ggml_task_backend backend; + bool parallel; + // hint idle workers go waiting, valid only when parallel is false. + bool wait; + }; + + // config for computing a tensor. + struct ggml_task_profile { + // index 0: INIT, 1: COMPUTE, 2: FINALIZE + struct ggml_task_stage stages[3]; + + // MUST be used only in testing codes. + uint8_t dev_flags[4]; + }; + + struct ggml_task_profile_factory { + struct ggml_task_profile f32_f32[GGML_MAX_TASK_PROFILES]; + int n_f32_f32; + + struct ggml_task_profile f16_f32[GGML_MAX_TASK_PROFILES]; + int n_f16_f32; + + struct ggml_task_profile qxx_f32[GGML_MAX_TASK_PROFILES]; + int n_qxx_f32; + }; + // n-dimensional tensor struct ggml_tensor { enum ggml_type type; @@ -383,7 +433,8 @@ extern "C" { struct ggml_tensor * opt[GGML_MAX_OPT]; // thread scheduling - int n_tasks; + + struct ggml_task_profile task_profile; // performance int perf_runs; @@ -396,7 +447,7 @@ extern "C" { void * extra; // extra things e.g. 
for ggml-cuda.cu - char padding[4]; + char padding[12]; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); @@ -407,6 +458,8 @@ extern "C" { int n_leafs; int n_threads; + struct ggml_mulmat_tune *tune; + size_t work_size; struct ggml_tensor * work; @@ -1287,9 +1340,21 @@ extern "C" { GGML_API int ggml_cpu_has_cublas (void); GGML_API int ggml_cpu_has_clblast (void); GGML_API int ggml_cpu_has_gpublas (void); + GGML_API int ggml_cpu_has_cpublas (void); GGML_API int ggml_cpu_has_sse3 (void); GGML_API int ggml_cpu_has_vsx (void); + // + // mulmat task profiles + // + GGML_API void ggml_mulmat_init_task_profiles(void); + + GGML_API int ggml_mulmat_get_task_profiles( + struct ggml_task_profile_factory *pf, + enum ggml_type src0_t, + enum ggml_type src1_t, + struct ggml_task_profile **profiles); + // // Internal types and functions exposed for tests and benchmarks // diff --git a/llama.cpp b/llama.cpp index c165d3239e63f..fa5a94e21f0ab 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #endif #include "llama-util.h" @@ -20,6 +21,10 @@ #include "ggml-metal.h" #endif +#ifdef GGML_USE_MULMAT_TUNE +#include "ggml-tune.h" +#endif + #include #include #include @@ -280,6 +285,10 @@ struct llama_context { int buf_last = 0; size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; +#ifdef GGML_USE_MULMAT_TUNE + struct ggml_mulmat_tune *tune = nullptr; +#endif + void use_buf(struct ggml_context * ctx, int i) { #if defined(LLAMA_USE_SCRATCH) size_t last_size = 0; @@ -1396,10 +1405,12 @@ static bool llama_eval_internal( struct ggml_context * ctx0 = ggml_init(params); - // for big prompts, if BLAS is enabled, it is better to use only one thread - // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance ggml_cgraph gf = {}; - gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 
1 : n_threads; + gf.n_threads = n_threads; + +#ifdef GGML_USE_MULMAT_TUNE + gf.tune =lctx.tune; +#endif struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); ggml_set_name(embd, "embd"); @@ -2732,7 +2743,150 @@ struct llama_context * llama_init_from_file( return ctx; } +#ifdef GGML_USE_MULMAT_TUNE +bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) { + printf("\n"); + if (ctx->model.n_gpu_layers != 0) { + fprintf(stderr, "[mulmat tune] error: is disabled by GPU offloading\n"); + return false; + } + + const char *model_name = llama_model_type_name(ctx->model.type); + + llama_hparams *hparams = &ctx->model.hparams; + + enum ggml_ftype ggml_ftype; + switch (hparams->ftype) { + case LLAMA_FTYPE_ALL_F32: + ggml_ftype = GGML_FTYPE_ALL_F32; + break; + case LLAMA_FTYPE_MOSTLY_F16: + ggml_ftype = GGML_FTYPE_MOSTLY_F16; + break; + case LLAMA_FTYPE_MOSTLY_Q4_0: + ggml_ftype = GGML_FTYPE_MOSTLY_Q4_0; + break; + case LLAMA_FTYPE_MOSTLY_Q4_1: + ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1; + break; + case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: + ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1_SOME_F16; + break; + case LLAMA_FTYPE_MOSTLY_Q5_0: + ggml_ftype = GGML_FTYPE_MOSTLY_Q5_0; + break; + case LLAMA_FTYPE_MOSTLY_Q5_1: + ggml_ftype = GGML_FTYPE_MOSTLY_Q5_1; + break; + case LLAMA_FTYPE_MOSTLY_Q8_0: + ggml_ftype = GGML_FTYPE_MOSTLY_Q8_0; + break; + case LLAMA_FTYPE_MOSTLY_Q2_K: + ggml_ftype = GGML_FTYPE_MOSTLY_Q2_K; + break; + case LLAMA_FTYPE_MOSTLY_Q3_K_S: + case LLAMA_FTYPE_MOSTLY_Q3_K_M: + case LLAMA_FTYPE_MOSTLY_Q3_K_L: + ggml_ftype = GGML_FTYPE_MOSTLY_Q3_K; + break; + case LLAMA_FTYPE_MOSTLY_Q4_K_S: + case LLAMA_FTYPE_MOSTLY_Q4_K_M: + ggml_ftype = GGML_FTYPE_MOSTLY_Q4_K; + break; + case LLAMA_FTYPE_MOSTLY_Q5_K_S: + case LLAMA_FTYPE_MOSTLY_Q5_K_M: + ggml_ftype = GGML_FTYPE_MOSTLY_Q5_K; + break; + case LLAMA_FTYPE_MOSTLY_Q6_K: + ggml_ftype = GGML_FTYPE_MOSTLY_Q6_K; + break; + default: + throw std::runtime_error( + format("invalid output file type %d\n", hparams->ftype)); + } + + int n_vocab = hparams->n_vocab; + int n_embd = hparams->n_embd; + int n_rot = hparams->n_rot; + + int n_mult = hparams->n_mult; + int n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult; + + struct ggml_mulmat_tune_params params = { + /*.model =*/ { + /* .name =*/ model_name, + /* .ftype =*/ ggml_ftype, + /* .n_vocab =*/ n_vocab, + /* .n_embd =*/ n_embd, + /* .n_ff =*/ n_ff, + /* .n_rot =*/ n_rot, + }, + /* .m_num =*/ 8, + /* .n_pass =*/ 1, + /* .n_threads =*/ n_threads, + /* .prrogress =*/ true, + /* .output_console =*/ false, + /* .fname =*/ fname, + }; + + bool empty_fname = !fname || strcmp(fname, "") == 0; + + ctx->tune = new(struct ggml_mulmat_tune); + if (!ctx->tune) { + throw std::runtime_error(format("failed to allocate memory for tune\n")); + } + + if (tune) { + bool ok = ggml_mulmat_tune_bench(ctx->tune, ¶ms); + if (!ok) { + ggml_mulmat_tune_free(ctx->tune); + return false; + } + if (!empty_fname) { + ggml_mulmat_tune_free(ctx->tune); + return true; + } + } else { + if (empty_fname) { + return false; + } + } + + if (!empty_fname) { + FILE *fp = fopen(fname, "r"); + if (!fp) { + fprintf(stderr, "[mulmat tune] failed to open file %s.\n", + fname); + } else { + bool ok = ggml_mulmat_tune_read_data(ctx->tune, fp); + fclose(fp); + + if (!ok) { + fprintf(stderr, + "[mulmat tune] failed to read data from %s\n", + fname); + return false; + } + + fprintf(stderr, "[mulmat tune] loaded data from %s\n", fname); + + ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype, 
params.n_threads); + if (!ok) { + return false; + } + } + } + + return true; +} +#endif + void llama_free(struct llama_context * ctx) { +#ifdef GGML_USE_MULMAT_TUNE + if (ctx->tune) { + delete(ctx->tune); + } +#endif delete ctx; } diff --git a/llama.h b/llama.h index 1241ba6c0ec44..c3f6a21548a52 100644 --- a/llama.h +++ b/llama.h @@ -300,6 +300,9 @@ extern "C" { // Print system information LLAMA_API const char * llama_print_system_info(void); + // Experimental utility functionality for mulmat tunning. + LLAMA_API bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname); + #ifdef __cplusplus } #endif diff --git a/tests/.gitignore b/tests/.gitignore new file mode 100644 index 0000000000000..f4b8ee1b32633 --- /dev/null +++ b/tests/.gitignore @@ -0,0 +1,2 @@ +/test-ggml-threading +/test-ggml-tune diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4171c126c7b7d..977b8ef6db032 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -12,3 +12,5 @@ llama_add_test(test-sampling.cpp) llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin) # llama_add_test(test-grad0.c) # SLOW # llama_add_test(test-opt.c) # SLOW +llama_add_test(test-ggml-threading.c) +llama_add_test(test-ggml-tune.c) diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c new file mode 100644 index 0000000000000..0b47623e2d0c1 --- /dev/null +++ b/tests/test-ggml-threading.c @@ -0,0 +1,345 @@ +#include "ggml-threading.h" +#include "ggml.h" + +#include +#include +#include +#include + +// Purposes: +// 1. general overview of the threading behaviors. +// 2. race (dead lock) detection. + +// # build +// cd build +// +// # build release: +// cmake .. && cmake --build . --config Release +// +// # build with sanitize: +// cmake .. -DLLAMA_SANITIZE_THREAD=ON && cmake --build . --config Release +// +// # run: +// ./bin/test-ggml-threading + +// How to turn off the warning on Apple: malloc: nano zone abandoned due to +// inability to reserve vm space? +// ==> export MallocNanoZone=0, no need to rebuild. +// See `nano_init()` from +// https://opensource.apple.com/source/libmalloc/libmalloc-140.40.1/src/nano_malloc.c.auto.html + +// How to view the threading debug: +// ==> uncomment `#define GGML_THREADING_DEBUG 1` from file ggml-threading.c + +#define UNUSED(x) (void)(x) + +#define MAX_N_THREADS 16 + +static const int n_repeat = 10; + +// It's frustrating to use atomic with c11 on Windows, let's replace atomic +// counter with array. 
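For contrast, a minimal sketch of the single shared-counter variant this test avoids (hypothetical, not part of the patch): with C11 atomics the counter must be incremented atomically, which is awkward on Windows toolchains as noted above, whereas the per-thread array used below needs no synchronization because each worker only ever writes its own `work_done_arr[params->ith]` slot.

```
// Hypothetical single-counter variant using C11 atomics (not used here):
#include <stdatomic.h>

static atomic_int work_done_total;

static void count_work_done(void) {
    // relaxed ordering is enough for a pure statistics counter
    atomic_fetch_add_explicit(&work_done_total, 1, memory_order_relaxed);
}
```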
+static int work_done_arr[MAX_N_THREADS]; + +static enum ggml_compute_error +mock_task_runner(struct ggml_compute_params *params, struct ggml_tensor *node) { + int64_t loops = node->task_profile.dev_flags[1] * 1000 * 1000; + if (node->task_profile.stages[params->type].parallel) { + loops /= params->nth; + } + + volatile int64_t j = 0; + for (int i = 0; i < loops; i++) { + j++; + } + + UNUSED(j); + + work_done_arr[params->ith]++; + return GGML_COMPUTE_OK; +} + +int test_driver(int id, struct ggml_tensor *node, int n_threads) { + printf("\n[test-ggml-threading] #%d, n_threads: %d\n", id, n_threads); + + for (int i = 0; i < n_threads; i++) { + work_done_arr[i] = 0; + } + + bool wait_on_done = (node->task_profile.dev_flags[0] > 0u); + + enum ggml_threading_features features = GGML_THREADING_FEATURE_PERF; + if (wait_on_done) { + features |= GGML_THREADING_FEATURE_WAIT_ON_DONE; + } + + int t0 = (int)ggml_time_us(); + + struct ggml_threading_context *ctx = + ggml_threading_start(n_threads, ggml_threading_graph_compute_thread, + mock_task_runner, features, /*stages_time*/ NULL); + + int t1 = (int)ggml_time_us(); + + for (int i = 0; i < n_repeat; i++) { + enum ggml_compute_error err = ggml_threading_compute_tensor( + ctx, node, /*wdata*/ NULL, /*wsize*/ 0); + if (err != GGML_COMPUTE_OK) { + ggml_threading_stop(ctx); + fprintf(stderr, + "ggml_threading_compute_tensor failed with error: %d.\n", + err); + return 1; + } + } + + int t2 = (int)ggml_time_us(); + + ggml_threading_stop(ctx); + + int t3 = (int)ggml_time_us(); + + int expect = 0; + for (int i = 0; i < 3; i++) { + struct ggml_task_stage *ts = &node->task_profile.stages[i]; + if (ts->backend != GGML_TASK_BACKEND_NONE) { + if (ts->parallel) { + expect += n_threads; + } else { + expect++; + } + } + } + expect *= n_repeat; + + int actual = 0; + for (int i = 0; i < n_threads; i++) { + actual += work_done_arr[i]; + } + + uint8_t loops = node->task_profile.dev_flags[1]; + + printf("\tloops: %2d million(s), ---wait_on_done---: %d\n\tstage-0: " + "(parallel: %d, " + "wait: %d)\n" + "\tstage-1: (parallel: %d, wait: %d)\n", + loops, wait_on_done, node->task_profile.stages[0].parallel, + node->task_profile.stages[0].wait, + node->task_profile.stages[1].parallel, + node->task_profile.stages[1].wait); + + if (actual == expect) { + printf("\tthreading: init %6.3f ms, compute %6.3f ms, cleanup %6.3f " + "ms, total %6.3f ms\n", + 1.0 * (t1 - t0) / 1000, 1.0 * (t2 - t1) / 1000, + 1.0 * (t3 - t2) / 1000, 1.0 * (t3 - t0) / 1000); + return 0; + } + + fprintf(stderr, "\t== failed. expect %d done, actual %d done\n\n", expect, + actual); + + return 2; +} + +static enum ggml_compute_error +mock_task_runner_fallback(struct ggml_compute_params *params, + struct ggml_tensor *node) { + UNUSED(params); + if (node->backend == GGML_BACKEND_GPU) { + // ... finally failed to compute in GPU. + + node->backend = GGML_BACKEND_CPU; + return GGML_COMPUTE_FALLBACK; + } else { + return GGML_COMPUTE_OK; + } +} + +// By design, fallback should happen when attempt computing tensor in GPU, +// thus it is not parallelled. 
+int test_fallback(struct ggml_tensor *node) { + struct ggml_threading_context *ctx = ggml_threading_start( + 1, ggml_threading_graph_compute_thread, mock_task_runner_fallback, + /*features*/ GGML_THREADING_FEATURE_NONE, /*stages_time*/ NULL); + + enum ggml_compute_error err = + ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0); + if (err == GGML_COMPUTE_FALLBACK) { + err = ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, + /*wsize*/ 0); + } + + ggml_threading_stop(ctx); + if (err != GGML_COMPUTE_OK) { + fprintf(stderr, + "ggml_threading_compute_tensor failed with error: %d.\n", err); + return 1; + } + + return 0; +} + +int main(void) { + ggml_time_init(); + + struct ggml_tensor node; + memset(&node, 0, sizeof(struct ggml_tensor)); + + struct ggml_task_stage *stages = node.task_profile.stages; + + stages[0].backend = GGML_TASK_BACKEND_CPU; + stages[1].backend = GGML_TASK_BACKEND_CPU; + stages[2].backend = GGML_TASK_BACKEND_NONE; + + int n_passed = 0; + int n_tests = 0; + + int parallel[3] = {0, 1, 2}; + + // In github build actions (windows-latest-cmake and ubuntu-latest-cmake): + // When n_threads >= 4, the thread init time and compute time suddenly goes + // down to 100x ~ 1000x slow -- comparing to n_threads == 2. + // + // But the tests (n_threads 1, 2, 4, 6) looks sound on my devices: + // - MacBook air 2013, ubuntu 22.04 + // - MacBook pro 2018, macOS 13.4 + // + // So I assume the github build host has limited multi-cpu quota. + // Will skip computing when threading init time is too slow. + // + // NOTE: it's observed that when workload is 0 and n_threads >= number of + // physical cores: + // - the wait/wakeup time varies much: can be up to tens or hundreds of the + // average time, thus greatly punishes those small workloads. + // - wait_on_done is general faster than wait_now, can be 10x faster. + + int threads_arr[] = {1, 2, 4, 8}; + int threads_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]); + + // millions of loops. + uint8_t workload_arr[] = {0u, 1u, 10u}; + int workload_arr_len = sizeof(workload_arr) / sizeof(workload_arr[0]); + + // node.task_profile.dev_flags: byte 0 for wait_on_done, byte 1 for loops. + + for (int x = 0; x < workload_arr_len; x++) { + node.task_profile.dev_flags[1] = workload_arr[x]; + + for (int i = 0; i < threads_arr_len; i++) { + int n_threads = threads_arr[i]; + if (n_threads > MAX_N_THREADS) { + abort(); + } + + printf("\n[test-ggml-threading] ==== n_nodes: %d, n_threads: %d, " + "loops: %2d million(s) ====\n", + n_repeat, n_threads, workload_arr[x]); + + if (n_threads > 1) { // skip this n_threads when too slow. 
+ int t0 = (int)ggml_time_us(); + + struct ggml_threading_context *ctx = ggml_threading_start( + n_threads, ggml_threading_graph_compute_thread, + mock_task_runner, 0, /*stages_time*/ NULL); + + int t1 = (int)ggml_time_us(); + + ggml_threading_stop(ctx); + + int elapsed_us = t1 - t0; + if (elapsed_us > 500 * n_threads) { + fprintf(stderr, + "[test-ggml-threading] warning: it took took %.3f " + "ms to start %d worker thread(s).\n", + 1.0 * elapsed_us / 1000, n_threads - 1); + fprintf(stderr, "[test-ggml-threading] warning: looks like " + "the environment is too slow to run this " + "number of threads, skip.\n"); + continue; + } + } + + // multi-threads: parallel + wait_now/wait_on_done + + if (n_threads == 1) { + stages[0].parallel = false; + stages[1].parallel = false; + stages[0].wait = false; + stages[1].wait = false; + + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; + } + continue; + } + + for (int j = 0; j < 3; j++) { + stages[0].wait = false; + stages[1].wait = false; + node.task_profile.dev_flags[0] = 0u; + + if (parallel[j] == 0) { + stages[0].parallel = false; + stages[1].parallel = false; + + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; + } + } else if (parallel[j] == 1) { + stages[0].parallel = true; + stages[1].parallel = false; + + for (int k = 0; k < 2; k++) { + stages[1].wait = (k == 1); + + if (!stages[1].wait) { + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; + } + continue; + } + + // wait + + for (int m = 0; m < 2; m++) { + if (m == 1) { + node.task_profile.dev_flags[0] = 1u; + } + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; + } + node.task_profile.dev_flags[0] = 0u; + } + } + } else { + stages[0].parallel = true; + stages[1].parallel = true; + + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; + } + } + } + } + } + + { + ++n_tests; + + node.backend = GGML_BACKEND_GPU; + if (test_fallback(&node) == 0) { + ++n_passed; + printf("\n[test-ggml-threading] test fallback: ok\n\n"); + } + } + + printf("[test-ggml-threading] %d/%d passed.\n", n_passed, n_tests); + + return (n_passed == n_tests) ? 
0 : 1; +} diff --git a/tests/test-ggml-tune.c b/tests/test-ggml-tune.c new file mode 100644 index 0000000000000..ed612fff45562 --- /dev/null +++ b/tests/test-ggml-tune.c @@ -0,0 +1,200 @@ +#include "ggml-tune.h" +#include "ggml.h" + +#include + +static int bench(void); +static int estimate_time_non_zero_NK(void); + +static void init_params(struct ggml_mulmat_tune_params *params, int m_num) { + *params = (struct ggml_mulmat_tune_params){ + .model = + (struct ggml_mulmat_tune_model){ + .name = "3B", // fake + .ftype = GGML_FTYPE_MOSTLY_Q4_0, + .n_vocab = 4096, + .n_embd = 1024, + .n_ff = 2048, + .n_rot = 128, + }, + .m_num = m_num, + .n_pass = 1, + .n_threads = 1, + .progress = false, + .output_console = true, + .fname = NULL}; +} + +int main(void) { + int rv = bench(); + if (rv != 0) { + return rv; + } + + printf("\n"); + + rv = estimate_time_non_zero_NK(); + if (rv != 0) { + return rv; + } + printf("\n"); + + return 0; +} + +static int bench(void) { + printf("test: %s\n", __func__); + + { + enum ggml_task_backend backends[16]; + int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends); + if (n_backends < 2) { + printf("test: %s, skipped because no BLAS\n", __func__); + return 0; + } + } + + { + struct ggml_init_params init_params = { + /*.mem_size =*/1, + /*.mem_buffer =*/NULL, + /*.no_alloc =*/0, + }; + struct ggml_context *ctx = ggml_init(init_params); + GGML_ASSERT(ctx); + ggml_free(ctx); + } + + struct ggml_mulmat_tune tune; + + struct ggml_mulmat_tune_params params; + + init_params(¶ms, /*m_num*/ 4); + + bool ok = ggml_mulmat_tune_bench(&tune, ¶ms); + ggml_mulmat_tune_free(&tune); + + return ok ? 0 : 1; +} + +int estimate_time_non_zero_NK(void) { + printf("test: %s\n", __func__); + + struct test_data_t { + int M; + int time[3]; // 3 profiles. 
+ }; + + struct ggml_mulmat_tune tune = { + .version = 1, + .ftype = GGML_FTYPE_MOSTLY_Q4_0, + }; + + const int m_num = 2; + + struct ggml_task_profile_factory pf; + memset(&pf, 0, sizeof(struct ggml_task_profile_factory)); + + { + pf.n_qxx_f32 = 2; + pf.qxx_f32[0].stages[0].backend = GGML_TASK_BACKEND_CPU; + pf.qxx_f32[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + + pf.qxx_f32[1].stages[0].backend = GGML_TASK_BACKEND_CPU; + pf.qxx_f32[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; + } + + struct ggml_mulmat_tune_params params; + init_params(¶ms, m_num); + + ggml_mulmat_tune_init(&tune, ¶ms, &pf); + + struct ggml_mulmat_tune_shape *shape = NULL; + for (int i = 0; i < tune.n_shapes; i++) { + if (tune.shapes[i].N > 0 && tune.shapes[i].K > 0) { + shape = &tune.shapes[i]; + break; + } + } + GGML_ASSERT(shape); + GGML_ASSERT(shape->n_profiles == 2); + GGML_ASSERT(ggml_is_quantized(shape->src0_type)); + + printf("shape: N: %d, K: %d, n_profiles: %d\n", shape->N, shape->K, + shape->n_profiles); + + { + shape->items[0] = + (struct ggml_mulmat_tune_m){.M = 2, .stages_time = {2, 4, 0}}; + shape->items[1] = + (struct ggml_mulmat_tune_m){.M = 4, .stages_time = {4, 8, 0}}; + + shape->items[2] = + (struct ggml_mulmat_tune_m){.M = 2, .stages_time = {4, 4, 0}}; + shape->items[3] = + (struct ggml_mulmat_tune_m){.M = 4, .stages_time = {4, 4, 0}}; + } + + const struct test_data_t test_data[] = { + { + .M = 1, // out of range + .time = {3, 8}, + }, + { + .M = 2, + .time = {6, 8}, + }, + { + .M = 3, + .time = {9, 8}, + }, + { + .M = 4, + .time = {12, 8}, + }, + { + .M = 5, // out of range + .time = {15, 8}, + }, + }; + + int n_tests = (int)(sizeof(test_data) / sizeof(struct test_data_t)); + + struct ggml_mulmat_tune_time profile_time[GGML_MAX_TASK_PROFILES]; + size_t profile_time_sz = + sizeof(struct ggml_mulmat_tune_time) * GGML_MAX_TASK_PROFILES; + + int n_passed = 0; + for (int i = 0; i < n_tests; i++) { + memset(profile_time, 0, profile_time_sz); + const struct test_data_t *e = &test_data[i]; + + const struct ggml_mulmat_tune_shape *matched_shape = + ggml_mulmat_tune_get_shape(&tune, shape->N, shape->K, + shape->src0_type, shape->src1_type); + GGML_ASSERT(matched_shape); + GGML_ASSERT(matched_shape == shape); + + ggml_mulmat_tune_estimate_time(matched_shape, e->M, profile_time); + + for (int j = 0; j < shape->n_profiles; j++) { + int actual = profile_time[j].total_time; + int expect = e->time[j]; + if (expect != actual) { + fprintf(stderr, + "test fail. i: %d, j: %d, M: %d, expect: " + "%d, actual: %d\n", + i, j, e->M, expect, actual); + } else { + ++n_passed; + } + } + } + + n_tests *= shape->n_profiles; + printf("%2d of %2d pass\n", n_passed, n_tests); + + ggml_mulmat_tune_free(&tune); + + return n_passed == n_tests ? 
0 : 1; +} From 1b041d773730efbfa98743ddc4446bbf14cafc8d Mon Sep 17 00:00:00 2001 From: mqy Date: Wed, 14 Jun 2023 21:17:14 +0800 Subject: [PATCH 02/24] threading test: improve readability at both codes and output --- tests/test-ggml-threading.c | 192 ++++++++++++++++++++---------------- 1 file changed, 108 insertions(+), 84 deletions(-) diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c index 0b47623e2d0c1..ed9d8aa2bcdd8 100644 --- a/tests/test-ggml-threading.c +++ b/tests/test-ggml-threading.c @@ -60,7 +60,11 @@ mock_task_runner(struct ggml_compute_params *params, struct ggml_tensor *node) { } int test_driver(int id, struct ggml_tensor *node, int n_threads) { - printf("\n[test-ggml-threading] #%d, n_threads: %d\n", id, n_threads); + uint8_t loops = node->task_profile.dev_flags[1]; + printf( + "\n[test-ggml-threading] #%02d, workload: %2d million(s), n_threads: " + "%2d\n", + id, loops, n_threads); for (int i = 0; i < n_threads; i++) { work_done_arr[i] = 0; @@ -86,9 +90,8 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) { ctx, node, /*wdata*/ NULL, /*wsize*/ 0); if (err != GGML_COMPUTE_OK) { ggml_threading_stop(ctx); - fprintf(stderr, - "ggml_threading_compute_tensor failed with error: %d.\n", - err); + printf("ggml_threading_compute_tensor failed with error: %d.\n", + err); return 1; } } @@ -99,9 +102,11 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) { int t3 = (int)ggml_time_us(); + const struct ggml_task_stage *stages = node->task_profile.stages; + int expect = 0; for (int i = 0; i < 3; i++) { - struct ggml_task_stage *ts = &node->task_profile.stages[i]; + const struct ggml_task_stage *ts = &stages[i]; if (ts->backend != GGML_TASK_BACKEND_NONE) { if (ts->parallel) { expect += n_threads; @@ -117,16 +122,10 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) { actual += work_done_arr[i]; } - uint8_t loops = node->task_profile.dev_flags[1]; - - printf("\tloops: %2d million(s), ---wait_on_done---: %d\n\tstage-0: " - "(parallel: %d, " - "wait: %d)\n" - "\tstage-1: (parallel: %d, wait: %d)\n", - loops, wait_on_done, node->task_profile.stages[0].parallel, - node->task_profile.stages[0].wait, - node->task_profile.stages[1].parallel, - node->task_profile.stages[1].wait); + printf("\tstage-0: parallel: %d, wait: %d\n\tstage-1: parallel: %d, wait: " + "%d, wait_on_done: %d %s\n", + stages[0].parallel, stages[0].wait, stages[1].parallel, + stages[1].wait, wait_on_done, stages[1].wait ? "<--------" : ""); if (actual == expect) { printf("\tthreading: init %6.3f ms, compute %6.3f ms, cleanup %6.3f " @@ -136,8 +135,7 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) { return 0; } - fprintf(stderr, "\t== failed. expect %d done, actual %d done\n\n", expect, - actual); + printf("\t== failed. expect %d done, actual %d done\n\n", expect, actual); return 2; } @@ -172,8 +170,7 @@ int test_fallback(struct ggml_tensor *node) { ggml_threading_stop(ctx); if (err != GGML_COMPUTE_OK) { - fprintf(stderr, - "ggml_threading_compute_tensor failed with error: %d.\n", err); + printf("ggml_threading_compute_tensor failed with error: %d.\n", err); return 1; } @@ -195,8 +192,6 @@ int main(void) { int n_passed = 0; int n_tests = 0; - int parallel[3] = {0, 1, 2}; - // In github build actions (windows-latest-cmake and ubuntu-latest-cmake): // When n_threads >= 4, the thread init time and compute time suddenly goes // down to 100x ~ 1000x slow -- comparing to n_threads == 2. 
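(An aside on `estimate_time_non_zero_NK` from the tune test added in the previous patch: the expected `time` values can be checked by hand, assuming `ggml_mulmat_tune_estimate_time` interpolates and extrapolates each stage time linearly in M between the two benchmarked points, which is what the expected values imply. A sketch of that arithmetic:)

```
// Hand-check of the expected totals in tests/test-ggml-tune.c.
// Profile 0 (pure CPU) was benchmarked at M=2 (init 2, compute 4) and
// M=4 (init 4, compute 8), so linearly: init(M) = M, compute(M) = 2*M.
static int expected_cpu_total(int M) {
    return M + 2 * M; // -> 3, 6, 9, 12, 15 for M = 1..5, matching test_data
}
// Profile 1 (CPU + CPU_BLAS) has constant stage times 4 + 4 at both
// benchmarked M values, so its estimate stays 8 for every M.
```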
@@ -214,13 +209,47 @@ int main(void) { // average time, thus greatly punishes those small workloads. // - wait_on_done is general faster than wait_now, can be 10x faster. - int threads_arr[] = {1, 2, 4, 8}; + int threads_arr[] = {1, 2, 4, 6, 8, 16}; int threads_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]); // millions of loops. uint8_t workload_arr[] = {0u, 1u, 10u}; int workload_arr_len = sizeof(workload_arr) / sizeof(workload_arr[0]); + // skip slow/big n_threads. + for (int i = 0; i < threads_arr_len; i++) { + int n_threads = threads_arr[i]; + + if (n_threads == 1) { + continue; + } else if (n_threads > MAX_N_THREADS) { + printf("[test-ggml-threading] warning: the n_threads (%d) is too " + "big, allow at most %d, skip.\n", + n_threads, MAX_N_THREADS); + threads_arr[i] = 0; + continue; + } + + // skip this n_threads when too slow. + int t0 = (int)ggml_time_us(); + + struct ggml_threading_context *ctx = + ggml_threading_start(n_threads, ggml_threading_graph_compute_thread, + mock_task_runner, 0, /*stages_time*/ NULL); + + int t1 = (int)ggml_time_us(); + + ggml_threading_stop(ctx); + + int elapsed_us = t1 - t0; + if (elapsed_us > 500 * n_threads) { + printf("[test-ggml-threading] warning: it took took %.3f " + "ms to start %d worker thread(s). Loo slow, skip.\n", + 1.0 * elapsed_us / 1000, n_threads - 1); + threads_arr[i] = 0; + } + } + // node.task_profile.dev_flags: byte 0 for wait_on_done, byte 1 for loops. for (int x = 0; x < workload_arr_len; x++) { @@ -228,101 +257,96 @@ int main(void) { for (int i = 0; i < threads_arr_len; i++) { int n_threads = threads_arr[i]; - if (n_threads > MAX_N_THREADS) { - abort(); + if (n_threads <= 0) { + continue; } - printf("\n[test-ggml-threading] ==== n_nodes: %d, n_threads: %d, " - "loops: %2d million(s) ====\n", - n_repeat, n_threads, workload_arr[x]); - - if (n_threads > 1) { // skip this n_threads when too slow. 
- int t0 = (int)ggml_time_us(); + printf("\n[test-ggml-threading] ==== workload: %2d million(s), " + "n_threads: %2d ====\n", + workload_arr[x], n_threads); - struct ggml_threading_context *ctx = ggml_threading_start( - n_threads, ggml_threading_graph_compute_thread, - mock_task_runner, 0, /*stages_time*/ NULL); + // multi-threads: parallel + wait_now/wait_on_done - int t1 = (int)ggml_time_us(); + if (n_threads == 1) { + stages[0].parallel = false; + stages[1].parallel = false; + stages[0].wait = false; + stages[1].wait = false; - ggml_threading_stop(ctx); + node.task_profile.dev_flags[0] = 0u; - int elapsed_us = t1 - t0; - if (elapsed_us > 500 * n_threads) { - fprintf(stderr, - "[test-ggml-threading] warning: it took took %.3f " - "ms to start %d worker thread(s).\n", - 1.0 * elapsed_us / 1000, n_threads - 1); - fprintf(stderr, "[test-ggml-threading] warning: looks like " - "the environment is too slow to run this " - "number of threads, skip.\n"); - continue; + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; } + continue; } - // multi-threads: parallel + wait_now/wait_on_done - - if (n_threads == 1) { + { // no parallel, no wait stages[0].parallel = false; stages[1].parallel = false; stages[0].wait = false; stages[1].wait = false; + node.task_profile.dev_flags[0] = 0u; + n_tests++; if (test_driver(n_tests, &node, n_threads) == 0) { n_passed++; } - continue; } - for (int j = 0; j < 3; j++) { + { // both parallel, no wait + stages[0].parallel = true; + stages[1].parallel = true; stages[0].wait = false; stages[1].wait = false; + node.task_profile.dev_flags[0] = 0u; - if (parallel[j] == 0) { - stages[0].parallel = false; - stages[1].parallel = false; + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; + } + } + + { // stage 0 parallel, stage 1 may wait + stages[0].parallel = true; + stages[1].parallel = false; + stages[0].wait = false; + + { // stage 1 no wait + stages[1].wait = false; + node.task_profile.dev_flags[0] = 0u; n_tests++; if (test_driver(n_tests, &node, n_threads) == 0) { n_passed++; } - } else if (parallel[j] == 1) { - stages[0].parallel = true; - stages[1].parallel = false; - - for (int k = 0; k < 2; k++) { - stages[1].wait = (k == 1); - - if (!stages[1].wait) { - n_tests++; - if (test_driver(n_tests, &node, n_threads) == 0) { - n_passed++; - } - continue; - } + } + + { // stage 1 wait + stages[1].wait = true; + if (stages[1].parallel) { + abort(); + } - // wait - - for (int m = 0; m < 2; m++) { - if (m == 1) { - node.task_profile.dev_flags[0] = 1u; - } - n_tests++; - if (test_driver(n_tests, &node, n_threads) == 0) { - n_passed++; - } - node.task_profile.dev_flags[0] = 0u; + { // disable wait_on_done + node.task_profile.dev_flags[0] = 0u; // wait now. 
+ + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; } } - } else { - stages[0].parallel = true; - stages[1].parallel = true; - n_tests++; - if (test_driver(n_tests, &node, n_threads) == 0) { - n_passed++; + { // enable wait_on_done + node.task_profile.dev_flags[0] = 1u; // wait on done + + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; + } } } } From 48016f685c9377a5bfe9810c3f736bb2fa527125 Mon Sep 17 00:00:00 2001 From: mqy Date: Thu, 15 Jun 2023 06:43:08 +0800 Subject: [PATCH 03/24] bulk refactored task profile to support complete fallback; enable tune by default for ease of dev --- CMakeLists.txt | 8 +- Makefile | 8 +- examples/common.cpp | 6 +- examples/main/main.cpp | 2 +- examples/mulmat-tune/README.md | 21 +- examples/mulmat-tune/mulmat-tune.cpp | 2 - examples/perplexity/perplexity.cpp | 2 +- ggml-threading.c | 18 +- ggml-tune.c | 59 ++- ggml-tune.h | 8 +- ggml.c | 615 +++++++++++++++------------ ggml.h | 28 +- llama.cpp | 10 +- tests/test-ggml-threading.c | 11 + tests/test-ggml-tune.c | 30 +- 15 files changed, 457 insertions(+), 371 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 832c1e986a6eb..716673da28c5f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,7 +78,7 @@ option(LLAMA_K_QUANTS "llama: use k-quants" option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" OFF) -option(LLAMA_MULMAT_TUNE "llama: mulmat tune" OFF) +option(LLAMA_TUNE "llama: mulmat tune" ON) # # Build info header @@ -278,9 +278,9 @@ if (LLAMA_METAL) ) endif() -if (LLAMA_MULMAT_TUNE) - add_compile_definitions(GGML_USE_MULMAT_TUNE) - add_compile_definitions(GGML_MULMAT_TUNE_NDEBUG) +if (LLAMA_TUNE) + add_compile_definitions(GGML_USE_TUNE) + add_compile_definitions(GGML_TUNE_NDEBUG) endif() if (LLAMA_K_QUANTS) diff --git a/Makefile b/Makefile index a8d1bdc0991ae..531f62fb01347 100644 --- a/Makefile +++ b/Makefile @@ -231,14 +231,14 @@ ifneq ($(filter armv8%,$(UNAME_M)),) CFLAGS += -mfp16-format=ieee -mno-unaligned-access endif -ifdef LLAMA_NO_K_QUANTS +ifndef LLAMA_NO_K_QUANTS k_quants.o: k_quants.c k_quants.h $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_NO_K_QUANTS -ifdef LLAMA_MULMAT_TUNE - CFLAGS += -DGGML_USE_MULMAT_TUNE -DGGML_MULMAT_TUNE_NDEBUG - CXXFLAGS += -DGGML_USE_MULMAT_TUNE +ifndef LLAMA_NO_TUNE +CFLAGS += -DGGML_USE_TUNE -DGGML_TUNE_NDEBUG +CXXFLAGS += -DGGML_USE_TUNE endif # diff --git a/examples/common.cpp b/examples/common.cpp index 882e90c9c3649..fd6df49477949 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -345,7 +345,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.mem_test = true; } else if (arg == "--export") { params.export_cgraph = true; -#ifdef GGML_USE_MULMAT_TUNE +#ifdef GGML_USE_TUNE } else if (arg == "--tune") { params.tune = true; } else if (arg == "--tune-file") { @@ -354,7 +354,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.tune_file = argv[i]; -#endif // GGML_USE_MULMAT_TUNE +#endif // GGML_USE_TUNE } else if (arg == "--verbose-prompt") { params.verbose_prompt = true; } else if (arg == "-r" || arg == "--reverse-prompt") { @@ -508,7 +508,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { #endif fprintf(stderr, " --mtest compute maximum memory usage\n"); fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n"); -#ifdef 
GGML_USE_MULMAT_TUNE +#ifdef GGML_USE_TUNE fprintf(stderr, " --tune mulmat tune enable. If tune-file is set then exit after bench\n"); fprintf(stderr, " --tune-file FILE mulmat tune data file. If tune is true, then write bench result to this file, else load the file and run\n"); #endif diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 542e463bfe84e..fa243ce95cbb0 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -117,7 +117,7 @@ int main(int argc, char ** argv) { return 1; } -#ifdef GGML_USE_MULMAT_TUNE +#ifdef GGML_USE_TUNE if (params.tune || !params.tune_file.empty()) { bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str()); if (!ok || (params.tune && !params.tune_file.empty())) { diff --git a/examples/mulmat-tune/README.md b/examples/mulmat-tune/README.md index cff8a3d6467ea..df023757a85b1 100644 --- a/examples/mulmat-tune/README.md +++ b/examples/mulmat-tune/README.md @@ -23,13 +23,13 @@ run bench ahead of time (saving tens of seconds), but there are two shortcomings Makefile: ``` -make clean && LLAMA_MULMAT_TUNE=1 make +make clean && make ``` CMake (with BLAS): ``` cmake --build . --target clean -cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON +cmake .. -DLLAMA_BLAS=ON cmake --build . --config Release ``` @@ -52,13 +52,13 @@ Run examples: Makefile: ``` -make clean && LLAMA_MULMAT_TUNE=1 make +make clean && make ``` CMake (with BLAS) ``` cmake --build . --target clean -cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON +cmake .. -DLLAMA_BLAS=ON cmake --build . --config Release ``` @@ -103,22 +103,29 @@ setup properly. General steps: 1. run `./mulmat-tune -h` to see how to build for misc vendors. - you can build with `GGML_MULMAT_TUNE_NDEBUG=` to enable the the debug, e.g: + To enable the debug, comment out `-DGGML_TUNE_NDEBUG` from makefile then run: + ``` - make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1 LLAMA_NO_ACCELERATE=1 LLAMA_CLBLAST=1 make + make clean; make ``` + On `macOS`, `ACCELERATE` is enabled by default. When `ACCELERATE` is built along with `CUDA` or `CL`, you may not see `CUDA` or `CL` from debug because `CPU` - or `CPU_BLAS` is more faster (as of the estimation from mulmat tune). + or `CPU_BLAS` is more faster (as of the estimation from mulmat tune), try run + with `-t 1`? 2. create a small prompt file: + ``` head -n 5 ./models/wikitext-2-raw/wiki.valid.raw > ./models/wiki.valid-5.raw ``` + 3. run any of the following example commands. + ``` ./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 1 -b 32 ./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 4 -b 64 ``` + * `--mlock` is recommended for `macOS`, you may not want to use it. * don't change `-c 128`: too large `context size` causes 0 perplexity trunk. * `-t` is the number of threads, recommend `1`, `2`, `4` or `6`. 
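The command-line flow above is also available programmatically: `main` and `perplexity` simply call the new `llama_mulmat_tune()` API from this patch. A minimal sketch of that call from application code; the file name is an arbitrary example, and per the `--tune-file` help text, `tune=true` writes the bench result to the file while `tune=false` loads it:

```
#include <stdbool.h>
#include "llama.h"

// Bench mul-mat task profiles once and persist the result; a later run can
// pass tune=false with the same file name to just load the saved data.
static bool run_mulmat_tune(struct llama_context *ctx, int n_threads) {
#ifdef GGML_USE_TUNE
    return llama_mulmat_tune(ctx, n_threads, /*tune=*/true, "mulmat-tune.txt");
#else
    (void) ctx; (void) n_threads;
    return true;
#endif
}
```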
diff --git a/examples/mulmat-tune/mulmat-tune.cpp b/examples/mulmat-tune/mulmat-tune.cpp index 62f1da27764b9..ab3334d763870 100644 --- a/examples/mulmat-tune/mulmat-tune.cpp +++ b/examples/mulmat-tune/mulmat-tune.cpp @@ -262,8 +262,6 @@ int main(int argc, char **argv) { struct ggml_mulmat_tune_params params; memset(¶ms, 0, sizeof(struct ggml_mulmat_tune_params)); - ggml_mulmat_init_task_profiles(); - ggml_mulmat_tune_model_init(¶ms.model, model_name, ftype); params.m_num = m_num; params.n_pass = n_pass; diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 1f14c18def3a3..2cdd9db060d25 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -158,7 +158,7 @@ int main(int argc, char ** argv) { return 1; } -#ifdef GGML_USE_MULMAT_TUNE +#ifdef GGML_USE_TUNE if (params.tune || !params.tune_file.empty()){ bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str()); if (!ok || (params.tune && !params.tune_file.empty())) { diff --git a/ggml-threading.c b/ggml-threading.c index cf17793f6be61..6dd6d2817eff0 100644 --- a/ggml-threading.c +++ b/ggml-threading.c @@ -394,7 +394,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) { enum ggml_compute_error err = shared->task_runner(&state->params, state->node); - GGML_ASSERT(err == GGML_COMPUTE_OK || err == GGML_COMPUTE_FALLBACK); + GGML_ASSERT(err == GGML_COMPUTE_OK); ggml_spin_lock(&shared->spin); @@ -433,7 +433,11 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, // This is the params for main thread. struct ggml_compute_params params; - enum ggml_compute_error err; + enum ggml_compute_error err = GGML_COMPUTE_OK; + +START: + + memset(¶ms, 0, sizeof(struct ggml_compute_params)); for (int type = GGML_TASK_INIT; type <= GGML_TASK_FINALIZE; type++) { if (node->task_profile.stages[type].backend == GGML_TASK_BACKEND_NONE) { @@ -504,11 +508,19 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, } if (err != GGML_COMPUTE_OK) { + if (err == GGML_COMPUTE_FALLBACK) { + struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; + int n = ggml_get_task_profiles(node, profiles); + GGML_ASSERT(n > 0); + memcpy(&node->task_profile, &profiles[0], + sizeof(struct ggml_task_profile)); + goto START; + } return err; } } - return GGML_COMPUTE_OK; + return err; } struct ggml_threading_context * diff --git a/ggml-tune.c b/ggml-tune.c index fbca953ed469e..52ca96bf302fa 100644 --- a/ggml-tune.c +++ b/ggml-tune.c @@ -55,7 +55,7 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile( struct ggml_mulmat_tune_time profiles_time[GGML_MAX_TASK_PROFILES] = {0}; - struct ggml_task_profile *prof = NULL; + const struct ggml_task_profile *prof = NULL; if (e->M == M && e->N == N && e->K == K) { prof = e->profile; @@ -97,10 +97,7 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile( e->N = N; e->K = K; - // to disable this, build with - // `make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1 - // make` -#if !defined(GGML_MULMAT_TUNE_NDEBUG) +#ifndef GGML_TUNE_NDEBUG const char *names[3]; for (int i = 0; i < 3; i++) { names[i] = ggml_mulmat_tune_task_backend_name( @@ -163,8 +160,8 @@ void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model, bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune, struct ggml_mulmat_tune_params *params, - struct ggml_task_profile_factory *pf) { - + ggml_task_profiles_provider *profiles_provider) { + GGML_ASSERT(profiles_provider); struct 
ggml_mulmat_tune_model *model = ¶ms->model; memset(tune, 0, sizeof(struct ggml_mulmat_tune)); @@ -208,8 +205,20 @@ bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune, for (int i = 0; i < tune->n_shapes; i++) { struct ggml_mulmat_tune_shape *shape = &tune->shapes[i]; - shape->n_profiles = ggml_mulmat_get_task_profiles( - pf, shape->src0_type, shape->src1_type, &shape->profiles); + + struct ggml_tensor src0 = { + .type = shape->src0_type, + }; + struct ggml_tensor src1 = { + .type = shape->src1_type, + }; + struct ggml_tensor node = { + .op = GGML_OP_MUL_MAT, + .src0 = &src0, + .src1 = &src1, + }; + + shape->n_profiles = profiles_provider(&node, shape->profiles); if (shape->n_profiles == 0) { // allowed for testing. continue; @@ -304,9 +313,20 @@ ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune, for (int i = 0; i < tune->n_shapes; i++) { const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i]; - struct ggml_task_profile *builtin_profiles = NULL; - int n_profiles = ggml_mulmat_get_task_profiles( - NULL, shape->src0_type, shape->src1_type, &builtin_profiles); + struct ggml_tensor src0 = { + .type = shape->src0_type, + }; + struct ggml_tensor src1 = { + .type = shape->src1_type, + }; + struct ggml_tensor node = { + .op = GGML_OP_MUL_MAT, + .src0 = &src0, + .src1 = &src1, + }; + + struct ggml_task_profile builtin_profiles[GGML_MAX_TASK_PROFILES]; + int n_profiles = ggml_get_task_profiles(&node, builtin_profiles); if (n_profiles != shape->n_profiles) { snprintf(errbuf, errbuf_len - 1, "task profiles mismatch"); @@ -382,13 +402,6 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { memset(shape->items, 0, item_size); } - { - size_t sz = sizeof(struct ggml_task_profile) * shape->n_profiles; - shape->profiles = malloc(sz); - GGML_ASSERT(shape->profiles); - memset(shape->profiles, 0, sz); - } - for (int ip = 0; ip < shape->n_profiles; ip++) { struct ggml_task_profile *profile = &shape->profiles[ip]; for (int j = 0; j < 3; j++) { @@ -468,7 +481,7 @@ bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, } } - struct ggml_task_profile *profile = &shape->profiles[ip]; + const struct ggml_task_profile *profile = &shape->profiles[ip]; for (int k = 0; k < 3; k++) { if (profile->stages[k].backend != GGML_TASK_BACKEND_NONE) { rc = fprintf(fp, "%9d", item->stages_time[k]); @@ -537,7 +550,7 @@ void ggml_mulmat_tune_estimate_time( const int max_m = shape->items[m_num - 1].M; for (int ip = 0; ip < shape->n_profiles; ip++) { - struct ggml_task_profile *profile = &shape->profiles[ip]; + const struct ggml_task_profile *profile = &shape->profiles[ip]; profile_time[ip].total_time = 0; profile_time[ip].profile = profile; @@ -573,7 +586,7 @@ void ggml_mulmat_tune_estimate_time( GGML_ASSERT(p0 && p1); for (int i_stage = 0; i_stage < 3; i_stage++) { - struct ggml_task_stage *stage = &profile->stages[i_stage]; + const struct ggml_task_stage *stage = &profile->stages[i_stage]; if (stage->backend == GGML_TASK_BACKEND_NONE) { continue; } @@ -736,7 +749,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, return false; } - bool ok = ggml_mulmat_tune_init(tune, params, NULL); + bool ok = ggml_mulmat_tune_init(tune, params, ggml_get_task_profiles); if (!ok) { return false; } diff --git a/ggml-tune.h b/ggml-tune.h index 404f1f1c4a53f..04b25873c932f 100644 --- a/ggml-tune.h +++ b/ggml-tune.h @@ -46,7 +46,7 @@ struct ggml_mulmat_tune_shape { enum ggml_type src1_type; int n_profiles; - struct ggml_task_profile *profiles; + struct ggml_task_profile 
profiles[GGML_MAX_TASK_PROFILES]; int m_num; int *arr_m; @@ -69,7 +69,7 @@ struct ggml_mulmat_tune { }; struct ggml_mulmat_tune_time { - struct ggml_task_profile *profile; + const struct ggml_task_profile *profile; int stage_time[3]; int total_time; }; @@ -78,7 +78,7 @@ struct mm_cache_element { int M; int N; int K; - struct ggml_task_profile *profile; + const struct ggml_task_profile *profile; int stages_time[3]; }; @@ -108,7 +108,7 @@ void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model, bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune, struct ggml_mulmat_tune_params *params, - struct ggml_task_profile_factory *profile_factory); + ggml_task_profiles_provider *profiles_provider); void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune); diff --git a/ggml.c b/ggml.c index 5d0b83b1de198..b75f33b88d877 100644 --- a/ggml.c +++ b/ggml.c @@ -144,7 +144,7 @@ inline static void* ggml_aligned_malloc(size_t size) { #include "ggml-opencl.h" #endif -#if defined(GGML_USE_MULMAT_TUNE) +#if defined(GGML_USE_TUNE) #include "ggml-tune.h" #endif @@ -4043,8 +4043,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { ggml_cl_init(); #endif - ggml_mulmat_init_task_profiles(); - is_first_call = false; } @@ -15524,164 +15522,254 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg return result; } -// ---- mulmat task profiles ---- +// ---- task profiles ---- -static struct ggml_task_profile_factory default_task_profile_factory = {0}; +// Implement `ggml_task_profiles_provider`. +// Fill `profiles` for the `node` and return number of profiles. +// +// NOTE: the node may be incompleted from testing or tunning, so please assert +// everything used here. +inline int ggml_get_task_profiles( + struct ggml_tensor *node, + struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]) { + GGML_ASSERT(node); + GGML_ASSERT(node->op >= 0); + GGML_ASSERT(profiles); -// TODO: thread unsafe. Should be initialized once. 
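A usage note for the new provider: callers no longer reach into a global factory, they ask `ggml_get_task_profiles()` for the node's profile list and pick an entry. A minimal sketch of the common "reset to default" move, mirroring what `ggml_threading_compute_tensor()` now does on `GGML_COMPUTE_FALLBACK`:

```
#include <string.h>
#include "ggml.h"

// Pin a node to the default (plain CPU) task profile. Index 0 is always the
// CPU profile; BLAS/GPU entries follow only when those backends are built in.
static void set_default_task_profile(struct ggml_tensor *node) {
    struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES];
    int n = ggml_get_task_profiles(node, profiles);
    GGML_ASSERT(n > 0);
    memcpy(&node->task_profile, &profiles[0], sizeof(struct ggml_task_profile));
}
```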
-void ggml_mulmat_init_task_profiles(void) { - const size_t sz = sizeof(struct ggml_task_profile_factory); - memset(&default_task_profile_factory, 0, sz); + memset(profiles, 0, + sizeof(struct ggml_task_profile) * GGML_MAX_TASK_PROFILES); + + struct ggml_task_profile *p = profiles; + int n_profiles = 0; + + switch (node->op) { + case GGML_OP_CPY: + case GGML_OP_DUP: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + n_profiles = 1; + } break; + case GGML_OP_ADD: + case GGML_OP_ADD1: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_ACC: { + p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_SUB: + case GGML_OP_DIV: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_LOG: + case GGML_OP_SUM: + case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: + case GGML_OP_REPEAT: + case GGML_OP_REPEAT_BACK: + case GGML_OP_ABS: + case GGML_OP_SGN: + case GGML_OP_NEG: + case GGML_OP_STEP: + case GGML_OP_RELU: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + n_profiles = 1; + } break; + case GGML_OP_MUL: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_GELU: + case GGML_OP_SILU: + case GGML_OP_SILU_BACK: + case GGML_OP_NORM: + case GGML_OP_RMS_NORM: + case GGML_OP_RMS_NORM_BACK: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_MUL_MAT: + case GGML_OP_OUT_PROD: { + GGML_ASSERT(node->src0); + GGML_ASSERT(node->src1); + + enum ggml_type src0_t = node->src0->type; + enum ggml_type src1_t = node->src1->type; + + GGML_ASSERT(src1_t == GGML_TYPE_F32); - // f32 - { - struct ggml_task_profile *p = default_task_profile_factory.f32_f32; int i = 0; - - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].parallel = true; - i++; + if (src0_t == GGML_TYPE_F32) { + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].parallel = true; + i++; #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; - p[i].stages[1].wait = true; - i++; + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; + p[i].stages[1].wait = true; + i++; #endif #if defined(GGML_USE_CUBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; - p[i].stages[1].wait = true; - i++; + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; + p[i].stages[1].wait = true; + i++; #elif defined(GGML_USE_CLBLAST) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; - p[i].stages[1].wait = true; - i++; + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; + p[i].stages[1].wait = true; + i++; #endif - default_task_profile_factory.n_f32_f32 = i; - } - - // f16 - { - struct ggml_task_profile *p = default_task_profile_factory.f16_f32; - int i = 0; - - p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].parallel = true; - i++; + } else if (src0_t == GGML_TYPE_F16) { + p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].parallel = true; + i++; #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; - p[i].stages[1].wait = true; - i++; + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; + p[i].stages[1].wait = true; + i++; #endif #if 
defined(GGML_USE_CUBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; - p[i].stages[1].wait = true; - i++; + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; + p[i].stages[1].wait = true; + i++; #elif defined(GGML_USE_CLBLAST) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; - p[i].stages[1].wait = true; - i++; + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; + p[i].stages[1].wait = true; + i++; #endif - default_task_profile_factory.n_f16_f32 = i; - } - - // qxx - { - struct ggml_task_profile *p = default_task_profile_factory.qxx_f32; - int i = 0; - - p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].parallel = true; - i++; + } else if (ggml_is_quantized(src0_t)) { + p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].parallel = true; + i++; #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[0].parallel = true; - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; - p[i].stages[1].wait = true; - i++; + p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[0].parallel = true; + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; + p[i].stages[1].wait = true; + i++; #endif #if defined(GGML_USE_CUBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; - p[i].stages[1].wait = true; - i++; + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; + p[i].stages[1].wait = true; + i++; #elif defined(GGML_USE_CLBLAST) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; - p[i].stages[1].wait = true; - i++; + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; + p[i].stages[1].wait = true; + i++; #endif - default_task_profile_factory.n_qxx_f32 = i; - } -} - -int ggml_mulmat_get_task_profiles(struct ggml_task_profile_factory *pf, - enum ggml_type src0_t, enum ggml_type src1_t, - struct ggml_task_profile **profiles) { - GGML_ASSERT(profiles); - - if (pf == NULL) { - pf = &default_task_profile_factory; - } - - GGML_ASSERT(src1_t == GGML_TYPE_F32); - - if (src0_t == GGML_TYPE_F32) { - *profiles = pf->f32_f32; - return pf->n_f32_f32; - } - - if (src0_t == GGML_TYPE_F16) { - *profiles = pf->f16_f32; - return pf->n_f16_f32; - } - - if (ggml_is_quantized(src0_t)) { - *profiles = pf->qxx_f32; - return pf->n_qxx_f32; - } - - GGML_ASSERT(false); -} - -static const struct ggml_task_profile * -ggml_mulmat_get_default_task_profile(struct ggml_task_profile_factory *pf, - enum ggml_type src0_type, - enum ggml_type src1_type) { - GGML_ASSERT(src1_type == GGML_TYPE_F32); - if (pf == NULL) { - pf = &default_task_profile_factory; - } - - struct ggml_task_profile *p = NULL; - - if (src0_type == GGML_TYPE_F32) { - p = &pf->f32_f32[0]; - } else if (src0_type == GGML_TYPE_F16) { - p = &pf->f16_f32[0]; - } else if (ggml_is_quantized(src0_type)) { - p = &pf->qxx_f32[0]; - } else { + } + n_profiles = i; + } break; + case GGML_OP_SCALE: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_SET: { + p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + n_profiles = 1; + } break; + case GGML_OP_CONT: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_GET_ROWS: + case GGML_OP_GET_ROWS_BACK: + case GGML_OP_DIAG: + case GGML_OP_DIAG_MASK_ZERO: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + n_profiles = 1; + } 
break; + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX: + case GGML_OP_SOFT_MAX_BACK: + case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_ALIBI: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + n_profiles = 1; + } break; + case GGML_OP_CLAMP: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + n_profiles = 1; + } break; + case GGML_OP_CONV_1D_1S: + case GGML_OP_CONV_1D_2S: { + p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_FLASH_ATTN: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_FLASH_FF: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } + case GGML_OP_FLASH_ATTN_BACK: { + p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_MAP_UNARY: + case GGML_OP_MAP_BINARY: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + n_profiles = 1; + } break; + case GGML_OP_CROSS_ENTROPY_LOSS: + p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + p[0].stages[2].backend = GGML_TASK_BACKEND_CPU; + n_profiles = 1; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_NONE: + case GGML_OP_COUNT: { + GGML_ASSERT(false); + } break; + default: GGML_ASSERT(false); } - for (int i = 0; i < 3; i++) { - GGML_ASSERT(p->stages[i].backend == GGML_TASK_BACKEND_CPU || - p->stages[i].backend == GGML_TASK_BACKEND_NONE); - } - - return p; + GGML_ASSERT(n_profiles > 0 && n_profiles <= GGML_MAX_TASK_PROFILES); + return n_profiles; } // Set task profile for GGML_OP_MUL_MAT or GGML_OP_OUT_PROD. -static void ggml_mulmat_set_tensor_task_profile(struct ggml_tensor *node, - struct ggml_mulmat_tune *tune) { +static const struct ggml_task_profile *ggml_mulmat_get_task_profile( + struct ggml_tensor *node, struct ggml_task_profile *profiles, + int n_profiles, struct ggml_mulmat_tune *tune, int stages_time_us[3]) { + GGML_ASSERT(node); GGML_ASSERT(node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_OUT_PROD); + GGML_ASSERT(profiles); + GGML_ASSERT(n_profiles >= 2); enum ggml_type src0_t = node->src0->type; enum ggml_type src1_t = node->src1->type; @@ -15697,42 +15785,26 @@ static void ggml_mulmat_set_tensor_task_profile(struct ggml_tensor *node, int N = (int)node->ne[0]; int K = (int)node->src1->ne[0]; - struct ggml_task_profile *profiles = NULL; - int n_profiles = ggml_mulmat_get_task_profiles(NULL, src0_t, src1_t, &profiles); - GGML_ASSERT(n_profiles >= 2); - GGML_ASSERT(profiles); - const struct ggml_task_profile *prof = NULL; if (cond_match) { -#if defined(GGML_USE_MULMAT_TUNE) +#if defined(GGML_USE_TUNE) if (tune != NULL) { - int stages_time_us[3]; - prof = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, src1_t, stages_time_us); + prof = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, + src1_t, stages_time_us); if (prof != NULL) { - GGML_ASSERT(prof); - memcpy(&node->task_profile, prof, sizeof(struct ggml_task_profile)); - // Do not wait if the estimated execution time is too small (e.g. 
less than 0.1 ms) - // TODO: need bench actual wait/notify time, see ggml-threading.c - for (int i = 0; i < 3; i++) { - if (node->task_profile.stages[i].wait) { - if (stages_time_us[i] < 100) { - node->task_profile.stages[i].wait = false; - } - } - } - return; + return prof; } } #else UNUSED(tune); + UNUSED(stages_time_us); #endif if (prof == NULL && M >= 32 && N >= 32 && K >= 32) { for (int j = 0; j < n_profiles; j++) { enum ggml_task_backend comp_be = profiles[j].stages[GGML_TASK_COMPUTE].backend; - switch (comp_be) { case GGML_TASK_BACKEND_GPU_CUDA: { GGML_ASSERT(ggml_cpu_has_cublas()); @@ -15753,76 +15825,131 @@ static void ggml_mulmat_set_tensor_task_profile(struct ggml_tensor *node, break; } } + + if (prof) { + break; + } } } } if (prof == NULL) { - prof = ggml_mulmat_get_default_task_profile(NULL, src0_t, src1_t); + prof = &profiles[0]; + GGML_ASSERT(prof->stages[1].backend == GGML_TASK_BACKEND_CPU); } - GGML_ASSERT(prof); - memcpy(&node->task_profile, prof, sizeof(struct ggml_task_profile)); + return prof; } void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { int n_threads = cgraph->n_threads; - if (ggml_cpu_has_blas()) { + struct ggml_threading_context *thrd_ctx = ggml_threading_start( + n_threads, ggml_threading_graph_compute_thread, ggml_compute_forward, + GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL); + + // initialize tasks + work buffer + { + // int64_t t0 = ggml_time_us(); + + size_t work_size = 0; + + struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; + + // thread scheduling for the different operations for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor *node = cgraph->nodes[i]; + struct ggml_tensor * node = cgraph->nodes[i]; + if (node->op == GGML_OP_NONE || node->op == GGML_OP_CONT) { + continue; + } - memset(&node->task_profile, 0, sizeof(struct ggml_task_profile)); - struct ggml_task_stage *stages = node->task_profile.stages; + int n_profiles = ggml_get_task_profiles(node, profiles); - // Adapt node->backend: assume GPU at COMPUTE stage. - if (node->backend > GGML_BACKEND_CPU) { - stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_NONE; - stages[GGML_TASK_FINALIZE].backend = GGML_TASK_BACKEND_NONE; + const struct ggml_task_profile *profile = NULL; - stages[GGML_TASK_COMPUTE].parallel = false; - bool wait = (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL); - stages[GGML_TASK_COMPUTE].wait = wait; + // Adapt node->backend: assume GPU at COMPUTE stage. 
+ if (node->backend == GGML_BACKEND_GPU || + node->backend == GGML_BACKEND_GPU_SPLIT) { + enum ggml_task_backend be; if (ggml_cpu_has_cublas()) { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_GPU_CUDA; + be = GGML_TASK_BACKEND_GPU_CUDA; } else if (ggml_cpu_has_clblast()) { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_GPU_CL; + be = GGML_TASK_BACKEND_GPU_CL; } else { GGML_ASSERT(false); } - } else if (node->op == GGML_OP_MUL_MAT) { - struct ggml_mulmat_tune * tune = NULL; -#if defined(GGML_USE_MULMAT_TUNE) - tune = cgraph->tune; -#endif - ggml_mulmat_set_tensor_task_profile(node, tune); - } else if (node->op == GGML_OP_OUT_PROD) { - ggml_mulmat_set_tensor_task_profile(node, NULL); + + for (int j = 0; j < n_profiles; j++) { + if (profiles[j].stages[1].backend == be) { + profile = &profiles[j]; + break; + } + } + GGML_ASSERT(profile); + } else { + GGML_ASSERT(node->backend == GGML_BACKEND_CPU); } - } - } - struct ggml_threading_context *thrd_ctx = ggml_threading_start( - n_threads, ggml_threading_graph_compute_thread, ggml_compute_forward, - GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL); + bool profile_copied = false; - // initialize tasks + work buffer - { - size_t work_size = 0; + if (node->op == GGML_OP_MUL_MAT) { +#if defined(GGML_USE_TUNE) + int stages_time_us[3]; + profile = ggml_mulmat_get_task_profile( + node, profiles, n_profiles, cgraph->tune, stages_time_us); + GGML_ASSERT(profile); + + if (cgraph->tune) { + memcpy(&node->task_profile, profile, + sizeof(struct ggml_task_profile)); + profile_copied = true; + + // Do not wait if the estimated execution time is too small + // (e.g. less than 0.1 ms) + // TODO: need bench actual wait/notify time, see + // ggml-threading.c + for (int j = 0; j< 3; j++) { + if (node->task_profile.stages[j].wait) { + if (stages_time_us[j] < 100) { + node->task_profile.stages[j].wait = false; + } + } + } + } +#else + profile = ggml_mulmat_get_task_profile(node, profiles, + n_profiles, NULL, NULL); + GGML_ASSERT(profile); +#endif + } else if (node->op == GGML_OP_OUT_PROD) { // FIXME: is is right? + profile = ggml_mulmat_get_task_profile(node, profiles, + n_profiles, NULL, NULL); + GGML_ASSERT(profile); + } else { + profile = &profiles[0]; + GGML_ASSERT(profile->stages[1].backend == + GGML_TASK_BACKEND_CPU); + } + + if (!profile_copied) { + memcpy(&node->task_profile, profile, + sizeof(struct ggml_task_profile)); + } - // thread scheduling for the different operations - for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * node = cgraph->nodes[i]; struct ggml_task_stage *stages = node->task_profile.stages; + // compute stage n_tasks. + int n_tasks = stages[1].parallel ? n_threads : 1; + + // Allocate temp buffer `wdata` for CPU. + // NOTE: GPU MAY fallback to CPU, so we have to cover all possible cases. 
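For orientation before the per-op work-size switch below, a sketch of what typically ends up in `node->task_profile` for a quantized src0 x F32 src1 mat-mul on a CPU-BLAS build (the second profile filled by the quantized branch of `ggml_get_task_profiles()` above): INIT runs in parallel on the CPU (presumably the de-quantization), COMPUTE is single-threaded BLAS with idle workers hinted to wait, FINALIZE is unused.

```
// Illustrative only: equivalent to profiles[1] returned by
// ggml_get_task_profiles() for a quantized mul-mat when
// Accelerate/OpenBLAS is compiled in.
static const struct ggml_task_profile qxx_f32_blas_profile = {
    .stages = {
        [GGML_TASK_INIT]     = { .backend = GGML_TASK_BACKEND_CPU,      .parallel = true },
        [GGML_TASK_COMPUTE]  = { .backend = GGML_TASK_BACKEND_CPU_BLAS, .wait     = true },
        [GGML_TASK_FINALIZE] = { .backend = GGML_TASK_BACKEND_NONE },
    },
};
```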
switch (node->op) { case GGML_OP_CPY: case GGML_OP_DUP: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; size_t cur = 0; if (ggml_is_quantized(node->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads; + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks; } work_size = MAX(work_size, cur); @@ -15830,27 +15957,20 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_ADD: case GGML_OP_ADD1: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; - size_t cur = 0; if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads; + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_tasks; } work_size = MAX(work_size, cur); } break; case GGML_OP_ACC: { - stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; - size_t cur = 0; if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_threads; + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_tasks; } work_size = MAX(work_size, cur); @@ -15870,16 +15990,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_NEG: case GGML_OP_STEP: case GGML_OP_RELU: - { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - } break; case GGML_OP_MUL: - { - if (stages[GGML_TASK_COMPUTE].backend == GGML_TASK_BACKEND_NONE) { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; - } - } break; case GGML_OP_GELU: case GGML_OP_SILU: case GGML_OP_SILU_BACK: @@ -15887,28 +15998,14 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM_BACK: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; } break; case GGML_OP_MUL_MAT: - case GGML_OP_OUT_PROD: + case GGML_OP_OUT_PROD: // FIXME: is is right? { size_t cur = 0; enum ggml_task_backend comp_backend = stages[GGML_TASK_COMPUTE].backend; GGML_ASSERT(comp_backend != GGML_TASK_BACKEND_NONE); - // TODO: remove this check once we are sure `ggml_mulmat_set_tensor_task_profile()` is correct. - if ((comp_backend & GGML_TASK_BACKEND_GPU) || comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { - enum ggml_type src0_t = node->src0->type; - enum ggml_type src1_t = node->src1->type; - bool cond_match = (src0_t == GGML_TYPE_F32 || src0_t == GGML_TYPE_F16 || - ggml_is_quantized(src0_t)) && - src1_t == GGML_TYPE_F32 && node->type == GGML_TYPE_F32 && - ggml_is_contiguous(node->src0) && - ggml_is_contiguous(node->src1); - GGML_ASSERT(cond_match); - } - if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { #if defined(GGML_USE_CLBLAST) cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); @@ -15930,7 +16027,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) GGML_ASSERT(false); } } else if (comp_backend == GGML_TASK_BACKEND_CPU || comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { - // We have to reseve buffer for CUDA because it may fallback to CPU. 
if (comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { GGML_ASSERT(ggml_cpu_has_cublas()); } @@ -15955,13 +16051,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_SCALE: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; } break; case GGML_OP_SET: { - stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_CONT: case GGML_OP_RESHAPE: @@ -15972,33 +16064,18 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_GET_ROWS_BACK: case GGML_OP_DIAG: case GGML_OP_DIAG_MASK_ZERO: - { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - } break; case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX_BACK: case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: - { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; - } break; case GGML_OP_ALIBI: - { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - } break; case GGML_OP_CLAMP: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_CONV_1D_1S: case GGML_OP_CONV_1D_2S: { - stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; - GGML_ASSERT(node->src0->ne[3] == 1); GGML_ASSERT(node->src1->ne[2] == 1); GGML_ASSERT(node->src1->ne[3] == 1); @@ -16026,62 +16103,53 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_FLASH_ATTN: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; - size_t cur = 0; const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*ne11*n_threads; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*n_threads; // this is overestimated by x2 + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 } if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*ne11*n_threads; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*n_threads; // this is overestimated by x2 + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 } work_size = MAX(work_size, cur); } break; case GGML_OP_FLASH_FF: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; size_t cur = 0; if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*node->src1->ne[1]*n_threads; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*n_threads; // this is overestimated by x2 + cur = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2 } if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*node->src1->ne[1]*n_threads; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*n_threads; // this is overestimated by x2 + cur = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2 } work_size = MAX(work_size, cur); } break; case 
GGML_OP_FLASH_ATTN_BACK: { - stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; - size_t cur = 0; const int64_t D = node->src0->ne[0]; const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*mxDn*n_threads; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*n_threads; // this is overestimated by x2 + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 } if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*mxDn*n_threads; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*n_threads; // this is overestimated by x2 + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 } work_size = MAX(work_size, cur); @@ -16089,31 +16157,21 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_CROSS_ENTROPY_LOSS: { - stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; - stages[GGML_TASK_FINALIZE].backend = GGML_TASK_BACKEND_CPU; - - size_t cur = ggml_type_size(node->type)*(n_threads + node->src0->ne[0]*n_threads); + size_t cur = ggml_type_size(node->type)*(n_threads + node->src0->ne[0]*n_tasks); work_size = MAX(work_size, cur); } break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; - - size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_threads; + size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_tasks; work_size = MAX(work_size, cur); } break; case GGML_OP_NONE: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_COUNT: { @@ -16134,6 +16192,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size); cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size); } + + // ~ 50 us + //printf("=== prepare computing took %d us\n", (int)(ggml_time_us() - t0)); } const int64_t perf_start_cycles = ggml_perf_cycles(); @@ -16162,16 +16223,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) enum ggml_compute_error err = ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize); - if (err == GGML_COMPUTE_FALLBACK) { - if (node->op == GGML_OP_MUL_MAT) { - const struct ggml_task_profile *p = - ggml_mulmat_get_default_task_profile( - NULL, node->src0->type, node->src1->type); - memcpy(&node->task_profile, p, - sizeof(struct ggml_task_profile)); - } - err = ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize); - } GGML_ASSERT(err == GGML_COMPUTE_OK); // performance stats (node) diff --git a/ggml.h b/ggml.h index f51b658fd3abe..5ab78c4a011b5 100644 --- a/ggml.h +++ b/ggml.h @@ -202,7 +202,7 @@ #define GGML_MAX_OPT 4 #define GGML_MAX_NAME 32 #define GGML_DEFAULT_N_THREADS 4 -#define GGML_MAX_TASK_PROFILES 8 +#define GGML_MAX_TASK_PROFILES 
4 #define GGML_ASSERT(x) \ do { \ @@ -399,17 +399,6 @@ extern "C" { uint8_t dev_flags[4]; }; - struct ggml_task_profile_factory { - struct ggml_task_profile f32_f32[GGML_MAX_TASK_PROFILES]; - int n_f32_f32; - - struct ggml_task_profile f16_f32[GGML_MAX_TASK_PROFILES]; - int n_f16_f32; - - struct ggml_task_profile qxx_f32[GGML_MAX_TASK_PROFILES]; - int n_qxx_f32; - }; - // n-dimensional tensor struct ggml_tensor { enum ggml_type type; @@ -450,6 +439,11 @@ extern "C" { char padding[12]; }; + // Fill `profiles` for the `node` and return number of profiles. + typedef int (ggml_task_profiles_provider) ( + struct ggml_tensor *node, + struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]); + static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); // computation graph @@ -1345,15 +1339,11 @@ extern "C" { GGML_API int ggml_cpu_has_vsx (void); // - // mulmat task profiles + // task profiles // - GGML_API void ggml_mulmat_init_task_profiles(void); - GGML_API int ggml_mulmat_get_task_profiles( - struct ggml_task_profile_factory *pf, - enum ggml_type src0_t, - enum ggml_type src1_t, - struct ggml_task_profile **profiles); + // Implements `ggml_task_profiles_provider`. + GGML_API int ggml_get_task_profiles (struct ggml_tensor *node, struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]); // // Internal types and functions exposed for tests and benchmarks diff --git a/llama.cpp b/llama.cpp index fa5a94e21f0ab..acc0e59f71dbf 100644 --- a/llama.cpp +++ b/llama.cpp @@ -21,7 +21,7 @@ #include "ggml-metal.h" #endif -#ifdef GGML_USE_MULMAT_TUNE +#ifdef GGML_USE_TUNE #include "ggml-tune.h" #endif @@ -285,7 +285,7 @@ struct llama_context { int buf_last = 0; size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; -#ifdef GGML_USE_MULMAT_TUNE +#ifdef GGML_USE_TUNE struct ggml_mulmat_tune *tune = nullptr; #endif @@ -1408,7 +1408,7 @@ static bool llama_eval_internal( ggml_cgraph gf = {}; gf.n_threads = n_threads; -#ifdef GGML_USE_MULMAT_TUNE +#ifdef GGML_USE_TUNE gf.tune =lctx.tune; #endif @@ -2743,7 +2743,7 @@ struct llama_context * llama_init_from_file( return ctx; } -#ifdef GGML_USE_MULMAT_TUNE +#ifdef GGML_USE_TUNE bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) { printf("\n"); if (ctx->model.n_gpu_layers != 0) { @@ -2882,7 +2882,7 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons #endif void llama_free(struct llama_context * ctx) { -#ifdef GGML_USE_MULMAT_TUNE +#ifdef GGML_USE_TUNE if (ctx->tune) { delete(ctx->tune); } diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c index ed9d8aa2bcdd8..deb15fd84d8e7 100644 --- a/tests/test-ggml-threading.c +++ b/tests/test-ggml-threading.c @@ -356,6 +356,17 @@ int main(void) { { ++n_tests; + // required by getting task profiles. + node.op = GGML_OP_MUL_MAT; + struct ggml_tensor src0 = { + .type = GGML_TYPE_Q4_0, + }; + struct ggml_tensor src1 = { + .type = GGML_TYPE_F32, + }; + node.src0 = &src0; + node.src1 = &src1; + node.backend = GGML_BACKEND_GPU; if (test_fallback(&node) == 0) { ++n_passed; diff --git a/tests/test-ggml-tune.c b/tests/test-ggml-tune.c index ed612fff45562..e0a6950d9502b 100644 --- a/tests/test-ggml-tune.c +++ b/tests/test-ggml-tune.c @@ -3,6 +3,8 @@ #include +#define UNUSED(x) (void)(x) + static int bench(void); static int estimate_time_non_zero_NK(void); @@ -77,6 +79,20 @@ static int bench(void) { return ok ? 
0 : 1; } +// implement `ggml_task_profiles_provider` +static int +ggml_task_profiles_mock_qxx_provider(struct ggml_tensor *node, + struct ggml_task_profile *profiles) { + UNUSED(node); + profiles[0].stages[0].backend = GGML_TASK_BACKEND_CPU; + profiles[0].stages[0].backend = GGML_TASK_BACKEND_CPU; + profiles[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + profiles[1].stages[0].backend = GGML_TASK_BACKEND_CPU; + profiles[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; + + return 2; +} + int estimate_time_non_zero_NK(void) { printf("test: %s\n", __func__); @@ -92,22 +108,10 @@ int estimate_time_non_zero_NK(void) { const int m_num = 2; - struct ggml_task_profile_factory pf; - memset(&pf, 0, sizeof(struct ggml_task_profile_factory)); - - { - pf.n_qxx_f32 = 2; - pf.qxx_f32[0].stages[0].backend = GGML_TASK_BACKEND_CPU; - pf.qxx_f32[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - - pf.qxx_f32[1].stages[0].backend = GGML_TASK_BACKEND_CPU; - pf.qxx_f32[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; - } - struct ggml_mulmat_tune_params params; init_params(¶ms, m_num); - ggml_mulmat_tune_init(&tune, ¶ms, &pf); + ggml_mulmat_tune_init(&tune, ¶ms, ggml_task_profiles_mock_qxx_provider); struct ggml_mulmat_tune_shape *shape = NULL; for (int i = 0; i < tune.n_shapes; i++) { From 91062322604974d9afeab58618d88282cafedafd Mon Sep 17 00:00:00 2001 From: mqy Date: Thu, 15 Jun 2023 07:19:00 +0800 Subject: [PATCH 04/24] threading test: At github, Windows can take more than 20 seconds to start 15 threads.Let's silently ignore when we saw two adjacent slowness. --- tests/test-ggml-threading.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c index deb15fd84d8e7..90d53e4cdea2a 100644 --- a/tests/test-ggml-threading.c +++ b/tests/test-ggml-threading.c @@ -217,9 +217,19 @@ int main(void) { int workload_arr_len = sizeof(workload_arr) / sizeof(workload_arr[0]); // skip slow/big n_threads. + + int n_slow = 0; + for (int i = 0; i < threads_arr_len; i++) { int n_threads = threads_arr[i]; + // At github, Windows can take more than 20 seconds to start 15 threads. + // Let's silently ignore when we saw two adjacent slowness. + if (n_slow >= 2) { + threads_arr[i] = 0; + continue; + } + if (n_threads == 1) { continue; } else if (n_threads > MAX_N_THREADS) { @@ -243,10 +253,14 @@ int main(void) { int elapsed_us = t1 - t0; if (elapsed_us > 500 * n_threads) { - printf("[test-ggml-threading] warning: it took took %.3f " - "ms to start %d worker thread(s). Loo slow, skip.\n", + printf("[test-ggml-threading] warning: it took took %7.3f " + "ms to start %2d worker thread(s). Too slow, skip.\n", 1.0 * elapsed_us / 1000, n_threads - 1); threads_arr[i] = 0; + ++n_slow; + } else { + // clear. 
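                // (Editorial comment, not part of the original patch.) Any fast
                // start resets the counter, so the `n_slow >= 2` early-continue at
                // the top of this loop only fires after two slow starts in a row.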
+ n_slow = 0; } } From bb590f14826ccfd0f182d005a3fe6333da14017f Mon Sep 17 00:00:00 2001 From: mqy Date: Thu, 15 Jun 2023 08:28:39 +0800 Subject: [PATCH 05/24] Workrounnd to set node->backend --- ggml-opencl.cpp | 4 ++-- ggml.c | 13 +++++++++++++ tests/test-ggml-tune.c | 1 - 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index b2300a104ddb2..c9151a8e49561 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1599,8 +1599,8 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens // TODO: find the optimal values for these if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32 && - dst->type == GGML_TYPE_F32 && - ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) { + dst->type == GGML_TYPE_F32 /*&& + ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)*/) { return true; } diff --git a/ggml.c b/ggml.c index b75f33b88d877..b734f1a0c4d46 100644 --- a/ggml.c +++ b/ggml.c @@ -15938,6 +15938,18 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) struct ggml_task_stage *stages = node->task_profile.stages; + // Workrounnd to set node->backend. + for (int j = 0; j < 3; j++) { + if (node->backend == GGML_BACKEND_CPU && + (stages[j].backend & GGML_TASK_BACKEND_GPU)) { + if (ggml_cpu_has_cublas() || ggml_cpu_has_clblast()) { + node->backend = GGML_BACKEND_GPU; + } else { + GGML_ASSERT(false); + } + } + } + // compute stage n_tasks. int n_tasks = stages[1].parallel ? n_threads : 1; @@ -16008,6 +16020,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { #if defined(GGML_USE_CLBLAST) + GGML_ASSERT(ggml_cl_can_mul_mat(node->src0, node->src1, node)); cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); #else GGML_ASSERT(false); diff --git a/tests/test-ggml-tune.c b/tests/test-ggml-tune.c index e0a6950d9502b..913d25ff56ff4 100644 --- a/tests/test-ggml-tune.c +++ b/tests/test-ggml-tune.c @@ -85,7 +85,6 @@ ggml_task_profiles_mock_qxx_provider(struct ggml_tensor *node, struct ggml_task_profile *profiles) { UNUSED(node); profiles[0].stages[0].backend = GGML_TASK_BACKEND_CPU; - profiles[0].stages[0].backend = GGML_TASK_BACKEND_CPU; profiles[0].stages[1].backend = GGML_TASK_BACKEND_CPU; profiles[1].stages[0].backend = GGML_TASK_BACKEND_CPU; profiles[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; From 7c05049f8b0ba6e090e5a9d5bb11a6d4c74e4a3f Mon Sep 17 00:00:00 2001 From: mqy Date: Thu, 15 Jun 2023 14:06:11 +0800 Subject: [PATCH 06/24] tunning: check GPU offloading before loading model --- examples/common.cpp | 8 ++++++++ examples/perplexity/perplexity.cpp | 2 +- ggml-tune.c | 22 +++++++++++----------- llama.cpp | 12 +++++------- 4 files changed, 25 insertions(+), 19 deletions(-) diff --git a/examples/common.cpp b/examples/common.cpp index fd6df49477949..09ce484a12a11 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -435,6 +435,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { process_escapes(params.prompt); } +#ifdef GGML_USE_TUNE + if (params.n_gpu_layers > 0) { + if (params.tune || !params.tune_file.empty()) { + fprintf(stderr, "[tune] error: tunning and GPU offloading cannot be used at the same time, abort.\n"); + exit(1); + } + } +#endif return true; } diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 
2cdd9db060d25..4732205160836 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -159,7 +159,7 @@ int main(int argc, char ** argv) { } #ifdef GGML_USE_TUNE - if (params.tune || !params.tune_file.empty()){ + if (params.tune || !params.tune_file.empty()) { bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str()); if (!ok || (params.tune && !params.tune_file.empty())) { llama_free(ctx); diff --git a/ggml-tune.c b/ggml-tune.c index 52ca96bf302fa..0a52443e4685c 100644 --- a/ggml-tune.c +++ b/ggml-tune.c @@ -104,7 +104,7 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile( prof->stages[i].backend); } printf( - "\n[mulmat tune] M: %3d, N: %5d, K: %5d, backends of the " + "\n[tune] M: %3d, N: %5d, K: %5d, backends of the " "fastest profile: %s %s %s\n", M, N, K, names[0], names[1], names[2]); #endif @@ -358,7 +358,7 @@ bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune, bool ok = ggml_mulmat_tune_validate_internal(tune, model, ftype, n_threads, errbuf, sizeof(errbuf)); if (!ok) { - fprintf(stderr, "[mulmat tune] error: %s. run bench again.\n", errbuf); + fprintf(stderr, "[tune] error: %s. run bench again.\n", errbuf); } return ok; @@ -371,7 +371,7 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { } if (tune->version != GGML_MULMAT_TUNE_VERSION) { - fprintf(stderr, "[mulmat tune] version mismatch, run bench again\n"); + fprintf(stderr, "[tune] version mismatch, run bench again\n"); return false; } @@ -396,7 +396,7 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { (shape->n_profiles * shape->m_num); shape->items = malloc(item_size); if (shape->items == NULL) { - fprintf(stderr, "[mulmat tune] failed to allocate memory\n"); + fprintf(stderr, "[tune] failed to allocate memory\n"); return false; } memset(shape->items, 0, item_size); @@ -708,7 +708,7 @@ static size_t ggml_mulmat_allocate_wdata(int N, int K, char **wdata) { if (!buf) { fprintf(stderr, - "[mulmat tune] error: failed to allocate %zu MiB memory", + "[tune] error: failed to allocate %zu MiB memory", sz / 1024 / 1024); return 0; } @@ -745,7 +745,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends); if (n_backends < 2) { fprintf(stderr, - "[mulmat tune] error: this program was not built with BLAS.\n"); + "[tune] error: this program was not built with BLAS.\n"); return false; } @@ -770,7 +770,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, } fprintf(stdout, - "[mulmat tune] model: %s, ggml ftype: %d, " + "[tune] model: %s, ggml ftype: %d, " "n_pass: %d, n_threads: %d, n_shapes: %d, backends: %s\n", params->model.name, params->model.ftype, params->n_pass, params->n_threads, tune->n_shapes, buf); @@ -871,7 +871,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, ggml_threading_stop(thrd_ctx); - fprintf(stdout, "[mulmat tune] done, elapsed time: %d seconds.\n", + fprintf(stdout, "[tune] done, elapsed time: %d seconds.\n", (int)(ggml_time_ms() - t0) / 1000); // output @@ -880,7 +880,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, FILE *fp = fopen(params->fname, "w"); if (!fp) { fprintf(stderr, - "[mulmat tune] warn: failed to open file `%s`, print to " + "[tune] warn: failed to open file `%s`, print to " "console instead\n\n", params->fname); params->output_console = 1; @@ -889,12 +889,12 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, fclose(fp); if 
(ok) { - fprintf(stdout, "[mulmat tune] data was written to `%s`\n", + fprintf(stdout, "[tune] data was written to `%s`\n", params->fname); } else { fprintf( stderr, - "[mulmat tune] warn: failed to write file `%s`, print to " + "[tune] warn: failed to write file `%s`, print to " "console instead\n\n", params->fname); params->output_console = 1; diff --git a/llama.cpp b/llama.cpp index acc0e59f71dbf..06555e1dddeb2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2745,11 +2745,9 @@ struct llama_context * llama_init_from_file( #ifdef GGML_USE_TUNE bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) { + GGML_ASSERT (ctx->model.n_gpu_layers == 0); + printf("\n"); - if (ctx->model.n_gpu_layers != 0) { - fprintf(stderr, "[mulmat tune] error: is disabled by GPU offloading\n"); - return false; - } const char *model_name = llama_model_type_name(ctx->model.type); @@ -2855,7 +2853,7 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons if (!empty_fname) { FILE *fp = fopen(fname, "r"); if (!fp) { - fprintf(stderr, "[mulmat tune] failed to open file %s.\n", + fprintf(stderr, "[tune] failed to open file %s.\n", fname); } else { bool ok = ggml_mulmat_tune_read_data(ctx->tune, fp); @@ -2863,12 +2861,12 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons if (!ok) { fprintf(stderr, - "[mulmat tune] failed to read data from %s\n", + "[tune] failed to read data from %s\n", fname); return false; } - fprintf(stderr, "[mulmat tune] loaded data from %s\n", fname); + fprintf(stderr, "[tune] loaded data from %s\n", fname); ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype, params.n_threads); if (!ok) { From 21e9379707ca62e462d4c1a07876d6b9435d5412 Mon Sep 17 00:00:00 2001 From: mqy Date: Thu, 15 Jun 2023 15:57:31 +0800 Subject: [PATCH 07/24] tunning: add f16, todo: f32 failed with CL --- ggml-tune.c | 24 +++++++-------- tests/test-ggml-tune.c | 66 +++++++++++++++++++++++++++++++----------- 2 files changed, 60 insertions(+), 30 deletions(-) diff --git a/ggml-tune.c b/ggml-tune.c index 0a52443e4685c..81b012766124c 100644 --- a/ggml-tune.c +++ b/ggml-tune.c @@ -103,10 +103,9 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile( names[i] = ggml_mulmat_tune_task_backend_name( prof->stages[i].backend); } - printf( - "\n[tune] M: %3d, N: %5d, K: %5d, backends of the " - "fastest profile: %s %s %s\n", - M, N, K, names[0], names[1], names[2]); + printf("\n[tune] M: %3d, N: %5d, K: %5d, backends of the " + "fastest profile: %s %s %s\n", + M, N, K, names[0], names[1], names[2]); #endif } } @@ -707,8 +706,7 @@ static size_t ggml_mulmat_allocate_wdata(int N, int K, char **wdata) { void *buf = malloc(sz); if (!buf) { - fprintf(stderr, - "[tune] error: failed to allocate %zu MiB memory", + fprintf(stderr, "[tune] error: failed to allocate %zu MiB memory", sz / 1024 / 1024); return 0; } @@ -835,8 +833,9 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, stages_time[j] = 0; } - /*enum ggml_compute_error err = */ - ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize); + enum ggml_compute_error err = ggml_threading_compute_tensor( + thrd_ctx, node, wdata, wsize); + GGML_ASSERT(err == GGML_COMPUTE_OK); for (int i = 0; i < 3; i++) { int v = (int)stages_time[i]; @@ -892,11 +891,10 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, fprintf(stdout, "[tune] data was written to `%s`\n", params->fname); } else { - fprintf( - stderr, - "[tune] warn: failed to write file `%s`, print 
to " - "console instead\n\n", - params->fname); + fprintf(stderr, + "[tune] warn: failed to write file `%s`, print to " + "console instead\n\n", + params->fname); params->output_console = 1; } } diff --git a/tests/test-ggml-tune.c b/tests/test-ggml-tune.c index 913d25ff56ff4..a8a2048621eca 100644 --- a/tests/test-ggml-tune.c +++ b/tests/test-ggml-tune.c @@ -8,12 +8,13 @@ static int bench(void); static int estimate_time_non_zero_NK(void); -static void init_params(struct ggml_mulmat_tune_params *params, int m_num) { +static void init_params(struct ggml_mulmat_tune_params *params, + enum ggml_ftype ftype, int m_num, int n_threads) { *params = (struct ggml_mulmat_tune_params){ .model = (struct ggml_mulmat_tune_model){ - .name = "3B", // fake - .ftype = GGML_FTYPE_MOSTLY_Q4_0, + .name = "xB", // fake model name + .ftype = ftype, .n_vocab = 4096, .n_embd = 1024, .n_ff = 2048, @@ -21,7 +22,7 @@ static void init_params(struct ggml_mulmat_tune_params *params, int m_num) { }, .m_num = m_num, .n_pass = 1, - .n_threads = 1, + .n_threads = n_threads, .progress = false, .output_console = true, .fname = NULL}; @@ -45,13 +46,11 @@ int main(void) { } static int bench(void) { - printf("test: %s\n", __func__); - { enum ggml_task_backend backends[16]; int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends); if (n_backends < 2) { - printf("test: %s, skipped because no BLAS\n", __func__); + printf("[test-ggml-tune] skipped because no BLAS\n"); return 0; } } @@ -67,16 +66,48 @@ static int bench(void) { ggml_free(ctx); } - struct ggml_mulmat_tune tune; + // F32: ggml_opencl: ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, + // NULL) error -30 at /Users/mqy/tools/AI/llama.cpp/ggml-opencl.cpp:838 + enum ggml_ftype ftypes[] = { + // GGML_FTYPE_ALL_F32, + GGML_FTYPE_MOSTLY_F16, + GGML_FTYPE_MOSTLY_Q4_0, + }; - struct ggml_mulmat_tune_params params; + int n_ftypes = sizeof(ftypes) / sizeof(ftypes[0]); - init_params(¶ms, /*m_num*/ 4); + const int m_num = 4; - bool ok = ggml_mulmat_tune_bench(&tune, ¶ms); - ggml_mulmat_tune_free(&tune); + // Don't use n_threads larger than 2 because Github build hots has limited + // resource quota. + int threads_arr[] = {1, 2}; + int thread_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]); + + int n_passed = 0; + int n_tests = 0; + + for (int i = 0; i < n_ftypes; i++) { + for (int j = 0; j < thread_arr_len; j++) { + printf("\n"); + + int n_threads = threads_arr[j]; + struct ggml_mulmat_tune tune; + + struct ggml_mulmat_tune_params params; + memset(¶ms, 0, sizeof(struct ggml_mulmat_tune_params)); + init_params(¶ms, ftypes[i], m_num, n_threads); + + ++n_tests; + bool ok = ggml_mulmat_tune_bench(&tune, ¶ms); + if (ok) { + ++n_passed; + } + ggml_mulmat_tune_free(&tune); + } + } - return ok ? 0 : 1; + printf("[test-ggml-tune] %d / %d passed\n", n_passed, n_tests); + return (n_passed == n_tests) ? 0 : 1; } // implement `ggml_task_profiles_provider` @@ -93,7 +124,7 @@ ggml_task_profiles_mock_qxx_provider(struct ggml_tensor *node, } int estimate_time_non_zero_NK(void) { - printf("test: %s\n", __func__); + printf("test-ggml-tune: %s\n", __func__); struct test_data_t { int M; @@ -106,9 +137,10 @@ int estimate_time_non_zero_NK(void) { }; const int m_num = 2; + const int n_threads = 1; // useless. 
struct ggml_mulmat_tune_params params; - init_params(¶ms, m_num); + init_params(¶ms, tune.ftype, m_num, n_threads); ggml_mulmat_tune_init(&tune, ¶ms, ggml_task_profiles_mock_qxx_provider); @@ -123,8 +155,8 @@ int estimate_time_non_zero_NK(void) { GGML_ASSERT(shape->n_profiles == 2); GGML_ASSERT(ggml_is_quantized(shape->src0_type)); - printf("shape: N: %d, K: %d, n_profiles: %d\n", shape->N, shape->K, - shape->n_profiles); + printf("[test-ggml-tune] %s, shape: N: %d, K: %d, n_profiles: %d\n", + __func__, shape->N, shape->K, shape->n_profiles); { shape->items[0] = From 5342dc075ff19862c5da1e5073759fd6c92e4943 Mon Sep 17 00:00:00 2001 From: mqy Date: Thu, 15 Jun 2023 21:34:34 +0800 Subject: [PATCH 08/24] tunning: support k_quants; disabled rope shapes (workaround); make cache thread safe; fixed shape comprison --- examples/mulmat-tune/mulmat-tune.cpp | 4 +- ggml-tune.c | 94 +++++++++++++++++++--------- ggml-tune.h | 22 ++++--- tests/test-ggml-tune.c | 7 ++- 4 files changed, 85 insertions(+), 42 deletions(-) diff --git a/examples/mulmat-tune/mulmat-tune.cpp b/examples/mulmat-tune/mulmat-tune.cpp index ab3334d763870..55dd1927588de 100644 --- a/examples/mulmat-tune/mulmat-tune.cpp +++ b/examples/mulmat-tune/mulmat-tune.cpp @@ -170,8 +170,8 @@ int main(int argc, char **argv) { ftype = (enum ggml_ftype)v; } - if (ftype > GGML_FTYPE_MOSTLY_Q5_1) { - fprintf(stderr, "k_quants type %d is not implemented\n", ftype); + if (ftype == GGML_FTYPE_ALL_F32 || ftype == GGML_FTYPE_MOSTLY_F16) { + fprintf(stderr, "none quantized type %d is not supported\n", ftype); return 1; } } diff --git a/ggml-tune.c b/ggml-tune.c index 81b012766124c..20f3950693fcb 100644 --- a/ggml-tune.c +++ b/ggml-tune.c @@ -4,10 +4,11 @@ #include "ggml-tune.h" #include "ggml.h" -// MUL_MAT fine tunning for non-GPU-offloading cases. +#ifdef GGML_USE_K_QUANTS +#include "k_quants.h" +#endif -#define GGML_MULMAT_CACHE_LEN 16 -static struct mm_cache_element default_mm_cache[GGML_MULMAT_CACHE_LEN] = {0}; +// MUL_MAT fine tunning for non-GPU-offloading cases. #define FNV_OFFSET 14695981039346656037UL #define FNV_PRIME 1099511628211UL @@ -49,9 +50,8 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile( GGML_ASSERT(tune); // TODO: default_mm_cache is thread-unsafe. - struct mm_cache_element *mm_cache = default_mm_cache; int slot = ggml_mulmat_tune_cache_hash(M, N, K) % GGML_MULMAT_CACHE_LEN; - struct mm_cache_element *e = &mm_cache[slot]; + struct ggml_mulmat_tune_cache_ele *e = &tune->cache[slot]; struct ggml_mulmat_tune_time profiles_time[GGML_MAX_TASK_PROFILES] = {0}; @@ -183,7 +183,7 @@ bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune, enum ggml_type type = ggml_ftype_to_ggml_type(model->ftype); - GGML_ASSERT(GGML_MULMAT_N_SHAPES >= 6); + GGML_ASSERT(GGML_MULMAT_N_SHAPES == 4 || GGML_MULMAT_N_SHAPES == 6); tune->n_shapes = GGML_MULMAT_N_SHAPES; // Attention layers @@ -196,11 +196,26 @@ bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune, .N = n_ff, .K = n_embd, .src0_type = type, .src1_type = src1_type}; tune->shapes[3] = (struct ggml_mulmat_tune_shape){ .N = n_vocab, .K = n_embd, .src0_type = type, .src1_type = src1_type}; - // RoPE - tune->shapes[4] = (struct ggml_mulmat_tune_shape){ - .N = n_rot, .K = 0, .src0_type = rot_src0_type, .src1_type = src1_type}; - tune->shapes[5] = (struct ggml_mulmat_tune_shape){ - .N = 0, .K = n_rot, .src0_type = rot_src0_type, .src1_type = src1_type}; + + tune->n_shapes = GGML_MULMAT_N_SHAPES; + + if (GGML_MULMAT_N_SHAPES == 6) { + // RoPE. 
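        // (Editorial note, hedged.) Elsewhere in this commit ggml-tune.h drops
        // GGML_MULMAT_N_SHAPES from 6 to 4, so this RoPE branch is compiled out
        // for now; bumping the define back to 6 re-enables it.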
+ // - very small comparing to previous, almost no need to bench. + // - an Illegal instruction exception on Github (mac-latest-cmake). + // - CL sometimes throws error on localhost. + // So temporarily disabled as a workaround. + tune->shapes[4] = + (struct ggml_mulmat_tune_shape){.N = n_rot, + .K = 0, + .src0_type = rot_src0_type, + .src1_type = src1_type}; + tune->shapes[5] = + (struct ggml_mulmat_tune_shape){.N = 0, + .K = n_rot, + .src0_type = rot_src0_type, + .src1_type = src1_type}; + } for (int i = 0; i < tune->n_shapes; i++) { struct ggml_mulmat_tune_shape *shape = &tune->shapes[i]; @@ -225,6 +240,7 @@ bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune, shape->m_num = params->m_num; shape->arr_m = malloc(shape->m_num * sizeof(int)); + GGML_ASSERT(shape->arr_m); for (int j = 0; j < shape->m_num; j++) { shape->arr_m[j] = 1 << j; } @@ -245,11 +261,13 @@ void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune) { GGML_ASSERT(shape); // arr_m and items can be NULL only when testing. - if (shape->arr_m) { - free(shape->arr_m); - } - if (shape->items) { - free(shape->items); + if (shape->m_num > 0) { + if (shape->arr_m) { + free(shape->arr_m); + } + if (shape->items) { + free(shape->items); + } } } } @@ -325,17 +343,19 @@ ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune, }; struct ggml_task_profile builtin_profiles[GGML_MAX_TASK_PROFILES]; + memset(builtin_profiles, 0, sizeof(builtin_profiles)); + int n_profiles = ggml_get_task_profiles(&node, builtin_profiles); if (n_profiles != shape->n_profiles) { - snprintf(errbuf, errbuf_len - 1, "task profiles mismatch"); + snprintf(errbuf, errbuf_len - 1, "task profiles mismatch(n_profiles)"); return false; } // TODO: profiles order is relevant, too strict. size_t sz = sizeof(struct ggml_task_profile) * n_profiles; if (memcmp(builtin_profiles, shape->profiles, sz) != 0) { - snprintf(errbuf, errbuf_len - 1, "task profiles mismatch"); + snprintf(errbuf, errbuf_len - 1, "task profiles mismatch(profiles)"); printf("=== built-in profiles:\n"); ggml_mulmat_tune_write_profiles(stderr, builtin_profiles, @@ -364,6 +384,9 @@ bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune, } bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { + GGML_ASSERT(tune); + memset(tune, 0, sizeof(struct ggml_mulmat_tune)); + int rc = fscanf(fp, "%d", &tune->version); if (rc <= 0) { return false; @@ -661,27 +684,42 @@ static struct ggml_tensor *ggml_mulmat_new_tensor(int M, int N, int K, ggml_new_tensor_2d(*ctx, GGML_TYPE_F32, (int64_t)K, (int64_t)N); ggml_set_f32(src0_f32, 0.1f); + const float *src_data = (const float *)src0_f32->data; + int nxk = N * K; + switch (src0_type) { case GGML_TYPE_Q4_0: - ggml_quantize_q4_0((const float *)src0_f32->data, src0->data, N * K, - K, hist); + ggml_quantize_q4_0(src_data, src0->data, nxk, K, hist); break; case GGML_TYPE_Q4_1: - ggml_quantize_q4_1((const float *)src0_f32->data, src0->data, N * K, - K, hist); + ggml_quantize_q4_1(src_data, src0->data, nxk, K, hist); break; case GGML_TYPE_Q5_0: - ggml_quantize_q5_0((const float *)src0_f32->data, src0->data, N * K, - K, hist); + ggml_quantize_q5_0(src_data, src0->data, nxk, K, hist); break; case GGML_TYPE_Q5_1: - ggml_quantize_q5_1((const float *)src0_f32->data, src0->data, N * K, - K, hist); + ggml_quantize_q5_1(src_data, src0->data, nxk, K, hist); break; case GGML_TYPE_Q8_0: - ggml_quantize_q8_0((const float *)src0_f32->data, src0->data, N * K, - K, hist); + ggml_quantize_q8_0(src_data, src0->data, nxk, K, hist); + break; 
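    // (Editorial usage sketch, hedged; `f32_src` and `q_dst` are placeholder
    // names, not from the patch.) All of these quantizers share the same
    // (src, dst, n_elements, row_length, hist) signature, so benching another
    // type is one more case of the same shape, for example:
    //
    //     int64_t hist[16] = {0};
    //     ggml_quantize_q4_0(f32_src, q_dst, N * K, K, hist);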
+#ifdef GGML_USE_K_QUANTS + case GGML_TYPE_Q2_K: + ggml_quantize_q2_K(src_data, src0->data, nxk, K, hist); break; + case GGML_TYPE_Q3_K: + ggml_quantize_q3_K(src_data, src0->data, nxk, K, hist); + break; + case GGML_TYPE_Q4_K: + ggml_quantize_q4_K(src_data, src0->data, nxk, K, hist); + break; + case GGML_TYPE_Q5_K: + ggml_quantize_q5_K(src_data, src0->data, nxk, K, hist); + break; + case GGML_TYPE_Q6_K: + ggml_quantize_q6_K(src_data, src0->data, nxk, K, hist); + break; +#endif default: GGML_ASSERT(false); } diff --git a/ggml-tune.h b/ggml-tune.h index 04b25873c932f..b1246615503d3 100644 --- a/ggml-tune.h +++ b/ggml-tune.h @@ -11,7 +11,8 @@ extern "C" { #endif #define GGML_MULMAT_TUNE_VERSION 8 -#define GGML_MULMAT_N_SHAPES 6 +#define GGML_MULMAT_N_SHAPES 4 +#define GGML_MULMAT_CACHE_LEN 16 #define GGML_MULMAT_MAX_PASS 3 @@ -54,6 +55,14 @@ struct ggml_mulmat_tune_shape { struct ggml_mulmat_tune_m *items; }; + struct ggml_mulmat_tune_cache_ele { + int M; + int N; + int K; + const struct ggml_task_profile *profile; + int stages_time[3]; +}; + struct ggml_mulmat_tune { int version; @@ -66,6 +75,9 @@ struct ggml_mulmat_tune { struct ggml_mulmat_tune_shape shapes[GGML_MULMAT_N_SHAPES]; int n_threads; + + // Cache for time estimating. + struct ggml_mulmat_tune_cache_ele cache[GGML_MULMAT_CACHE_LEN]; }; struct ggml_mulmat_tune_time { @@ -74,14 +86,6 @@ struct ggml_mulmat_tune_time { int total_time; }; -struct mm_cache_element { - int M; - int N; - int K; - const struct ggml_task_profile *profile; - int stages_time[3]; -}; - // params for tune/bench. struct ggml_mulmat_tune_params { struct ggml_mulmat_tune_model model; diff --git a/tests/test-ggml-tune.c b/tests/test-ggml-tune.c index a8a2048621eca..5499fa6bf1d82 100644 --- a/tests/test-ggml-tune.c +++ b/tests/test-ggml-tune.c @@ -70,15 +70,16 @@ static int bench(void) { // NULL) error -30 at /Users/mqy/tools/AI/llama.cpp/ggml-opencl.cpp:838 enum ggml_ftype ftypes[] = { // GGML_FTYPE_ALL_F32, - GGML_FTYPE_MOSTLY_F16, + // GGML_FTYPE_MOSTLY_F16, GGML_FTYPE_MOSTLY_Q4_0, + GGML_FTYPE_MOSTLY_Q4_K, }; int n_ftypes = sizeof(ftypes) / sizeof(ftypes[0]); const int m_num = 4; - // Don't use n_threads larger than 2 because Github build hots has limited + // Don't use n_threads larger than 2 because Github build hosts has limited // resource quota. int threads_arr[] = {1, 2}; int thread_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]); @@ -124,7 +125,7 @@ ggml_task_profiles_mock_qxx_provider(struct ggml_tensor *node, } int estimate_time_non_zero_NK(void) { - printf("test-ggml-tune: %s\n", __func__); + printf("[test-ggml-tune] %s\n", __func__); struct test_data_t { int M; From 6b83a3e16fa3126d2c5e6667d2f396bde84a68b4 Mon Sep 17 00:00:00 2001 From: mqy Date: Fri, 16 Jun 2023 20:32:12 +0800 Subject: [PATCH 09/24] try make CL run w/o tunning, but -ngl stucks no output. 
had to add task runer and profile id, many changes, see the f codes --- examples/mulmat-tune/mulmat-tune.cpp | 27 +- ggml-opencl.cpp | 2 +- ggml-threading.c | 37 ++- ggml-threading.h | 22 +- ggml-tune.c | 70 ++++- ggml-tune.h | 13 +- ggml.c | 441 ++++++++++++++------------- ggml.h | 29 ++ tests/test-ggml-threading.c | 61 +++- tests/test-ggml-tune.c | 10 +- 10 files changed, 433 insertions(+), 279 deletions(-) diff --git a/examples/mulmat-tune/mulmat-tune.cpp b/examples/mulmat-tune/mulmat-tune.cpp index 55dd1927588de..da1d0a1c1fe7e 100644 --- a/examples/mulmat-tune/mulmat-tune.cpp +++ b/examples/mulmat-tune/mulmat-tune.cpp @@ -11,6 +11,10 @@ #define UNUSED(x) (void)(x) +// F16 has an pending Illegal Instruction error on macos-latest-cmake. +// So the workaround is to disable non-quantized ftypes. +// #define SUPPORT_NONE_Q_TYPE 1 + static void print_build_tips(void) { const char *a = "LLAMA_NO_ACCELERATE"; fprintf(stderr, "Tips on how to build with various backend vendors:\n\n"); @@ -62,11 +66,12 @@ static void usage(char *prog) { "--model MODEL 3B | 7B | 13B | 30B | 65B", " default 7B", "--ftype FTYPE ggml ftype:", +#ifdef SUPPORT_NONE_Q_TYPE " 0: all F32", " 1: mostly F16", +#endif " 2: mostly Q4_0", " 3: mostly Q4_1", - " 4: mostly Q4_1, some F16", " 7: mostly Q8_0", " 8: mostly Q5_0", " 9: mostly Q5_1", @@ -84,7 +89,7 @@ static void usage(char *prog) { " requires: between [1, 3]", "--n_threads NTH bench with this number of threads", " requires: between [1, 16]", - " default 1", + " default 4", "--file FILE data file to write", " default stdout", "-y always answer \"yes\" to all prompts", @@ -170,8 +175,22 @@ int main(int argc, char **argv) { ftype = (enum ggml_ftype)v; } +#ifndef SUPPORT_NONE_Q_TYPE if (ftype == GGML_FTYPE_ALL_F32 || ftype == GGML_FTYPE_MOSTLY_F16) { - fprintf(stderr, "none quantized type %d is not supported\n", ftype); + fprintf(stderr, "error: none quantized type %d is not supported\n", + ftype); + return 1; + } +#endif + + bool cond_1 = ftype >= GGML_FTYPE_MOSTLY_Q4_0 && + ftype <= GGML_FTYPE_MOSTLY_Q4_1; + bool cond_2 = + ftype >= GGML_FTYPE_MOSTLY_Q8_0 && ftype <= GGML_FTYPE_MOSTLY_Q6_K; + + if (!(cond_1 || cond_2)) { + fprintf(stderr, "error: type %d is not a known ggml ftype.\n", + ftype); return 1; } } @@ -223,7 +242,7 @@ int main(int argc, char **argv) { } } - int n_threads = 1; + int n_threads = 4; { if (arg_n_threads != NULL) { int v = atoi(arg_n_threads); diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index c9151a8e49561..2a1a04fcaccf1 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1628,7 +1628,7 @@ bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_ } void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) { - // GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst)); + GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst)); if (src0->type == GGML_TYPE_F32) { ggml_cl_mul_mat_f32(src0, src1, dst); diff --git a/ggml-threading.c b/ggml-threading.c index 6dd6d2817eff0..7ef763c0f81e1 100644 --- a/ggml-threading.c +++ b/ggml-threading.c @@ -170,7 +170,8 @@ struct ggml_compute_state_shared { atomic_bool wait_on_done; atomic_bool stop; - ggml_threading_task_runner *task_runner; + // Default task runner, can be overriden by node.task_profile.runner. 
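    // (Editorial sketch, hedged, mirroring the dispatch used later in this
    // file by the worker loop and by ggml_threading_compute_tensor().)
    //
    //     ggml_task_runner *runner = node->task_profile.runner
    //                                    ? node->task_profile.runner // per-profile override, e.g. CUDA/CL
    //                                    : shared->task_runner;      // default, normally ggml_compute_forward
    //     enum ggml_compute_error err = runner(&params, node);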
+ ggml_task_runner *task_runner; struct ggml_threading_context *ctx; }; @@ -391,8 +392,10 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) { } if (shared->n_tasks > 0 && state->has_work) { - enum ggml_compute_error err = - shared->task_runner(&state->params, state->node); + ggml_task_runner *runner = state->node->task_profile.runner + ? state->node->task_profile.runner + : shared->task_runner; + enum ggml_compute_error err = runner(&state->params, state->node); GGML_ASSERT(err == GGML_COMPUTE_OK); @@ -427,8 +430,13 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, size_t wsize) { GGML_ASSERT(ctx); GGML_ASSERT(node); - GGML_ASSERT(ctx->shared.task_runner); + + ggml_task_runner *runner = ctx->shared.task_runner; + if (node->task_profile.runner) { + runner = node->task_profile.runner; + } + struct ggml_compute_state_shared *state_shared = &ctx->shared; // This is the params for main thread. @@ -491,7 +499,7 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, params.wsize = wsize; params.wdata = wdata; - err = state_shared->task_runner(¶ms, node); + err = runner(¶ms, node); } // wait for tasks done. @@ -509,11 +517,21 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, if (err != GGML_COMPUTE_OK) { if (err == GGML_COMPUTE_FALLBACK) { + PRINT_DEBUG("[main] fallback from profile, id=%d\n", + node->task_profile.id); + GGML_ASSERT(node->task_profile.stages[1].backend > + GGML_TASK_BACKEND_CPU); + struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; int n = ggml_get_task_profiles(node, profiles); GGML_ASSERT(n > 0); + GGML_ASSERT(profiles[0].stages[1].backend == + GGML_TASK_BACKEND_CPU); + memcpy(&node->task_profile, &profiles[0], sizeof(struct ggml_task_profile)); + runner = ctx->shared.task_runner; + goto START; } return err; @@ -525,12 +543,13 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, struct ggml_threading_context * ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread_runner, - ggml_threading_task_runner *task_stage_runner, + ggml_task_runner *task_runner, enum ggml_threading_features features, int64_t stages_time[3]) { GGML_ASSERT(n_threads > 0); - GGML_ASSERT(thread_runner); - GGML_ASSERT(task_stage_runner); + if (thread_runner == NULL) { + thread_runner = ggml_threading_graph_compute_thread; + } size_t ctx_sz = sizeof(struct ggml_threading_context); struct ggml_threading_context *ctx = malloc(ctx_sz); @@ -545,7 +564,7 @@ ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread_runner, .wait_now = false, .wait_on_done = false, .stop = false, - .task_runner = task_stage_runner, + .task_runner = task_runner, .ctx = ctx, }; diff --git a/ggml-threading.h b/ggml-threading.h index f3214efc7cb7d..189fc2ed56a69 100644 --- a/ggml-threading.h +++ b/ggml-threading.h @@ -21,27 +21,21 @@ enum ggml_threading_features { GGML_THREADING_FEATURE_PERF = 1 << 1, }; -// Compute errors. -enum ggml_compute_error { - GGML_COMPUTE_OK = 0, - GGML_COMPUTE_FALLBACK = 1, -}; - -// The task runner to be called by main thread and workers. -typedef enum ggml_compute_error(ggml_threading_task_runner)( - struct ggml_compute_params *params, struct ggml_tensor *node); - // The thread runner to feed into OS threads. typedef ggml_thread_ret_t(ggml_threading_thread_runner)(void *data); // Init and start underlying workers if n_threads > 1. // -// features: optional for configure threading additional features. -// see `ggml_threading_feature`, default 0. 
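// (Editorial usage sketch, hedged; this mirrors how ggml-tune.c drives the
// API and is not itself part of this header.)
//
//     struct ggml_threading_context *ctx = ggml_threading_start(
//         n_threads, /*thread runner*/ NULL, ggml_compute_forward_wrapper,
//         GGML_THREADING_FEATURE_WAIT_ON_DONE, /*stages_time*/ NULL);
//     enum ggml_compute_error err =
//         ggml_threading_compute_tensor(ctx, node, wdata, wsize);
//     ggml_threading_stop(ctx);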
+// thread: optional OS thread runner, default value: +// `ggml_threading_graph_compute_thread`. +// +// features: optional for configure +// threading additional features. see `ggml_threading_feature`, default 0. +// // stages_time: optional for collecting per-stage wall clock time. struct ggml_threading_context * ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread, - ggml_threading_task_runner *task_stage_runner, + ggml_task_runner *task_runner, enum ggml_threading_features features, int64_t stages_time[3]); @@ -60,7 +54,7 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, // This is an experimental functionality for mulmat tune, as a thin wrapper. enum ggml_compute_error -ggml_compute_forward_wrapper(struct ggml_compute_params *params, +ggml_compute_forward_wrapper(const struct ggml_compute_params *params, struct ggml_tensor *tensor); #ifdef __cplusplus diff --git a/ggml-tune.c b/ggml-tune.c index 20f3950693fcb..aeb63e957da2d 100644 --- a/ggml-tune.c +++ b/ggml-tune.c @@ -44,9 +44,12 @@ ggml_mulmat_tune_task_backend_name(enum ggml_task_backend backend) { } } -const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile( - struct ggml_mulmat_tune *tune, int M, int N, int K, enum ggml_type src0_t, - enum ggml_type src1_t, int stages_time[3]) { +// NOTE: we can not use the profile from tune because the profiles do not +// contain fields such as runner, get_size. +int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M, + int N, int K, enum ggml_type src0_t, + enum ggml_type src1_t, + int stages_time[3]) { GGML_ASSERT(tune); // TODO: default_mm_cache is thread-unsafe. @@ -103,15 +106,15 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile( names[i] = ggml_mulmat_tune_task_backend_name( prof->stages[i].backend); } - printf("\n[tune] M: %3d, N: %5d, K: %5d, backends of the " - "fastest profile: %s %s %s\n", - M, N, K, names[0], names[1], names[2]); + printf("\n[tune] M: %3d, N: %5d, K: %5d, profile id: %d, " + "backends: %s %s %s\n", + M, N, K, prof->id, names[0], names[1], names[2]); #endif } } } - return prof; + return prof->id; } void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model, @@ -264,10 +267,13 @@ void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune) { if (shape->m_num > 0) { if (shape->arr_m) { free(shape->arr_m); + shape->arr_m = NULL; } if (shape->items) { free(shape->items); + shape->items = NULL; } + shape->m_num = 0; } } } @@ -277,6 +283,11 @@ static bool ggml_mulmat_tune_write_profiles( int rc; for (int i = 0; i < n_profiles; i++) { const struct ggml_task_profile *profile = &profiles[i]; + rc = fprintf(fp, "%d ", profile->id); + if (rc <= 0) { + return false; + } + for (int j = 0; j < 3; j++) { const struct ggml_task_stage *ts = &profile->stages[j]; rc = fprintf(fp, "%2d %d %d", ts->backend, ts->parallel ? 
1 : 0, @@ -304,7 +315,6 @@ static bool ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune, const char *model, int ftype, int n_threads, char *errbuf, int errbuf_len) { - if (tune->version != GGML_MULMAT_TUNE_VERSION) { snprintf(errbuf, errbuf_len - 1, "version mismatch, built-in: %d, " @@ -348,14 +358,28 @@ ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune, int n_profiles = ggml_get_task_profiles(&node, builtin_profiles); if (n_profiles != shape->n_profiles) { - snprintf(errbuf, errbuf_len - 1, "task profiles mismatch(n_profiles)"); + snprintf(errbuf, errbuf_len - 1, + "task profiles mismatch (n_profiles)"); return false; } // TODO: profiles order is relevant, too strict. - size_t sz = sizeof(struct ggml_task_profile) * n_profiles; - if (memcmp(builtin_profiles, shape->profiles, sz) != 0) { - snprintf(errbuf, errbuf_len - 1, "task profiles mismatch(profiles)"); + // Only validate stages! + size_t sz = sizeof(struct ggml_task_stage) * 3; + bool matched = true; + for (int j = 0; j < n_profiles; j++) { + if (builtin_profiles[j].id != shape->profiles[j].id) { + return false; + } + if (memcmp(builtin_profiles[j].stages, shape->profiles[j].stages, + sz) != 0) { + matched = false; + break; + } + } + if (!matched) { + snprintf(errbuf, errbuf_len - 1, + "task profiles mismatch (profiles)"); printf("=== built-in profiles:\n"); ggml_mulmat_tune_write_profiles(stderr, builtin_profiles, @@ -426,6 +450,12 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { for (int ip = 0; ip < shape->n_profiles; ip++) { struct ggml_task_profile *profile = &shape->profiles[ip]; + + rc = fscanf(fp, "%d ", &profile->id); + if (rc <= 0) { + return false; + } + for (int j = 0; j < 3; j++) { struct ggml_task_stage *ts = &profile->stages[j]; int backend; @@ -777,6 +807,8 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, GGML_ASSERT(params); GGML_ASSERT(params->model.name); + memset(tune, 0, sizeof(struct ggml_mulmat_tune)); + enum ggml_task_backend backends[16]; int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends); if (n_backends < 2) { @@ -785,6 +817,15 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, return false; } + if (params->model.ftype >= GGML_FTYPE_MOSTLY_Q2_K && + params->model.ftype <= GGML_FTYPE_MOSTLY_Q6_K) { +#if defined(GGML_USE_CLBLAST) + printf("[tune] error: cl implementation does not support k_quants at " + "the time of writing this code, skip.\n"); + return false; +#endif + } + bool ok = ggml_mulmat_tune_init(tune, params, ggml_get_task_profiles); if (!ok) { return false; @@ -816,9 +857,8 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, int64_t t0 = ggml_time_ms(); struct ggml_threading_context *thrd_ctx = ggml_threading_start( - tune->n_threads, ggml_threading_graph_compute_thread, - ggml_compute_forward_wrapper, GGML_THREADING_FEATURE_WAIT_ON_DONE, - stages_time); + tune->n_threads, NULL, ggml_compute_forward_wrapper, + GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time); for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape]; diff --git a/ggml-tune.h b/ggml-tune.h index b1246615503d3..7955a50a977da 100644 --- a/ggml-tune.h +++ b/ggml-tune.h @@ -10,7 +10,7 @@ extern "C" { #endif -#define GGML_MULMAT_TUNE_VERSION 8 +#define GGML_MULMAT_TUNE_VERSION 9 #define GGML_MULMAT_N_SHAPES 4 #define GGML_MULMAT_CACHE_LEN 16 @@ -55,7 +55,7 @@ struct ggml_mulmat_tune_shape { struct ggml_mulmat_tune_m *items; }; - struct 
ggml_mulmat_tune_cache_ele { +struct ggml_mulmat_tune_cache_ele { int M; int N; int K; @@ -98,10 +98,11 @@ struct ggml_mulmat_tune_params { }; // NOTE: stages_time is filled if not null. -const struct ggml_task_profile * -ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M, - int N, int K, enum ggml_type src0_t, - enum ggml_type src1_t, int stages_time[3]); +// Return profile id. +int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M, + int N, int K, enum ggml_type src0_t, + enum ggml_type src1_t, + int stages_time[3]); bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune, const char *model_name, int ftype, diff --git a/ggml.c b/ggml.c index b734f1a0c4d46..43ec93a64e2b2 100644 --- a/ggml.c +++ b/ggml.c @@ -8500,19 +8500,14 @@ static void ggml_compute_forward_mul_f32( const int ith = params->ith; const int nth = params->nth; - enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; - if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { #ifdef GGML_USE_CLBLAST - if (src1->backend == GGML_BACKEND_GPU) { - if (ith == 0) { - ggml_cl_mul(src0, src1, dst); - } - return; + if (src1->backend == GGML_BACKEND_GPU) { + if (ith == 0) { + ggml_cl_mul(src0, src1, dst); } -#else - GGML_ASSERT(false); + return; + } #endif - }; const int64_t nr = ggml_nrows(src0); @@ -9938,7 +9933,7 @@ static void ggml_compute_forward_rms_norm_back( } } - +// CPU only static void ggml_compute_forward_mul_mat_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -10010,18 +10005,6 @@ static void ggml_compute_forward_mul_mat_f32( // compute by src0 rows enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; - - if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { -#if defined(GGML_USE_CLBLAST) - GGML_ASSERT(params->nth == 1); - GGML_ASSERT(params->type == GGML_TASK_COMPUTE); - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - return; -#else - GGML_ASSERT(false); -#endif - } - GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { @@ -10104,6 +10087,7 @@ static void ggml_compute_forward_mul_mat_f32( //} } +// CPU only. 
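// (Editorial sketch, hedged.) With the GPU branches moved out to the profile
// runners, the mul_mat kernels below only distinguish the two CPU backends
// recorded in the selected profile, roughly:
//
//     enum ggml_task_backend comp =
//         dst->task_profile.stages[GGML_TASK_COMPUTE].backend;
//     GGML_ASSERT(comp & GGML_TASK_BACKEND_CPU);
//     if (comp == GGML_TASK_BACKEND_CPU_BLAS) {
//         // BLAS gemm path
//     } else {
//         // plain CPU path, parallelized over src0 rows
//     }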
static void ggml_compute_forward_mul_mat_f16_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -10168,19 +10152,9 @@ static void ggml_compute_forward_mul_mat_f16_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows + enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend; enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; - if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { -#if defined(GGML_USE_CLBLAST) - GGML_ASSERT(params->type == GGML_TASK_COMPUTE); - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - return; -#else - GGML_ASSERT(false); -#endif - } - - enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend; GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { @@ -10304,6 +10278,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( //} } +// CPU only static void ggml_compute_forward_mul_mat_q_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -10373,20 +10348,8 @@ static void ggml_compute_forward_mul_mat_q_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; - - if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { -#if defined(GGML_USE_CLBLAST) - GGML_ASSERT(params->nth == 1); - GGML_ASSERT(params->type == GGML_TASK_COMPUTE); - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - return; -#else - GGML_ASSERT(false); -#endif - } - enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend; + enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { @@ -14294,30 +14257,9 @@ static void ggml_compute_forward_cross_entropy_loss_back( ///////////////////////////////// -static enum ggml_compute_error ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { +static enum ggml_compute_error ggml_compute_forward(const struct ggml_compute_params * params, struct ggml_tensor * tensor) { GGML_ASSERT(params); - enum ggml_task_backend comp_backend = tensor->task_profile.stages[GGML_TASK_COMPUTE].backend; - - if (comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { -#if defined(GGML_USE_CUBLAS) - bool skip_cpu = ggml_cuda_compute_forward(params, tensor); - if (skip_cpu) { - return GGML_COMPUTE_OK; - } - GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU); - GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU); - return GGML_COMPUTE_FALLBACK; -#else - GGML_ASSERT(false); -#endif - } - - // if (tensor->task_profile.stages[params->type].backend > GGML_TASK_BACKEND_CPU) { - // printf("mulmat: test fallback\n"); - // return GGML_COMPUTE_FALLBACK; - // } - switch (tensor->op) { case GGML_OP_DUP: { @@ -14568,13 +14510,6 @@ static enum ggml_compute_error ggml_compute_forward(struct ggml_compute_params * return GGML_COMPUTE_OK; } -enum ggml_compute_error ggml_compute_forward_wrapper(struct ggml_compute_params *params, - struct ggml_tensor *tensor) { - // We call ggml_compute_forward because the CUDA mul_mat entry point - // was moved out of `ggml_compute_forward_mul_mat`. 
- return ggml_compute_forward(params, tensor); -} - //////////////////////////////////////////////////////////////////////////////// static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) { @@ -15524,12 +15459,67 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg // ---- task profiles ---- +// TODO: replace with ggml_compute_forward_cuda +// DO NOT check matrix size further. +#if defined(GGML_USE_CUBLAS) +static enum ggml_compute_error ggml_compute_forward_cuda( + const struct ggml_compute_params * params, + struct ggml_tensor * tensor) { + GGML_ASSERT (ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)); + if (ggml_cuda_compute_forward(params, tensor)) { + return GGML_COMPUTE_OK; + } + GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU); + return GGML_COMPUTE_FALLBACK; +} +#endif + +// TODO: replace with ggml_cl_mul_mat. +// DO NOT check matrix size further. +#if defined(GGML_USE_CLBLAST) +static enum ggml_compute_error ggml_compute_forward_cl( + const struct ggml_compute_params * params, + struct ggml_tensor * tensor) { + switch (tensor->op) { + case GGML_OP_MUL_MAT: + GGML_ASSERT(ggml_cl_can_mul_mat(tensor->src0, tensor->src1, tensor)); + ggml_cl_mul_mat(tensor->src0, tensor->src1, tensor, params->wdata, params->wsize); + return GGML_COMPUTE_OK; + default: + break; + } + + GGML_ASSERT(false); +} + +static int ggml_compute_forward_get_wsize_cl (struct ggml_tensor *tensor) { + switch (tensor->op) { + case GGML_OP_MUL_MAT: + return ggml_cl_mul_mat_get_wsize(tensor->src0, tensor->src1, tensor); + default: + break; + } + return -1; +} +#endif + +// The wrapper for external mulmat tune tool. +enum ggml_compute_error ggml_compute_forward_wrapper(const struct ggml_compute_params *params, + struct ggml_tensor *tensor) { + // We call ggml_compute_forward because the CUDA mul_mat entry point + // was moved out of `ggml_compute_forward_mul_mat`. + return ggml_compute_forward(params, tensor); +} + // Implement `ggml_task_profiles_provider`. // Fill `profiles` for the `node` and return number of profiles. // // NOTE: the node may be incompleted from testing or tunning, so please assert // everything used here. -inline int ggml_get_task_profiles( +// +// TODO: configure cuda for none mul_mat nodes. +int ggml_get_task_profiles( struct ggml_tensor *node, struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]) { GGML_ASSERT(node); @@ -15595,6 +15585,8 @@ inline int ggml_get_task_profiles( } break; case GGML_OP_MUL_MAT: case GGML_OP_OUT_PROD: { + // CPU only profiles. + // CUDA/CL: see end of function. 
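+        // The plain CPU profile is always filled first so that profiles[0]
+        // can serve as the universal fallback; a CPU BLAS profile is appended
+        // when BLAS support is compiled in.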
GGML_ASSERT(node->src0); GGML_ASSERT(node->src1); @@ -15614,16 +15606,6 @@ inline int ggml_get_task_profiles( p[i].stages[1].wait = true; i++; #endif - -#if defined(GGML_USE_CUBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; - p[i].stages[1].wait = true; - i++; -#elif defined(GGML_USE_CLBLAST) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; - p[i].stages[1].wait = true; - i++; -#endif } else if (src0_t == GGML_TYPE_F16) { p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; @@ -15635,16 +15617,6 @@ inline int ggml_get_task_profiles( p[i].stages[1].wait = true; i++; #endif - -#if defined(GGML_USE_CUBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; - p[i].stages[1].wait = true; - i++; -#elif defined(GGML_USE_CLBLAST) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; - p[i].stages[1].wait = true; - i++; -#endif } else if (ggml_is_quantized(src0_t)) { p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; @@ -15658,16 +15630,6 @@ inline int ggml_get_task_profiles( p[i].stages[1].wait = true; i++; #endif - -#if defined(GGML_USE_CUBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; - p[i].stages[1].wait = true; - i++; -#elif defined(GGML_USE_CLBLAST) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; - p[i].stages[1].wait = true; - i++; -#endif } n_profiles = i; } break; @@ -15757,7 +15719,43 @@ inline int ggml_get_task_profiles( GGML_ASSERT(false); } +#if defined(GGML_USE_CUBLAS) + switch (node->op) { + case GGML_OP_ADD: + case GGML_OP_MUL: + case GGML_OP_SILU: + case GGML_OP_RMS_NORM: + case GGML_OP_MUL_MAT: + case GGML_OP_RESHAPE: + case GGML_OP_ROPE: { + int i = n_profiles; + p[i].runner = ggml_compute_forward_cuda; + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; + p[i].stages[1].wait = true; + ++n_profiles; + } break; + default: { + } break; + } +#elif defined(GGML_USE_CLBLAST) + switch (node->op) { + case GGML_OP_MUL_MAT: { + int i = n_profiles; + p[i].runner = ggml_compute_forward_cl; + p[i].get_wsize = ggml_compute_forward_get_wsize_cl; + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; + p[i].stages[1].wait = true; + ++n_profiles; + } break; + default: { + } break; + } +#endif + GGML_ASSERT(n_profiles > 0 && n_profiles <= GGML_MAX_TASK_PROFILES); + for (int i = 0; i < n_profiles; i++) { + profiles[i].id = i + 1; + } return n_profiles; } @@ -15769,7 +15767,7 @@ static const struct ggml_task_profile *ggml_mulmat_get_task_profile( GGML_ASSERT(node); GGML_ASSERT(node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_OUT_PROD); GGML_ASSERT(profiles); - GGML_ASSERT(n_profiles >= 2); + GGML_ASSERT(n_profiles > 0); enum ggml_type src0_t = node->src0->type; enum ggml_type src1_t = node->src1->type; @@ -15777,9 +15775,9 @@ static const struct ggml_task_profile *ggml_mulmat_get_task_profile( // Type and memory layout requirements for computing mul_mat with BLAS. 
bool cond_match = (src0_t == GGML_TYPE_F32 || src0_t == GGML_TYPE_F16 || ggml_is_quantized(src0_t)) && - src1_t == GGML_TYPE_F32 && node->type == GGML_TYPE_F32 && - ggml_is_contiguous(node->src0) && - ggml_is_contiguous(node->src1); + src1_t == GGML_TYPE_F32 && node->type == GGML_TYPE_F32 && + ggml_is_contiguous(node->src0) && + ggml_is_contiguous(node->src1); int M = (int)node->ne[1]; int N = (int)node->ne[0]; @@ -15790,10 +15788,14 @@ static const struct ggml_task_profile *ggml_mulmat_get_task_profile( if (cond_match) { #if defined(GGML_USE_TUNE) if (tune != NULL) { - prof = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, + GGML_ASSERT(n_profiles >= 2); + int id = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, src1_t, stages_time_us); - if (prof != NULL) { - return prof; + for (int i = 0; i < n_profiles; i++) { + if (profiles[i].id == id) { + prof = &profiles[i]; + return prof; + } } } #else @@ -15841,11 +15843,101 @@ static const struct ggml_task_profile *ggml_mulmat_get_task_profile( return prof; } +void ggml_graph_compute_set_tensor_task_proile(struct ggml_tensor *node, + struct ggml_cgraph *cgraph) { + // Pre-specified. + for (int i = 0; i < 3; i++) { + if (node->task_profile.stages[i].backend > 0) { + return; + } + } + + struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; + int n_profiles = ggml_get_task_profiles(node, profiles); + + const struct ggml_task_profile *profile = NULL; + + // GPU offloading. A special case of pre-specified task_profile. + if (node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT) { + if (node->op != GGML_OP_MUL_MAT && node->op != GGML_OP_OUT_PROD) { + enum ggml_task_backend be; + if (ggml_cpu_has_cublas()) { + be = GGML_TASK_BACKEND_GPU_CUDA; + } else if (ggml_cpu_has_clblast()) { + be = GGML_TASK_BACKEND_GPU_CL; + } else { + GGML_ASSERT(false); + } + + for (int j = 0; j < n_profiles; j++) { + if (profiles[j].stages[1].backend == be) { + profile = &profiles[j]; + break; + } + } + GGML_ASSERT(profile); + GGML_ASSERT(!cgraph->tune); + + memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); + return; + } + } + + // mul_mat: GGML_OP_MUL_MAT and GGML_OP_OUT_PROD. + if (node->op == GGML_OP_MUL_MAT) { +#if defined(GGML_USE_TUNE) + GGML_ASSERT(node->backend == GGML_BACKEND_CPU); + + int stages_time_us[3]; + profile = ggml_mulmat_get_task_profile(node, profiles, n_profiles, + cgraph->tune, stages_time_us); + GGML_ASSERT(profile); + + memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); + + if (cgraph->tune) { + memcpy(&node->task_profile, profile, + sizeof(struct ggml_task_profile)); + + // Do not wait if the estimated execution time is too small + // (e.g. less than 0.1 ms) + // TODO: need bench actual wait/notify time, see + // ggml-threading.c + for (int j = 0; j < 3; j++) { + if (node->task_profile.stages[j].wait) { + if (stages_time_us[j] < 100) { + node->task_profile.stages[j].wait = false; + } + } + } + } + return; +#else + profile = ggml_mulmat_get_task_profile(node, profiles, n_profiles, NULL, + NULL); + GGML_ASSERT(profile); + memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); + return; +#endif + } else if (node->op == GGML_OP_OUT_PROD) { // FIXME: is this correct? + profile = ggml_mulmat_get_task_profile(node, profiles, n_profiles, NULL, + NULL); + GGML_ASSERT(profile); + memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); + return; + } + + // default. 
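+    // profiles[0] is always the plain CPU profile returned by
+    // ggml_get_task_profiles(); the assert below guards that invariant.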
+ profile = &profiles[0]; + GGML_ASSERT(profile->stages[1].backend == GGML_TASK_BACKEND_CPU); + memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); +} + void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { int n_threads = cgraph->n_threads; struct ggml_threading_context *thrd_ctx = ggml_threading_start( - n_threads, ggml_threading_graph_compute_thread, ggml_compute_forward, + n_threads, NULL, ggml_compute_forward, GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL); // initialize tasks + work buffer @@ -15854,107 +15946,34 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) size_t work_size = 0; - struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; - // thread scheduling for the different operations for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; - if (node->op == GGML_OP_NONE || node->op == GGML_OP_CONT) { - continue; - } - - int n_profiles = ggml_get_task_profiles(node, profiles); - const struct ggml_task_profile *profile = NULL; + GGML_ASSERT (node->op != GGML_OP_NONE); - // Adapt node->backend: assume GPU at COMPUTE stage. - if (node->backend == GGML_BACKEND_GPU || - node->backend == GGML_BACKEND_GPU_SPLIT) { - enum ggml_task_backend be; - if (ggml_cpu_has_cublas()) { - be = GGML_TASK_BACKEND_GPU_CUDA; - } else if (ggml_cpu_has_clblast()) { - be = GGML_TASK_BACKEND_GPU_CL; - } else { - GGML_ASSERT(false); - } + struct ggml_task_stage *stages = node->task_profile.stages; - for (int j = 0; j < n_profiles; j++) { - if (profiles[j].stages[1].backend == be) { - profile = &profiles[j]; - break; - } - } - GGML_ASSERT(profile); - } else { - GGML_ASSERT(node->backend == GGML_BACKEND_CPU); - } + ggml_graph_compute_set_tensor_task_proile(node, cgraph); - bool profile_copied = false; + // + // Allocate temp buffer `wdata` for CPU. + // NOTE: GPU MAY fallback to CPU, so we have to cover all possible cases. + // - if (node->op == GGML_OP_MUL_MAT) { -#if defined(GGML_USE_TUNE) - int stages_time_us[3]; - profile = ggml_mulmat_get_task_profile( - node, profiles, n_profiles, cgraph->tune, stages_time_us); - GGML_ASSERT(profile); - - if (cgraph->tune) { - memcpy(&node->task_profile, profile, - sizeof(struct ggml_task_profile)); - profile_copied = true; - - // Do not wait if the estimated execution time is too small - // (e.g. less than 0.1 ms) - // TODO: need bench actual wait/notify time, see - // ggml-threading.c - for (int j = 0; j< 3; j++) { - if (node->task_profile.stages[j].wait) { - if (stages_time_us[j] < 100) { - node->task_profile.stages[j].wait = false; - } - } - } + if (node->task_profile.get_wsize) { + int sz = node->task_profile.get_wsize(node); + if (sz >= 0) { + work_size = MAX(work_size, (size_t)sz); + continue; } -#else - profile = ggml_mulmat_get_task_profile(node, profiles, - n_profiles, NULL, NULL); - GGML_ASSERT(profile); -#endif - } else if (node->op == GGML_OP_OUT_PROD) { // FIXME: is is right? - profile = ggml_mulmat_get_task_profile(node, profiles, - n_profiles, NULL, NULL); - GGML_ASSERT(profile); - } else { - profile = &profiles[0]; - GGML_ASSERT(profile->stages[1].backend == - GGML_TASK_BACKEND_CPU); - } - - if (!profile_copied) { - memcpy(&node->task_profile, profile, - sizeof(struct ggml_task_profile)); } - struct ggml_task_stage *stages = node->task_profile.stages; - - // Workrounnd to set node->backend. 
- for (int j = 0; j < 3; j++) { - if (node->backend == GGML_BACKEND_CPU && - (stages[j].backend & GGML_TASK_BACKEND_GPU)) { - if (ggml_cpu_has_cublas() || ggml_cpu_has_clblast()) { - node->backend = GGML_BACKEND_GPU; - } else { - GGML_ASSERT(false); - } - } - } + //printf("op: %d, comp backend: %d\n", node->op, node->task_profile.stages[1].backend); // compute stage n_tasks. int n_tasks = stages[1].parallel ? n_threads : 1; - // Allocate temp buffer `wdata` for CPU. - // NOTE: GPU MAY fallback to CPU, so we have to cover all possible cases. switch (node->op) { case GGML_OP_CPY: case GGML_OP_DUP: @@ -16012,20 +16031,12 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { } break; case GGML_OP_MUL_MAT: - case GGML_OP_OUT_PROD: // FIXME: is is right? + case GGML_OP_OUT_PROD: // FIXME: is this correct? { size_t cur = 0; enum ggml_task_backend comp_backend = stages[GGML_TASK_COMPUTE].backend; GGML_ASSERT(comp_backend != GGML_TASK_BACKEND_NONE); - - if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { -#if defined(GGML_USE_CLBLAST) - GGML_ASSERT(ggml_cl_can_mul_mat(node->src0, node->src1, node)); - cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); -#else - GGML_ASSERT(false); -#endif - } else if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { + if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { GGML_ASSERT(ggml_cpu_has_cpublas()); GGML_ASSERT(node->src1->type == GGML_TYPE_F32); @@ -16039,11 +16050,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } else { GGML_ASSERT(false); } - } else if (comp_backend == GGML_TASK_BACKEND_CPU || comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { - if (comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { - GGML_ASSERT(ggml_cpu_has_cublas()); - } - + } else { // CPU or GPU fallback GGML_ASSERT(node->src1->type == GGML_TYPE_F32); if (node->src0->type == GGML_TYPE_F32) { @@ -16056,8 +16063,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } else { GGML_ASSERT(false); } - } else { - GGML_ASSERT(false); } work_size = MAX(work_size, cur); diff --git a/ggml.h b/ggml.h index 5ab78c4a011b5..d4d5d3521c74f 100644 --- a/ggml.h +++ b/ggml.h @@ -390,11 +390,40 @@ extern "C" { bool wait; }; + struct ggml_tensor; + struct ggml_compute_params; + + // Compute errors. + enum ggml_compute_error { + GGML_COMPUTE_OK = 0, + GGML_COMPUTE_FALLBACK = 1, + }; + + // The task runner to be called by main thread and workers. + typedef enum ggml_compute_error(ggml_task_runner)( + const struct ggml_compute_params *params, + struct ggml_tensor *node); + + // Get wsize for node computing. + // When return -1: should be explained as `fallback to CPU`, caller MUST + // determine how much memory to reserve for this node. + typedef int (ggml_task_get_wsize)(struct ggml_tensor *tensor); + // config for computing a tensor. struct ggml_task_profile { + // profile id, start from 1. + int id; + // index 0: INIT, 1: COMPUTE, 2: FINALIZE struct ggml_task_stage stages[3]; + // Optional task runner, overrides threading's task runner. + ggml_task_runner *runner; + + // Optional function to return required wsize for wdata. + ggml_task_get_wsize *get_wsize; + + // Optional flag for development. // MUST be used only in testing codes. 
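+    // (test-ggml-threading.c, for example, stores a per-node busy-loop count
+    // in dev_flags[1])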
uint8_t dev_flags[4]; }; diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c index 90d53e4cdea2a..2079fe144b194 100644 --- a/tests/test-ggml-threading.c +++ b/tests/test-ggml-threading.c @@ -42,7 +42,8 @@ static const int n_repeat = 10; static int work_done_arr[MAX_N_THREADS]; static enum ggml_compute_error -mock_task_runner(struct ggml_compute_params *params, struct ggml_tensor *node) { +mock_task_runner(const struct ggml_compute_params *params, + struct ggml_tensor *node) { int64_t loops = node->task_profile.dev_flags[1] * 1000 * 1000; if (node->task_profile.stages[params->type].parallel) { loops /= params->nth; @@ -79,9 +80,8 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) { int t0 = (int)ggml_time_us(); - struct ggml_threading_context *ctx = - ggml_threading_start(n_threads, ggml_threading_graph_compute_thread, - mock_task_runner, features, /*stages_time*/ NULL); + struct ggml_threading_context *ctx = ggml_threading_start( + n_threads, NULL, mock_task_runner, features, /*stages_time*/ NULL); int t1 = (int)ggml_time_us(); @@ -141,7 +141,7 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) { } static enum ggml_compute_error -mock_task_runner_fallback(struct ggml_compute_params *params, +mock_task_runner_fallback(const struct ggml_compute_params *params, struct ggml_tensor *node) { UNUSED(params); if (node->backend == GGML_BACKEND_GPU) { @@ -158,7 +158,7 @@ mock_task_runner_fallback(struct ggml_compute_params *params, // thus it is not parallelled. int test_fallback(struct ggml_tensor *node) { struct ggml_threading_context *ctx = ggml_threading_start( - 1, ggml_threading_graph_compute_thread, mock_task_runner_fallback, + 1, NULL, mock_task_runner_fallback, /*features*/ GGML_THREADING_FEATURE_NONE, /*stages_time*/ NULL); enum ggml_compute_error err = @@ -177,6 +177,38 @@ int test_fallback(struct ggml_tensor *node) { return 0; } +static enum ggml_compute_error +customized_node_runner(const struct ggml_compute_params *params, + struct ggml_tensor *node) { + UNUSED(params); + // Reset runner thus caller will know it was called. + node->task_profile.runner = NULL; + return GGML_COMPUTE_OK; +} + +// Test when node->task_profile.runner is not NULL. +int test_customized_node_runner(struct ggml_tensor *node) { + struct ggml_threading_context *ctx = ggml_threading_start( + 1, NULL, mock_task_runner, + /*features*/ GGML_THREADING_FEATURE_NONE, /*stages_time*/ NULL); + + node->task_profile.runner = customized_node_runner; + enum ggml_compute_error err = + ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0); + + ggml_threading_stop(ctx); + if (err != GGML_COMPUTE_OK) { + // should not happen. + abort(); + } + + if (node->task_profile.runner != NULL) { + return 2; + } + + return 0; +} + int main(void) { ggml_time_init(); @@ -367,7 +399,10 @@ int main(void) { } } + // fallback { + printf("[test-ggml-threading] test fallback ...\n"); + ++n_tests; // required by getting task profiles. 
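+        // (ggml_get_task_profiles() asserts on op, src0 and src1, so the mock
+        // node needs them to be set)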
@@ -382,9 +417,21 @@ int main(void) { node.src1 = &src1; node.backend = GGML_BACKEND_GPU; + stages[1].backend = GGML_TASK_BACKEND_GPU; if (test_fallback(&node) == 0) { ++n_passed; - printf("\n[test-ggml-threading] test fallback: ok\n\n"); + printf("[test-ggml-threading] test fallback: ok\n\n"); + } + } + + // customized node runner + { + printf("[test-ggml-threading] test customized node runner ...\n"); + ++n_tests; + + if (test_customized_node_runner(&node) == 0) { + ++n_passed; + printf("[test-ggml-threading] test customized node runner: ok\n\n"); } } diff --git a/tests/test-ggml-tune.c b/tests/test-ggml-tune.c index 5499fa6bf1d82..4339881e52c2d 100644 --- a/tests/test-ggml-tune.c +++ b/tests/test-ggml-tune.c @@ -72,7 +72,9 @@ static int bench(void) { // GGML_FTYPE_ALL_F32, // GGML_FTYPE_MOSTLY_F16, GGML_FTYPE_MOSTLY_Q4_0, +#if defined(GGML_USE_K_QUANTS) GGML_FTYPE_MOSTLY_Q4_K, +#endif }; int n_ftypes = sizeof(ftypes) / sizeof(ftypes[0]); @@ -132,16 +134,14 @@ int estimate_time_non_zero_NK(void) { int time[3]; // 3 profiles. }; - struct ggml_mulmat_tune tune = { - .version = 1, - .ftype = GGML_FTYPE_MOSTLY_Q4_0, - }; + struct ggml_mulmat_tune tune; + enum ggml_ftype ftype = GGML_FTYPE_MOSTLY_Q4_0; const int m_num = 2; const int n_threads = 1; // useless. struct ggml_mulmat_tune_params params; - init_params(¶ms, tune.ftype, m_num, n_threads); + init_params(¶ms, ftype, m_num, n_threads); ggml_mulmat_tune_init(&tune, ¶ms, ggml_task_profiles_mock_qxx_provider); From 06b00827a04a304cee1b2f5ca540a5a55223b9bb Mon Sep 17 00:00:00 2001 From: mqy Date: Sun, 18 Jun 2023 12:29:16 +0800 Subject: [PATCH 10/24] bulk refactoring task profile and related to run CL GPU offloading. * removed ggml_task_backend, infavour of ggml_task_profile.runner and newly added id and name. * extracted mul_mat blas codes into ggml_compute_forward_mul_mat_blas, thus align with CUDA/CL a bit more and make it easier to fix profile and run tune. * rewrote task profile and update/add some cuda/cl codes, finnaly made CL GPU offloading work. * misc minor fix/update to tune, the data format was changed. --- examples/mulmat-tune/README.md | 45 +- examples/mulmat-tune/mulmat-tune.cpp | 5 + ggml-cuda.cu | 25 +- ggml-cuda.h | 2 +- ggml-opencl.cpp | 17 +- ggml-opencl.h | 2 +- ggml-threading.c | 17 +- ggml-threading.h | 10 +- ggml-tune.c | 165 ++--- ggml-tune.h | 9 +- ggml.c | 962 +++++++++++++-------------- ggml.h | 30 +- llama.cpp | 73 +- tests/test-ggml-threading.c | 45 +- tests/test-ggml-tune.c | 21 +- 15 files changed, 664 insertions(+), 764 deletions(-) diff --git a/examples/mulmat-tune/README.md b/examples/mulmat-tune/README.md index df023757a85b1..4e521211d968e 100644 --- a/examples/mulmat-tune/README.md +++ b/examples/mulmat-tune/README.md @@ -214,26 +214,19 @@ The following results are generated with Accelerate compiled. **Example** ``` -5 3B 2 6 1 - -3200 3200 2 0 3 10 -16 0 0 0 16 1 0 1 0 0 0 0 -16 1 0 2 17 0 1 0 0 0 0 0 - 0 0 0 0 34 0 1 0 0 0 0 0 - 1 1 793 0 9103 2102 0 0 6014 0 - 2 2 1591 0 8034 2305 0 0 30982 0 - 4 4 2236 0 6476 2484 0 0 31388 0 - 8 7 4161 0 6623 2389 0 0 29204 0 - 16 15 8339 0 6434 2752 0 0 34303 0 - 32 32 16919 0 6915 3651 0 0 42511 0 - 64 200 34270 0 6574 4528 0 0 68212 0 - 128 188 69400 0 6325 6839 0 0 74437 0 - 256 303 134597 0 6168 11544 0 0 110180 0 - 512 687 279685 0 6337 29712 0 0 159728 0 - -3200 8640 2 0 2 10 - - ... +[tune] done, elapsed time: 0 seconds. 
+10 xB 12 4 2 + +1024 1024 12 0 2 4 +100 110 000 1 CPU +110 101 000 2 BLAS + 1 11 309 0 1234 90 0 + 2 23 654 0 1359 215 0 + 4 44 1283 0 1362 421 0 + 8 85 2341 0 1357 347 0 + +1024 2048 12 0 2 4 +... ``` @@ -249,17 +242,17 @@ shape+ # head version: 1 model: "3B" | "7B" | "13B" | "30B" | "65B" -ggml_ftype: 0 - 4, 7 - 14 +ggml_ftype: 0 - 3, 7 - 14 n_shapes: number of shapes n_threads: number of threads -shape := N K m_num n_profiles -task_conf_profile+ +shape := N K src0_ggml_type src1_ggml_type n_profiles m_num +task_profile+ bench_item+ -task_conf_profile: stage_conf(init) stage_conf(compute) stage_conf(finalize) -stage_conf: backend parallel wait -backend: 0 (NONE) | 16 (CPU) | 17 (CPU_BLAS) | 32 (GPU) | 33 (GPU_CUDA) | 34 (GPU_CL) +task_profile: stage_conf(init) stage_conf(compute) stage_conf(finalize) id name +stage_conf(bitmap): valid parallel wait +valid: 0 (false) | 1 (true) parallel: 0 (false) | 1 (true) wait: 0 (false) | 1 (true) diff --git a/examples/mulmat-tune/mulmat-tune.cpp b/examples/mulmat-tune/mulmat-tune.cpp index da1d0a1c1fe7e..ba1cc0f8a1fac 100644 --- a/examples/mulmat-tune/mulmat-tune.cpp +++ b/examples/mulmat-tune/mulmat-tune.cpp @@ -111,6 +111,11 @@ static void usage(char *prog) { } int main(int argc, char **argv) { + if (!ggml_cpu_has_blas()) { + fprintf(stderr, "error: this program is not built with BLAS.\n"); + return 1; + } + if (argc == 2) { if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) { usage(argv[0]); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index cf52109bce96e..5a4c7725a92de 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2207,17 +2207,12 @@ void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true); } -bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - +// NOTE: don't check matrix size, otherwise mul_mat tune will fail to run. 
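+// (the tune bench runs every profile across small M values, which the old
+// "ne0 >= 32 && ne1 >= 32 && ne10 >= 32" guard would have rejected)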
+static bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { // TODO: find the optimal values for these if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32 && - dst->type == GGML_TYPE_F32 && - (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { + dst->type == GGML_TYPE_F32) { return true; } @@ -2539,11 +2534,17 @@ void ggml_cuda_free_scratch() { g_scratch_buffer = nullptr; } -bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){ - ggml_cuda_func_t func; - const bool any_on_device = tensor->backend == GGML_BACKEND_GPU +bool ggml_cuda_is_gpu_offloading(struct ggml_tensor * tensor) { + GGML_ASSERT(tensor); + GGML_ASSERT(tensor->src0); + return tensor->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU); +} + +bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){ + ggml_cuda_func_t func; + const bool any_on_device = is_gpu_offloading(tensor); switch (tensor->op) { case GGML_OP_ADD: @@ -2571,7 +2572,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ func = ggml_cuda_rms_norm; break; case GGML_OP_MUL_MAT: - if (!any_on_device/* && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)*/) { + if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) { return false; } func = ggml_cuda_mul_mat; diff --git a/ggml-cuda.h b/ggml-cuda.h index d32b4484267ab..75ea94392ce6b 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -16,7 +16,7 @@ void ggml_init_cublas(void); void ggml_cuda_set_tensor_split(const float * tensor_split); void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +bool ggml_cuda_is_gpu_offloading(const struct ggml_tensor * src0); size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 2a1a04fcaccf1..28098793df296 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1589,18 +1589,17 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * } } +bool ggml_cl_is_gpu_offloading(struct ggml_tensor * tensor) { + GGML_ASSERT(tensor); + return (tensor->src0 && tensor->src0->backend == GGML_BACKEND_GPU) || + (tensor->src1 && tensor->src1->backend == GGML_BACKEND_GPU); +} -bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - - // TODO: find the optimal values for these +// NOTE: don't check matrix size, otherwise mul_mat tune will fail to run. 
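+// (same reasoning as the equivalent change in ggml-cuda.cu)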
+static bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32 && - dst->type == GGML_TYPE_F32 /*&& - ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)*/) { + dst->type == GGML_TYPE_F32) { return true; } diff --git a/ggml-opencl.h b/ggml-opencl.h index a92b445c9d766..1de12f55a5c95 100644 --- a/ggml-opencl.h +++ b/ggml-opencl.h @@ -9,7 +9,7 @@ extern "C" { void ggml_cl_init(void); void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +bool ggml_cl_is_gpu_offloading(struct ggml_tensor * tensor); size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); diff --git a/ggml-threading.c b/ggml-threading.c index 7ef763c0f81e1..dada9f3fe8466 100644 --- a/ggml-threading.c +++ b/ggml-threading.c @@ -376,7 +376,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) { struct ggml_compute_state_shared *shared = state->shared; GGML_ASSERT(shared); - GGML_ASSERT(shared->task_runner); + //GGML_ASSERT(shared->task_runner); shared->n_ready++; @@ -397,7 +397,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) { : shared->task_runner; enum ggml_compute_error err = runner(&state->params, state->node); - GGML_ASSERT(err == GGML_COMPUTE_OK); + GGML_ASSERT(err == GGML_COMPUTE_OK || err == GGML_COMPUTE_FALLBACK); ggml_spin_lock(&shared->spin); @@ -430,7 +430,7 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, size_t wsize) { GGML_ASSERT(ctx); GGML_ASSERT(node); - GGML_ASSERT(ctx->shared.task_runner); + // GGML_ASSERT(ctx->shared.task_runner); ggml_task_runner *runner = ctx->shared.task_runner; if (node->task_profile.runner) { @@ -448,7 +448,7 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, memset(¶ms, 0, sizeof(struct ggml_compute_params)); for (int type = GGML_TASK_INIT; type <= GGML_TASK_FINALIZE; type++) { - if (node->task_profile.stages[type].backend == GGML_TASK_BACKEND_NONE) { + if (!node->task_profile.stages[type].valid) { continue; } @@ -519,18 +519,17 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, if (err == GGML_COMPUTE_FALLBACK) { PRINT_DEBUG("[main] fallback from profile, id=%d\n", node->task_profile.id); - GGML_ASSERT(node->task_profile.stages[1].backend > - GGML_TASK_BACKEND_CPU); + GGML_ASSERT(node->task_profile.id > 1); struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; int n = ggml_get_task_profiles(node, profiles); GGML_ASSERT(n > 0); - GGML_ASSERT(profiles[0].stages[1].backend == - GGML_TASK_BACKEND_CPU); + GGML_ASSERT(profiles[0].id == 1); memcpy(&node->task_profile, &profiles[0], - sizeof(struct ggml_task_profile)); + sizeof(struct ggml_task_profile)); runner = ctx->shared.task_runner; + GGML_ASSERT(runner); goto START; } diff --git a/ggml-threading.h b/ggml-threading.h index 189fc2ed56a69..81192450c6728 100644 --- a/ggml-threading.h +++ b/ggml-threading.h @@ -29,7 +29,9 @@ typedef ggml_thread_ret_t(ggml_threading_thread_runner)(void *data); // thread: optional OS thread runner, default value: // 
`ggml_threading_graph_compute_thread`. // -// features: optional for configure +// task_runner: default task runner, nullable wheen tensor.runner is not NULL. +// Overridden by tensor.runner. +// features: configure threading behaviour, optional. // threading additional features. see `ggml_threading_feature`, default 0. // // stages_time: optional for collecting per-stage wall clock time. @@ -51,12 +53,6 @@ enum ggml_compute_error ggml_threading_compute_tensor(struct ggml_threading_context *ctx, struct ggml_tensor *node, void *wdata, size_t wsize); - -// This is an experimental functionality for mulmat tune, as a thin wrapper. -enum ggml_compute_error -ggml_compute_forward_wrapper(const struct ggml_compute_params *params, - struct ggml_tensor *tensor); - #ifdef __cplusplus } #endif diff --git a/ggml-tune.c b/ggml-tune.c index aeb63e957da2d..444269ae4f55a 100644 --- a/ggml-tune.c +++ b/ggml-tune.c @@ -24,26 +24,7 @@ static uint64_t ggml_mulmat_tune_cache_hash(int M, int N, int K) { return hash; } -static const char * -ggml_mulmat_tune_task_backend_name(enum ggml_task_backend backend) { - switch (backend) { - case GGML_TASK_BACKEND_NONE: - return ""; - case GGML_TASK_BACKEND_CPU: - return "CPU"; - case GGML_TASK_BACKEND_CPU_BLAS: - return "BLAS"; - case GGML_TASK_BACKEND_GPU: - return "GPU"; - case GGML_TASK_BACKEND_GPU_CUDA: - return "CUDA"; - case GGML_TASK_BACKEND_GPU_CL: - return "CL"; - default: - GGML_ASSERT(false); - } -} - +// Return profile id, -1 when failed (such as unable to match shape). // NOTE: we can not use the profile from tune because the profiles do not // contain fields such as runner, get_size. int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M, @@ -101,20 +82,15 @@ int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M, e->K = K; #ifndef GGML_TUNE_NDEBUG - const char *names[3]; - for (int i = 0; i < 3; i++) { - names[i] = ggml_mulmat_tune_task_backend_name( - prof->stages[i].backend); - } printf("\n[tune] M: %3d, N: %5d, K: %5d, profile id: %d, " "backends: %s %s %s\n", - M, N, K, prof->id, names[0], names[1], names[2]); + M, N, K, prof->id, prof->name); #endif } } } - return prof->id; + return prof ? prof->id : -1; } void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model, @@ -283,25 +259,24 @@ static bool ggml_mulmat_tune_write_profiles( int rc; for (int i = 0; i < n_profiles; i++) { const struct ggml_task_profile *profile = &profiles[i]; - rc = fprintf(fp, "%d ", profile->id); - if (rc <= 0) { - return false; - } - for (int j = 0; j < 3; j++) { const struct ggml_task_stage *ts = &profile->stages[j]; - rc = fprintf(fp, "%2d %d %d", ts->backend, ts->parallel ? 1 : 0, - ts->wait ? 1 : 0); + rc = fprintf(fp, "%1d%1d%1d", ts->valid ? 1 : 0, + ts->parallel ? 1 : 0, ts->wait ? 
1 : 0); if (rc <= 0) { return false; } if (j < 2) { - rc = fprintf(fp, " "); + rc = fprintf(fp, " "); if (rc <= 0) { return false; } } } + rc = fprintf(fp, " %d %s", profile->id, profile->name); + if (rc <= 0) { + return false; + } rc = fprintf(fp, "\n"); if (rc <= 0) { return false; @@ -407,24 +382,24 @@ bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune, return ok; } -bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { +int ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { GGML_ASSERT(tune); memset(tune, 0, sizeof(struct ggml_mulmat_tune)); int rc = fscanf(fp, "%d", &tune->version); if (rc <= 0) { - return false; + return 1; } if (tune->version != GGML_MULMAT_TUNE_VERSION) { fprintf(stderr, "[tune] version mismatch, run bench again\n"); - return false; + return 2; } rc = fscanf(fp, "%s %d %d %d", tune->model, (int *)&tune->ftype, &tune->n_shapes, &tune->n_threads); if (rc <= 0) { - return false; + return 3; } for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { @@ -434,7 +409,7 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { (int *)&shape->src0_type, (int *)&shape->src1_type, &shape->n_profiles, &shape->m_num); if (rc <= 0) { - return false; + return 4; } { @@ -451,24 +426,24 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { for (int ip = 0; ip < shape->n_profiles; ip++) { struct ggml_task_profile *profile = &shape->profiles[ip]; - rc = fscanf(fp, "%d ", &profile->id); - if (rc <= 0) { - return false; - } - for (int j = 0; j < 3; j++) { struct ggml_task_stage *ts = &profile->stages[j]; - int backend; + int valid; int parallel; int wait; - rc = fscanf(fp, "%d %d %d", &backend, ¶llel, &wait); + rc = fscanf(fp, " %1d%1d%1d", &valid, ¶llel, &wait); if (rc <= 0) { - return false; + return 5; } - ts->backend = (enum ggml_task_backend)backend; + ts->valid = valid ? true : false; ts->parallel = parallel ? true : false; ts->wait = wait ? 
true : false; } + + rc = fscanf(fp, "%d %s", &profile->id, profile->name); + if (rc <= 0) { + return 6; + } } for (int i_m = 0; i_m < shape->m_num; i_m++) { @@ -477,7 +452,7 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { if (ip == 0) { rc = fscanf(fp, "%d", &M); if (rc <= 0) { - return false; + return 7; } } struct ggml_mulmat_tune_m *item = @@ -486,13 +461,13 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { rc = fscanf(fp, "%d %d %d", &item->stages_time[0], &item->stages_time[1], &item->stages_time[2]); if (rc <= 0) { - return false; + return 8; } } } } - return true; + return 0; } bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, @@ -535,7 +510,7 @@ bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, const struct ggml_task_profile *profile = &shape->profiles[ip]; for (int k = 0; k < 3; k++) { - if (profile->stages[k].backend != GGML_TASK_BACKEND_NONE) { + if (profile->stages[k].valid) { rc = fprintf(fp, "%9d", item->stages_time[k]); if (rc <= 0) { return false; @@ -562,8 +537,6 @@ const struct ggml_mulmat_tune_shape * ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, const int N, const int K, enum ggml_type src0_type, enum ggml_type src1_type) { - GGML_ASSERT(N > 0 && K > 0); - for (int i = 0; i < tune->n_shapes; i++) { const struct ggml_mulmat_tune_shape *s = &tune->shapes[i]; if (s->src0_type != src0_type || s->src1_type != src1_type) { @@ -574,13 +547,17 @@ ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, const int N, if (s->N == N && s->K == K) { return s; } - } else if (s->N > 0 && s->K == 0) { - if (s->N == N) { - return s; - } - } else if (s->N == 0 && s->K > 0) { - if (s->K == K) { - return s; + } + + if (GGML_MULMAT_N_SHAPES == 6) { + if (s->N > 0 && s->K == 0) { + if (s->N == N) { + return s; + } + } else if (s->N == 0 && s->K > 0) { + if (s->K == K) { + return s; + } } } } @@ -639,7 +616,7 @@ void ggml_mulmat_tune_estimate_time( for (int i_stage = 0; i_stage < 3; i_stage++) { const struct ggml_task_stage *stage = &profile->stages[i_stage]; - if (stage->backend == GGML_TASK_BACKEND_NONE) { + if (!stage->valid) { continue; } @@ -784,23 +761,6 @@ static size_t ggml_mulmat_allocate_wdata(int N, int K, char **wdata) { return sz; } -int ggml_mulmat_tune_get_builtin_task_backends( - enum ggml_task_backend *backends) { - int i = 0; - backends[i++] = GGML_TASK_BACKEND_CPU; - - if (ggml_cpu_has_cpublas()) { - backends[i++] = GGML_TASK_BACKEND_CPU_BLAS; - } - - if (ggml_cpu_has_cublas()) { - backends[i++] = GGML_TASK_BACKEND_GPU_CUDA; - } else if (ggml_cpu_has_clblast()) { - backends[i++] = GGML_TASK_BACKEND_GPU_CL; - } - return i; -} - bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, struct ggml_mulmat_tune_params *params) { GGML_ASSERT(tune); @@ -809,23 +769,6 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, memset(tune, 0, sizeof(struct ggml_mulmat_tune)); - enum ggml_task_backend backends[16]; - int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends); - if (n_backends < 2) { - fprintf(stderr, - "[tune] error: this program was not built with BLAS.\n"); - return false; - } - - if (params->model.ftype >= GGML_FTYPE_MOSTLY_Q2_K && - params->model.ftype <= GGML_FTYPE_MOSTLY_Q6_K) { -#if defined(GGML_USE_CLBLAST) - printf("[tune] error: cl implementation does not support k_quants at " - "the time of writing this code, skip.\n"); - return false; -#endif - } - bool ok = ggml_mulmat_tune_init(tune, params, ggml_get_task_profiles); 
if (!ok) { return false; @@ -835,12 +778,13 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, char buf[128] = {0}; int offset = 0; - for (int i = 0; i < n_backends; i++) { + for (int i = 0; i < tune->shapes[0].n_profiles; i++) { if (i > 0) { buf[offset++] = ','; buf[offset++] = ' '; } - const char *name = ggml_mulmat_tune_task_backend_name(backends[i]); + const char *name = tune->shapes[0].profiles[i].name; + GGML_ASSERT(name != NULL && strcmp(name, "") != 0); size_t len = strlen(name); memcpy(&buf[offset], name, len); offset += (int)len; @@ -848,17 +792,17 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, fprintf(stdout, "[tune] model: %s, ggml ftype: %d, " - "n_pass: %d, n_threads: %d, n_shapes: %d, backends: %s\n", + "n_pass: %d, n_shapes: %d, n_threads: %d, profiles: %s\n", params->model.name, params->model.ftype, params->n_pass, - params->n_threads, tune->n_shapes, buf); + tune->n_shapes, params->n_threads, buf); } int64_t stages_time[3]; int64_t t0 = ggml_time_ms(); - struct ggml_threading_context *thrd_ctx = ggml_threading_start( - tune->n_threads, NULL, ggml_compute_forward_wrapper, - GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time); + struct ggml_threading_context *thrd_ctx = + ggml_threading_start(tune->n_threads, NULL, NULL, + GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time); for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape]; @@ -896,6 +840,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, for (int ip = 0; ip < shape->n_profiles; ip++) { const struct ggml_task_profile *profile = &shape->profiles[ip]; + // GGML_ASSERT(profile->runner); memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); @@ -911,9 +856,15 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, stages_time[j] = 0; } - enum ggml_compute_error err = ggml_threading_compute_tensor( - thrd_ctx, node, wdata, wsize); - GGML_ASSERT(err == GGML_COMPUTE_OK); + ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize); + + if (memcmp(profile, &node->task_profile, + sizeof(struct ggml_task_profile)) != 0) { + printf("[tune] error: task profile changed, tensor op: " + "%d, original id: %d, current id: %d\n", + node->op, profile->id, node->task_profile.id); + exit(1); + } for (int i = 0; i < 3; i++) { int v = (int)stages_time[i]; diff --git a/ggml-tune.h b/ggml-tune.h index 7955a50a977da..addcd34dbbd62 100644 --- a/ggml-tune.h +++ b/ggml-tune.h @@ -10,7 +10,7 @@ extern "C" { #endif -#define GGML_MULMAT_TUNE_VERSION 9 +#define GGML_MULMAT_TUNE_VERSION 10 #define GGML_MULMAT_N_SHAPES 4 #define GGML_MULMAT_CACHE_LEN 16 @@ -119,7 +119,7 @@ void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune); bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, FILE *fp); -bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp); +int ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp); const struct ggml_mulmat_tune_shape * ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, int N, int K, @@ -129,11 +129,6 @@ void ggml_mulmat_tune_estimate_time(const struct ggml_mulmat_tune_shape *shape, int M, struct ggml_mulmat_tune_time *profile_time); -const char *ggml_task_backend_name(enum ggml_task_backend backend); - -int ggml_mulmat_tune_get_builtin_task_backends( - enum ggml_task_backend *backends); - bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, struct ggml_mulmat_tune_params *params); diff --git a/ggml.c b/ggml.c index 
43ec93a64e2b2..62750b20bd127 100644 --- a/ggml.c +++ b/ggml.c @@ -8500,15 +8500,6 @@ static void ggml_compute_forward_mul_f32( const int ith = params->ith; const int nth = params->nth; -#ifdef GGML_USE_CLBLAST - if (src1->backend == GGML_BACKEND_GPU) { - if (ith == 0) { - ggml_cl_mul(src0, src1, dst); - } - return; - } -#endif - const int64_t nr = ggml_nrows(src0); const int64_t ne00 = src0->ne[0]; @@ -9933,6 +9924,168 @@ static void ggml_compute_forward_rms_norm_back( } } +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +static void ggml_compute_forward_mul_mat_blas( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + struct ggml_tensor * src0 = dst->src0; + struct ggml_tensor * src1 = dst->src1; + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + + const int nb00 = src0->nb[0]; + const int nb01 = src0->nb[1]; + const int nb02 = src0->nb[2]; + const int nb03 = src0->nb[3]; + + const int nb10 = src1->nb[0]; + // const int nb11 = src1->nb[1]; + const int nb12 = src1->nb[2]; + const int nb13 = src1->nb[3]; + + const int nb0 = dst->nb[0]; + const int nb1 = dst->nb[1]; + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne03 == ne13); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); + + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne3 == ne03); + + const int ith = params->ith; + const int nth = params->nth; + + if (src0->type == GGML_TYPE_F32) { + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(params->nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); + const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + ne11, ne01, ne10, + 1.0f, y, ne10, + x, ne00, + 0.0f, d, ne01); + } + } + return; + } else if (src0->type == GGML_TYPE_F16) { + // TODO: we don't support permuted src0 + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(params->nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + float * const wdata = params->wdata; + { + size_t id = 0; + for (int64_t i01 = 0; i01 < ne01; ++i01) { + for (int64_t i00 = 0; i00 < ne00; ++i00) { + wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00)); + } + } + + assert(id*sizeof(float) <= params->wsize); + } + + const float * x = wdata; + const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + + // zT = y * xT + 
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + ne11, ne01, ne10, + 1.0f, y, ne10, + x, ne00, + 0.0f, d, ne01); + } + } + return; + } else if (ggml_is_quantized(src0->type)) { + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[src0->type]); + GGML_ASSERT(src0->data); + GGML_ASSERT(params->wdata); + + float * const wdata = params->wdata; + dequantize_row_q_t const dequantize_row_q = quantize_fns[src0->type].dequantize_row_q; + + if (params->type == GGML_TASK_INIT) { + // rows per thread + const int dr = (ne01 + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + int ir1 = MIN(ir0 + dr, ne01); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + char * data0_offset = (char *) src0->data + i03*nb03 + i02*nb02; + float * wdata_offset = wdata + i03*ne03 + i02*ne02; + for (int64_t i = ir0; i < ir1; ++i) { + dequantize_row_q(data0_offset + i*nb01, wdata_offset + i*ne00, ne00); + } + } + } + return; + } + + GGML_ASSERT(nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const float * x = wdata; + const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + // zT = y * xT + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + ne11, ne01, ne10, + 1.0f, y, ne10, + x, ne00, + 0.0f, d, ne01); + } + } + return; + } else { + GGML_ASSERT(false); + } +} +#endif + // CPU only static void ggml_compute_forward_mul_mat_f32( const struct ggml_compute_params * params, @@ -9947,9 +10100,6 @@ static void ggml_compute_forward_mul_mat_f32( const int64_t ne02 = src0->ne[2]; const int64_t ne03 = src0->ne[3]; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - const int64_t ne10 = src1->ne[0]; -#endif const int64_t ne11 = src1->ne[1]; #ifndef NDEBUG const int64_t ne12 = src1->ne[2]; @@ -10004,37 +10154,7 @@ static void ggml_compute_forward_mul_mat_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; - GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); - - if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - GGML_ASSERT(params->nth == 1); - GGML_ASSERT(params->type == GGML_TASK_COMPUTE); - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); - } - } - //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); - - return; -#else - GGML_ASSERT(false); -#endif - } - GGML_ASSERT(params->type == GGML_TASK_COMPUTE); - GGML_ASSERT(comp_backend == GGML_TASK_BACKEND_CPU); // parallelize by src0 rows using ggml_vec_dot_f32 @@ -10152,57 +10272,6 @@ static void ggml_compute_forward_mul_mat_f16_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend; - enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; - - 
GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); - - if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - GGML_ASSERT(nb10 == sizeof(float)); - GGML_ASSERT(params->nth == 1); - GGML_ASSERT(params->type == GGML_TASK_COMPUTE); - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - float * const wdata = params->wdata; - { - size_t id = 0; - for (int64_t i01 = 0; i01 < ne01; ++i01) { - for (int64_t i00 = 0; i00 < ne00; ++i00) { - wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00)); - } - } - - assert(id*sizeof(float) <= params->wsize); - } - - const float * x = wdata; - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - // zT = y * xT - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); - } - } - - /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/ - - return; -#else - GGML_ASSERT(false); -#endif - } - - GGML_ASSERT(params->type == GGML_TASK_INIT || params->type == GGML_TASK_COMPUTE); - GGML_ASSERT(init_backend == GGML_TASK_BACKEND_CPU); - GGML_ASSERT(comp_backend == GGML_TASK_BACKEND_CPU); - if (params->type == GGML_TASK_INIT) { ggml_fp16_t * const wdata = params->wdata; @@ -10348,68 +10417,6 @@ static void ggml_compute_forward_mul_mat_q_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend; - enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; - GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); - - if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - GGML_ASSERT (init_backend == GGML_TASK_BACKEND_CPU); - GGML_ASSERT(params->type == GGML_TASK_INIT || params->type == GGML_TASK_COMPUTE); - GGML_ASSERT(src0->data); - GGML_ASSERT(params->wdata); - - float * const wdata = params->wdata; - dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; - - if (params->type == GGML_TASK_INIT) { - // rows per thread - const int dr = (ne01 + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - int ir1 = MIN(ir0 + dr, ne01); - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - char * data0_offset = (char *) src0->data + i03*nb03 + i02*nb02; - float * wdata_offset = wdata + i03*ne03 + i02*ne02; - for (int64_t i = ir0; i < ir1; ++i) { - dequantize_row_q(data0_offset + i*nb01, wdata_offset + i*ne00, ne00); - } - } - } - return; - } - - GGML_ASSERT(nth == 1); - GGML_ASSERT(params->type == GGML_TASK_COMPUTE); - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - // zT = y * xT - const float * x = wdata; - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); - } - } - - return; -#else - GGML_ASSERT(false); -#endif - } - - GGML_ASSERT(params->type == GGML_TASK_INIT || params->type == GGML_TASK_COMPUTE); - GGML_ASSERT(init_backend == GGML_TASK_BACKEND_CPU); - GGML_ASSERT(comp_backend == 
GGML_TASK_BACKEND_CPU); - if (params->type == GGML_TASK_INIT) { GGML_ASSERT(params->nth == 1); @@ -14257,6 +14264,7 @@ static void ggml_compute_forward_cross_entropy_loss_back( ///////////////////////////////// +// CPU only: no BLAS. static enum ggml_compute_error ggml_compute_forward(const struct ggml_compute_params * params, struct ggml_tensor * tensor) { GGML_ASSERT(params); @@ -15459,96 +15467,163 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg // ---- task profiles ---- -// TODO: replace with ggml_compute_forward_cuda +// Check the type and memeory layout for mul_mat on blas(CPU BLAS) +static bool ggml_mul_mat_check_type_mem(struct ggml_tensor *tensor) { + enum ggml_type src0_t = tensor->src0->type; + enum ggml_type src1_t = tensor->src1->type; + + // This is the minimal requirement to run mulmat with BLAS. + // Don't check matrix size because that would break tuning. + return (src0_t == GGML_TYPE_F32 || src0_t == GGML_TYPE_F16 || + ggml_is_quantized(src0_t)) && + src1_t == GGML_TYPE_F32 && tensor->type == GGML_TYPE_F32 && + ggml_is_contiguous(tensor->src0) && ggml_is_contiguous(tensor->src1); +} + // DO NOT check matrix size further. #if defined(GGML_USE_CUBLAS) -static enum ggml_compute_error ggml_compute_forward_cuda( - const struct ggml_compute_params * params, - struct ggml_tensor * tensor) { - GGML_ASSERT (ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)); +// Implements ggml_task_runner. +static enum ggml_compute_error +ggml_compute_forward_cuda(const struct ggml_compute_params *params, + struct ggml_tensor *tensor) { + if (tensor->op == GGML_OP_MUL_MAT) { + GGML_ASSERT(ggml_mul_mat_check_type_mem(tensor)); + } + if (ggml_cuda_compute_forward(params, tensor)) { return GGML_COMPUTE_OK; } + GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU); - GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src1 == NULL || + tensor->src1->backend == GGML_BACKEND_CPU); + return GGML_COMPUTE_FALLBACK; } -#endif +#endif // GGML_USE_CUBLAS -// TODO: replace with ggml_cl_mul_mat. -// DO NOT check matrix size further. #if defined(GGML_USE_CLBLAST) -static enum ggml_compute_error ggml_compute_forward_cl( - const struct ggml_compute_params * params, - struct ggml_tensor * tensor) { +// Implements ggml_task_runner. +static enum ggml_compute_error +ggml_compute_forward_cl(const struct ggml_compute_params *params, + struct ggml_tensor *tensor) { switch (tensor->op) { - case GGML_OP_MUL_MAT: - GGML_ASSERT(ggml_cl_can_mul_mat(tensor->src0, tensor->src1, tensor)); - ggml_cl_mul_mat(tensor->src0, tensor->src1, tensor, params->wdata, params->wsize); - return GGML_COMPUTE_OK; - default: - break; + case GGML_OP_MUL: { + if (tensor->src1 && ggml_cl_is_gpu_offloading(tensor)) { + if (params->ith == 0) { + ggml_cl_mul(tensor->src0, tensor->src1, tensor); + return GGML_COMPUTE_OK; + } + } + } break; + case GGML_OP_MUL_MAT: { + GGML_ASSERT(ggml_mul_mat_check_type_mem(tensor)); + ggml_cl_mul_mat(tensor->src0, tensor->src1, tensor, params->wdata, + params->wsize); + return GGML_COMPUTE_OK; + } break; + default: { + } break; } - GGML_ASSERT(false); + return GGML_COMPUTE_FALLBACK; } -static int ggml_compute_forward_get_wsize_cl (struct ggml_tensor *tensor) { - switch (tensor->op) { - case GGML_OP_MUL_MAT: - return ggml_cl_mul_mat_get_wsize(tensor->src0, tensor->src1, tensor); - default: - break; - } - return -1; +// Implements ggml_task_wsize_getter. 
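+// A negative return value means "no GPU scratch estimate here": the graph
+// compute loop then falls back to its CPU-based wsize calculation for this node.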
+static int ggml_compute_forward_cl_get_wsize(struct ggml_tensor *tensor) { + switch (tensor->op) { + case GGML_OP_MUL_MAT: + return ggml_cl_mul_mat_get_wsize(tensor->src0, tensor->src1, tensor); + default: + break; + } + return -1; } -#endif +#endif // GGML_USE_CLBLAST + +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +// Implements ggml_task_runner. +static enum ggml_compute_error +ggml_compute_forward_blas(const struct ggml_compute_params *params, + struct ggml_tensor *tensor) { + switch (tensor->op) { + case GGML_OP_MUL_MAT: { + GGML_ASSERT(ggml_mul_mat_check_type_mem(tensor)); + ggml_compute_forward_mul_mat_blas(params, tensor); + return GGML_COMPUTE_OK; + } break; + default: { + } break; + } -// The wrapper for external mulmat tune tool. -enum ggml_compute_error ggml_compute_forward_wrapper(const struct ggml_compute_params *params, - struct ggml_tensor *tensor) { - // We call ggml_compute_forward because the CUDA mul_mat entry point - // was moved out of `ggml_compute_forward_mul_mat`. - return ggml_compute_forward(params, tensor); + return GGML_COMPUTE_FALLBACK; } +// Implements ggml_task_wsize_getter. +static int ggml_compute_forward_blas_get_wsize(struct ggml_tensor *tensor) { + switch (tensor->op) { + case GGML_OP_MUL_MAT: { + GGML_ASSERT(tensor->src1->type == GGML_TYPE_F32); + enum ggml_type src0_t = tensor->src0->type; + + if (src0_t == GGML_TYPE_F16) { + return GGML_TYPE_SIZE[GGML_TYPE_F32] * + (tensor->src0->ne[0] * tensor->src0->ne[1]); + } else if (src0_t == GGML_TYPE_F32) { + return 0; + } else if (ggml_is_quantized(src0_t)) { + return GGML_TYPE_SIZE[GGML_TYPE_F32] * + (tensor->src0->ne[0] * tensor->src0->ne[1]); + } else { + GGML_ASSERT(false); + } + } break; + default: + break; + } + return -1; +} +#endif // GGML_USE_ACCELERATE | GGML_USE_OPENBLAS + // Implement `ggml_task_profiles_provider`. -// Fill `profiles` for the `node` and return number of profiles. +// Fill `profiles` for the `tensor` and return number of profiles. // -// NOTE: the node may be incompleted from testing or tunning, so please assert +// NOTE: the tensor may be incompleted from testing or tunning, so please assert // everything used here. // -// TODO: configure cuda for none mul_mat nodes. +// First profile is always CPU, followed by BLAS, CUDA/CL. 
int ggml_get_task_profiles( - struct ggml_tensor *node, + struct ggml_tensor *tensor, struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]) { - GGML_ASSERT(node); - GGML_ASSERT(node->op >= 0); + + GGML_ASSERT(tensor); + GGML_ASSERT(tensor->op >= 0); GGML_ASSERT(profiles); memset(profiles, 0, sizeof(struct ggml_task_profile) * GGML_MAX_TASK_PROFILES); struct ggml_task_profile *p = profiles; - int n_profiles = 0; - switch (node->op) { + int n_profiles = 1; + strcpy(p[0].name, "CPU"); + p[0].runner = ggml_compute_forward; + // p[0].wsize_getter = ...; + + switch (tensor->op) { case GGML_OP_CPY: case GGML_OP_DUP: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + p[0].stages[1].valid = true; } break; case GGML_OP_ADD: case GGML_OP_ADD1: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_ACC: { - p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[0].valid = true; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_SUB: case GGML_OP_DIV: @@ -15565,13 +15640,11 @@ int ggml_get_task_profiles( case GGML_OP_NEG: case GGML_OP_STEP: case GGML_OP_RELU: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + p[0].stages[1].valid = true; } break; case GGML_OP_MUL: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_GELU: case GGML_OP_SILU: @@ -15579,69 +15652,32 @@ int ggml_get_task_profiles( case GGML_OP_NORM: case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM_BACK: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_MUL_MAT: - case GGML_OP_OUT_PROD: { - // CPU only profiles. - // CUDA/CL: see end of function. - GGML_ASSERT(node->src0); - GGML_ASSERT(node->src1); - - enum ggml_type src0_t = node->src0->type; - enum ggml_type src1_t = node->src1->type; - - GGML_ASSERT(src1_t == GGML_TYPE_F32); - - int i = 0; + case GGML_OP_OUT_PROD: { // FIXME: is this correct? 
+ enum ggml_type src0_t = tensor->src0->type; if (src0_t == GGML_TYPE_F32) { - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].parallel = true; - i++; - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; - p[i].stages[1].wait = true; - i++; -#endif + p[0].stages[1].valid = true; + p[0].stages[1].parallel = true; } else if (src0_t == GGML_TYPE_F16) { - p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].parallel = true; - i++; - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; - p[i].stages[1].wait = true; - i++; -#endif + p[0].stages[0].valid = true; + p[0].stages[1].valid = true; + p[0].stages[1].parallel = true; } else if (ggml_is_quantized(src0_t)) { - p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].parallel = true; - i++; - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[0].parallel = true; - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; - p[i].stages[1].wait = true; - i++; -#endif + p[0].stages[0].valid = true; + p[0].stages[1].valid = true; + p[0].stages[1].parallel = true; } - n_profiles = i; } break; case GGML_OP_SCALE: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_SET: { - p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + p[0].stages[0].valid = true; + p[0].stages[1].valid = true; } break; case GGML_OP_CONT: case GGML_OP_RESHAPE: @@ -15652,64 +15688,53 @@ int ggml_get_task_profiles( case GGML_OP_GET_ROWS_BACK: case GGML_OP_DIAG: case GGML_OP_DIAG_MASK_ZERO: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + p[0].stages[1].valid = true; } break; case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX_BACK: case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_ALIBI: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + p[0].stages[1].valid = true; } break; case GGML_OP_CLAMP: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + p[0].stages[1].valid = true; } break; case GGML_OP_CONV_1D_1S: case GGML_OP_CONV_1D_2S: { - p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[0].valid = true; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_FLASH_ATTN: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_FLASH_FF: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } case GGML_OP_FLASH_ATTN_BACK: { - p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[0].valid = true; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + 
p[0].stages[1].valid = true; } break; case GGML_OP_CROSS_ENTROPY_LOSS: - p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[0].valid = true; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - p[0].stages[2].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + p[0].stages[2].valid = true; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_NONE: case GGML_OP_COUNT: { @@ -15719,227 +15744,196 @@ int ggml_get_task_profiles( GGML_ASSERT(false); } -#if defined(GGML_USE_CUBLAS) - switch (node->op) { - case GGML_OP_ADD: - case GGML_OP_MUL: - case GGML_OP_SILU: - case GGML_OP_RMS_NORM: - case GGML_OP_MUL_MAT: - case GGML_OP_RESHAPE: - case GGML_OP_ROPE: { - int i = n_profiles; - p[i].runner = ggml_compute_forward_cuda; - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (tensor->op == GGML_OP_MUL_MAT) { + enum ggml_type src0_t = tensor->src0->type; + int i = n_profiles; + + strcpy(p[i].name, "BLAS"); + p[i].runner = ggml_compute_forward_blas; + p[i].wsize_getter = ggml_compute_forward_blas_get_wsize; + + if (src0_t == GGML_TYPE_F32) { + p[i].stages[1].valid = true; p[i].stages[1].wait = true; - ++n_profiles; - } break; - default: { - } break; + } else if (src0_t == GGML_TYPE_F16) { + p[i].stages[1].valid = true; + p[i].stages[1].wait = true; + } else if (ggml_is_quantized(src0_t)) { + p[i].stages[0].valid = true; + p[i].stages[0].parallel = true; + p[i].stages[1].valid = true; + p[i].stages[1].wait = true; + } + ++n_profiles; + } +#endif + +#if defined(GGML_USE_CUBLAS) + if (true) { // FIXME: filter supported op to avoid unnecceary fallback. + int i = n_profiles; + strcpy(p[i].name, "CUDA"); + p[i].runner = ggml_compute_forward_cuda; + p[i].stages[1].valid = true; + p[i].stages[1].wait = true; + ++n_profiles; } #elif defined(GGML_USE_CLBLAST) - switch (node->op) { - case GGML_OP_MUL_MAT: { - int i = n_profiles; - p[i].runner = ggml_compute_forward_cl; - p[i].get_wsize = ggml_compute_forward_get_wsize_cl; - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; - p[i].stages[1].wait = true; - ++n_profiles; - } break; - default: { - } break; + if (tensor->op == GGML_OP_MUL || tensor->op == GGML_OP_MUL_MAT) { + int i = n_profiles; + strcpy(p[i].name, "CL"); + p[i].runner = ggml_compute_forward_cl; + p[i].wsize_getter = ggml_compute_forward_cl_get_wsize; + p[i].stages[1].valid = true; + p[i].stages[1].wait = true; + ++n_profiles; } #endif GGML_ASSERT(n_profiles > 0 && n_profiles <= GGML_MAX_TASK_PROFILES); - for (int i = 0; i < n_profiles; i++) { - profiles[i].id = i + 1; + + for (int j = 0; j < n_profiles; j++) { + profiles[j].id = j + 1; } + return n_profiles; } -// Set task profile for GGML_OP_MUL_MAT or GGML_OP_OUT_PROD. -static const struct ggml_task_profile *ggml_mulmat_get_task_profile( - struct ggml_tensor *node, struct ggml_task_profile *profiles, - int n_profiles, struct ggml_mulmat_tune *tune, int stages_time_us[3]) { +// Try to fix task profile for given tensor, because the task profile might not +// be the most performant. 
+static void ggml_optimize_tensor_task_profile( + struct ggml_tensor *tensor, struct ggml_task_profile *profiles, + int n_profiles, struct ggml_mulmat_tune *tune) { + + if (tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_OUT_PROD) { + return; + } + + GGML_ASSERT(tensor); + GGML_ASSERT(tensor->op == GGML_OP_MUL_MAT || + tensor->op == GGML_OP_OUT_PROD); + GGML_ASSERT(tensor->task_profile.id == n_profiles); - GGML_ASSERT(node); - GGML_ASSERT(node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_OUT_PROD); GGML_ASSERT(profiles); - GGML_ASSERT(n_profiles > 0); + GGML_ASSERT(n_profiles > 1); - enum ggml_type src0_t = node->src0->type; - enum ggml_type src1_t = node->src1->type; + int M = (int)tensor->ne[1]; + int N = (int)tensor->ne[0]; + int K = (int)tensor->src1->ne[0]; - // Type and memory layout requirements for computing mul_mat with BLAS. - bool cond_match = (src0_t == GGML_TYPE_F32 || src0_t == GGML_TYPE_F16 || - ggml_is_quantized(src0_t)) && - src1_t == GGML_TYPE_F32 && node->type == GGML_TYPE_F32 && - ggml_is_contiguous(node->src0) && - ggml_is_contiguous(node->src1); +#if defined(GGML_USE_TUNE) + if (tune != NULL && ggml_mul_mat_check_type_mem(tensor)) { + GGML_ASSERT(tensor->backend == 0 && tensor->src0->backend == 0 && + tensor->src1->backend == 0); - int M = (int)node->ne[1]; - int N = (int)node->ne[0]; - int K = (int)node->src1->ne[0]; + GGML_ASSERT(n_profiles >= 2); - const struct ggml_task_profile *prof = NULL; + enum ggml_type src0_t = tensor->src0->type; + enum ggml_type src1_t = tensor->src1->type; + + int stages_time_us[3]; + + int id = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, + src1_t, stages_time_us); + if (id > 0) { + struct ggml_task_profile *prof = NULL; - if (cond_match) { -#if defined(GGML_USE_TUNE) - if (tune != NULL) { - GGML_ASSERT(n_profiles >= 2); - int id = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, - src1_t, stages_time_us); for (int i = 0; i < n_profiles; i++) { if (profiles[i].id == id) { prof = &profiles[i]; - return prof; + break; } } - } -#else - UNUSED(tune); - UNUSED(stages_time_us); -#endif - if (prof == NULL && M >= 32 && N >= 32 && K >= 32) { - for (int j = 0; j < n_profiles; j++) { - enum ggml_task_backend comp_be = - profiles[j].stages[GGML_TASK_COMPUTE].backend; - switch (comp_be) { - case GGML_TASK_BACKEND_GPU_CUDA: { - GGML_ASSERT(ggml_cpu_has_cublas()); - prof = &profiles[j]; - break; - } - case GGML_TASK_BACKEND_GPU_CL: { - GGML_ASSERT(ggml_cpu_has_clblast()); - prof = &profiles[j]; - break; - } - case GGML_TASK_BACKEND_CPU_BLAS: { - GGML_ASSERT(ggml_cpu_has_cpublas()); - prof = &profiles[j]; - break; - } - default: { - break; - } - } + if (prof) { + memcpy(&tensor->task_profile, prof, + sizeof(struct ggml_task_profile)); - if (prof) { - break; + // Do not wait if the estimated execution time is too small + // (e.g. less than 0.1 ms) + // TODO: need bench actual wait/notify time, see + // ggml-threading.c + for (int j = 0; j < 3; j++) { + if (tensor->task_profile.stages[j].wait) { + if (stages_time_us[j] < 100) { + tensor->task_profile.stages[j].wait = false; + } + } } + return; } } } +#else + UNUSED(tune); +#endif - if (prof == NULL) { - prof = &profiles[0]; - GGML_ASSERT(prof->stages[1].backend == GGML_TASK_BACKEND_CPU); - } + // Guess the optimal matrix size. 
+ bool size_match = (M >= 32 && N >= 32 && K >= 32); + UNUSED(size_match); - return prof; -} + for (int i = n_profiles - 1; i >= 0; --i) { + const char *name = profiles[i].name; -void ggml_graph_compute_set_tensor_task_proile(struct ggml_tensor *node, - struct ggml_cgraph *cgraph) { - // Pre-specified. - for (int i = 0; i < 3; i++) { - if (node->task_profile.stages[i].backend > 0) { - return; + if (strcmp(name, "CUDA") == 0) { +#if defined(GGML_USE_CUBLAS) + if ((size_match || ggml_cuda_is_gpu_offloading(tensor)) && + ggml_mul_mat_check_type_mem(tensor)) { + memcpy(&tensor->task_profile, &profiles[i], + sizeof(struct ggml_task_profile)); + return; + } +#endif } - } - struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; - int n_profiles = ggml_get_task_profiles(node, profiles); - - const struct ggml_task_profile *profile = NULL; - - // GPU offloading. A special case of pre-specified task_profile. - if (node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT) { - if (node->op != GGML_OP_MUL_MAT && node->op != GGML_OP_OUT_PROD) { - enum ggml_task_backend be; - if (ggml_cpu_has_cublas()) { - be = GGML_TASK_BACKEND_GPU_CUDA; - } else if (ggml_cpu_has_clblast()) { - be = GGML_TASK_BACKEND_GPU_CL; - } else { - GGML_ASSERT(false); + if (strcmp(name, "CL") == 0) { +#if defined(GGML_USE_CLBLAST) + if ((size_match || ggml_cl_is_gpu_offloading(tensor)) && + ggml_mul_mat_check_type_mem(tensor)) { + memcpy(&tensor->task_profile, &profiles[i], + sizeof(struct ggml_task_profile)); + return; } +#endif + } - for (int j = 0; j < n_profiles; j++) { - if (profiles[j].stages[1].backend == be) { - profile = &profiles[j]; - break; - } + if (strcmp(name, "BLAS") == 0) { +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (size_match && ggml_mul_mat_check_type_mem(tensor)) { + memcpy(&tensor->task_profile, &profiles[0], + sizeof(struct ggml_task_profile)); + return; } - GGML_ASSERT(profile); - GGML_ASSERT(!cgraph->tune); - - memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); - return; +#endif } } - // mul_mat: GGML_OP_MUL_MAT and GGML_OP_OUT_PROD. - if (node->op == GGML_OP_MUL_MAT) { -#if defined(GGML_USE_TUNE) - GGML_ASSERT(node->backend == GGML_BACKEND_CPU); - - int stages_time_us[3]; - profile = ggml_mulmat_get_task_profile(node, profiles, n_profiles, - cgraph->tune, stages_time_us); - GGML_ASSERT(profile); + memcpy(&tensor->task_profile, &profiles[0], + sizeof(struct ggml_task_profile)); +} - memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); +static void ggml_set_tensor_task_profile(struct ggml_tensor *tensor, + struct ggml_mulmat_tune *tune) { + struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; + int n_profiles = ggml_get_task_profiles(tensor, profiles); + GGML_ASSERT(n_profiles > 0); - if (cgraph->tune) { - memcpy(&node->task_profile, profile, - sizeof(struct ggml_task_profile)); + // By default use profile with the largest id. + // Profile id starts from 1. + memcpy(&tensor->task_profile, &profiles[n_profiles - 1], + sizeof(struct ggml_task_profile)); - // Do not wait if the estimated execution time is too small - // (e.g. 
less than 0.1 ms) - // TODO: need bench actual wait/notify time, see - // ggml-threading.c - for (int j = 0; j < 3; j++) { - if (node->task_profile.stages[j].wait) { - if (stages_time_us[j] < 100) { - node->task_profile.stages[j].wait = false; - } - } - } - } - return; -#else - profile = ggml_mulmat_get_task_profile(node, profiles, n_profiles, NULL, - NULL); - GGML_ASSERT(profile); - memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); - return; -#endif - } else if (node->op == GGML_OP_OUT_PROD) { // FIXME: is this correct? - profile = ggml_mulmat_get_task_profile(node, profiles, n_profiles, NULL, - NULL); - GGML_ASSERT(profile); - memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); - return; + if (n_profiles > 1) { + GGML_ASSERT(tensor->task_profile.id > 1); + ggml_optimize_tensor_task_profile(tensor, profiles, n_profiles, tune); } - // default. - profile = &profiles[0]; - GGML_ASSERT(profile->stages[1].backend == GGML_TASK_BACKEND_CPU); - memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); + GGML_ASSERT(tensor->task_profile.id > 0); } void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { int n_threads = cgraph->n_threads; - struct ggml_threading_context *thrd_ctx = ggml_threading_start( - n_threads, NULL, ggml_compute_forward, - GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL); - // initialize tasks + work buffer { // int64_t t0 = ggml_time_us(); @@ -15952,25 +15946,26 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) GGML_ASSERT (node->op != GGML_OP_NONE); - struct ggml_task_stage *stages = node->task_profile.stages; + if (node->task_profile.id == 0) { + ggml_set_tensor_task_profile(node, cgraph->tune); + } - ggml_graph_compute_set_tensor_task_proile(node, cgraph); + struct ggml_task_stage *stages = node->task_profile.stages; // // Allocate temp buffer `wdata` for CPU. // NOTE: GPU MAY fallback to CPU, so we have to cover all possible cases. // - if (node->task_profile.get_wsize) { - int sz = node->task_profile.get_wsize(node); + if (node->task_profile.wsize_getter) { + int sz = node->task_profile.wsize_getter(node); if (sz >= 0) { work_size = MAX(work_size, (size_t)sz); + // FIXME: is it safe to continue in case fallback? continue; } } - //printf("op: %d, comp backend: %d\n", node->op, node->task_profile.stages[1].backend); - // compute stage n_tasks. int n_tasks = stages[1].parallel ? n_threads : 1; @@ -16034,35 +16029,17 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_OUT_PROD: // FIXME: is this correct? 
{ size_t cur = 0; - enum ggml_task_backend comp_backend = stages[GGML_TASK_COMPUTE].backend; - GGML_ASSERT(comp_backend != GGML_TASK_BACKEND_NONE); - if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { - GGML_ASSERT(ggml_cpu_has_cpublas()); - GGML_ASSERT(node->src1->type == GGML_TYPE_F32); - - if (node->src0->type == GGML_TYPE_F32) { - cur = 0; - } else if (node->src0->type == GGML_TYPE_F16) { - // here we need memory just for single 2D matrix from src0 - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } else if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } else { - GGML_ASSERT(false); - } - } else { // CPU or GPU fallback - GGML_ASSERT(node->src1->type == GGML_TYPE_F32); - - if (node->src0->type == GGML_TYPE_F32) { - cur = 0; - } else if (node->src0->type == GGML_TYPE_F16) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); - } else if (ggml_is_quantized(node->src0->type)) { - const enum ggml_type type_q = quantize_fns[node->src0->type].vec_dot_type; - cur = GGML_TYPE_SIZE[type_q]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[type_q]; - } else { - GGML_ASSERT(false); - } + GGML_ASSERT(node->src1->type == GGML_TYPE_F32); + + if (node->src0->type == GGML_TYPE_F32) { + cur = 0; + } else if (node->src0->type == GGML_TYPE_F16) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); + } else if (ggml_is_quantized(node->src0->type)) { + const enum ggml_type type_q = quantize_fns[node->src0->type].vec_dot_type; + cur = GGML_TYPE_SIZE[type_q]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[type_q]; + } else { + GGML_ASSERT(false); } work_size = MAX(work_size, cur); @@ -16218,6 +16195,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) const int64_t perf_start_cycles = ggml_perf_cycles(); const int64_t perf_start_time_us = ggml_perf_time_us(); + struct ggml_threading_context *thrd_ctx = ggml_threading_start(n_threads, + NULL, ggml_compute_forward, GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL); + for (int i = 0; i < cgraph->n_nodes; i++) { GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes); diff --git a/ggml.h b/ggml.h index d4d5d3521c74f..554645ba8b6d5 100644 --- a/ggml.h +++ b/ggml.h @@ -362,29 +362,10 @@ extern "C" { static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); - // As part of task config profile solution, `ggml_task_backend` defines - // backends for each task stage. Similar to `ggml_tensor.backend`, - // `ggml_tensor.task_profile` generalizes how to configure tensor computing - // at per task-stage level. - // - // The following enum values are designed as combination of hardware and - // optional software interface. - enum ggml_task_backend { - GGML_TASK_BACKEND_NONE = 0, - - // [0x10, 0x1F]: CPU - GGML_TASK_BACKEND_CPU = 0x10, - GGML_TASK_BACKEND_CPU_BLAS = 0x11, - - // [0x20 - 0x2F]: GPU - GGML_TASK_BACKEND_GPU = 0x20, - GGML_TASK_BACKEND_GPU_CUDA = 0x21, - GGML_TASK_BACKEND_GPU_CL = 0x22, - }; - // config for computing one of the 3 task stages of a tensor. struct ggml_task_stage { - enum ggml_task_backend backend; + bool valid; + bool parallel; // hint idle workers go waiting, valid only when parallel is false. bool wait; @@ -407,13 +388,16 @@ extern "C" { // Get wsize for node computing. // When return -1: should be explained as `fallback to CPU`, caller MUST // determine how much memory to reserve for this node. 
- typedef int (ggml_task_get_wsize)(struct ggml_tensor *tensor); + typedef int (ggml_task_wsize_getter)(struct ggml_tensor *tensor); // config for computing a tensor. struct ggml_task_profile { // profile id, start from 1. int id; + // Required, not empty, no whitespaces. + char name[16]; + // index 0: INIT, 1: COMPUTE, 2: FINALIZE struct ggml_task_stage stages[3]; @@ -421,7 +405,7 @@ extern "C" { ggml_task_runner *runner; // Optional function to return required wsize for wdata. - ggml_task_get_wsize *get_wsize; + ggml_task_wsize_getter *wsize_getter; // Optional flag for development. // MUST be used only in testing codes. diff --git a/llama.cpp b/llama.cpp index 06555e1dddeb2..e6bddffd5edaa 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2744,8 +2744,9 @@ struct llama_context * llama_init_from_file( } #ifdef GGML_USE_TUNE -bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) { - GGML_ASSERT (ctx->model.n_gpu_layers == 0); +bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, + const char *fname) { + GGML_ASSERT(ctx->model.n_gpu_layers == 0); printf("\n"); @@ -2755,7 +2756,7 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons enum ggml_ftype ggml_ftype; switch (hparams->ftype) { - case LLAMA_FTYPE_ALL_F32: + case LLAMA_FTYPE_ALL_F32: ggml_ftype = GGML_FTYPE_ALL_F32; break; case LLAMA_FTYPE_MOSTLY_F16: @@ -2767,9 +2768,6 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons case LLAMA_FTYPE_MOSTLY_Q4_1: ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1; break; - case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: - ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1_SOME_F16; - break; case LLAMA_FTYPE_MOSTLY_Q5_0: ggml_ftype = GGML_FTYPE_MOSTLY_Q5_0; break; @@ -2799,8 +2797,8 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons ggml_ftype = GGML_FTYPE_MOSTLY_Q6_K; break; default: - throw std::runtime_error( - format("invalid output file type %d\n", hparams->ftype)); + fprintf(stderr, "[tune] unsupported file type %d\n", hparams->ftype); + return false; } int n_vocab = hparams->n_vocab; @@ -2808,30 +2806,36 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons int n_rot = hparams->n_rot; int n_mult = hparams->n_mult; - int n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult; + int n_ff = ((2 * (4 * n_embd) / 3 + n_mult - 1) / n_mult) * n_mult; struct ggml_mulmat_tune_params params = { - /*.model =*/ { - /* .name =*/ model_name, - /* .ftype =*/ ggml_ftype, - /* .n_vocab =*/ n_vocab, - /* .n_embd =*/ n_embd, - /* .n_ff =*/ n_ff, - /* .n_rot =*/ n_rot, + /*.model =*/{ + /* .name =*/model_name, + /* .ftype =*/ggml_ftype, + /* .n_vocab =*/n_vocab, + /* .n_embd =*/n_embd, + /* .n_ff =*/n_ff, + /* .n_rot =*/n_rot, }, - /* .m_num =*/ 8, - /* .n_pass =*/ 1, - /* .n_threads =*/ n_threads, - /* .prrogress =*/ true, - /* .output_console =*/ false, - /* .fname =*/ fname, + /* .m_num =*/8, + /* .n_pass =*/1, + /* .n_threads =*/n_threads, + /* .prrogress =*/true, + /* .output_console =*/false, + /* .fname =*/fname, }; bool empty_fname = !fname || strcmp(fname, "") == 0; - ctx->tune = new(struct ggml_mulmat_tune); + ctx->tune = new (struct ggml_mulmat_tune); if (!ctx->tune) { - throw std::runtime_error(format("failed to allocate memory for tune\n")); + fprintf(stderr, "[tune] failed to allocate memory for tune\n"); + return false; + } + + if (!ggml_cpu_has_blas()) { + fprintf(stderr, "[tune] this program is not built with BLAS, abort.\n"); + return false; } if (tune) { 
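(Aside, between hunks, for reviewers: a minimal caller-side sketch of the entry point patched above. It assumes a build with `GGML_USE_TUNE` where `llama_mulmat_tune` is visible from `llama.h`; the helper name, file path and error handling below are illustrative assumptions, not part of this patch.)

```c
// Hedged usage sketch (not part of the patch): enable mulmat tuning right
// after the llama context is created. Assumes GGML_USE_TUNE is defined.
#include <stdbool.h>
#include <stdio.h>
#include "llama.h"

static bool try_enable_mulmat_tune(struct llama_context * lctx, int n_threads) {
    // tune = true  : run the bench now (and save results to fname if given).
    // tune = false : only load a previously saved tune file.
    const char * fname = "mulmat-tune.txt"; // placeholder path

    if (!llama_mulmat_tune(lctx, n_threads, /*tune =*/ true, fname)) {
        fprintf(stderr, "[tune] bench/load failed, continuing without tune data\n");
        return false;
    }
    return true;
}
```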
@@ -2844,31 +2848,30 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons ggml_mulmat_tune_free(ctx->tune); return true; } - } else { - if (empty_fname) { - return false; - } + } else if (empty_fname) { + return false; } if (!empty_fname) { FILE *fp = fopen(fname, "r"); if (!fp) { - fprintf(stderr, "[tune] failed to open file %s.\n", - fname); + fprintf(stderr, "[tune] failed to open file %s.\n", fname); + return false; } else { - bool ok = ggml_mulmat_tune_read_data(ctx->tune, fp); + int rc = ggml_mulmat_tune_read_data(ctx->tune, fp); fclose(fp); - if (!ok) { + if (rc != 0) { fprintf(stderr, - "[tune] failed to read data from %s\n", - fname); + "[tune] failed to read data from %s, error code: %d\n", + fname, rc); return false; } fprintf(stderr, "[tune] loaded data from %s\n", fname); - ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype, params.n_threads); + bool ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype, + params.n_threads); if (!ok) { return false; } diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c index 2079fe144b194..e904fc10dcf67 100644 --- a/tests/test-ggml-threading.c +++ b/tests/test-ggml-threading.c @@ -41,9 +41,8 @@ static const int n_repeat = 10; // counter with array. static int work_done_arr[MAX_N_THREADS]; -static enum ggml_compute_error -mock_task_runner(const struct ggml_compute_params *params, - struct ggml_tensor *node) { +static enum ggml_compute_error mock_task_runner(const struct ggml_compute_params *params, + struct ggml_tensor *node) { int64_t loops = node->task_profile.dev_flags[1] * 1000 * 1000; if (node->task_profile.stages[params->type].parallel) { loops /= params->nth; @@ -80,20 +79,15 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) { int t0 = (int)ggml_time_us(); - struct ggml_threading_context *ctx = ggml_threading_start( - n_threads, NULL, mock_task_runner, features, /*stages_time*/ NULL); + node->task_profile.runner = mock_task_runner; + + struct ggml_threading_context *ctx = + ggml_threading_start(n_threads, NULL, NULL, features, /*stages_time*/ NULL); int t1 = (int)ggml_time_us(); for (int i = 0; i < n_repeat; i++) { - enum ggml_compute_error err = ggml_threading_compute_tensor( - ctx, node, /*wdata*/ NULL, /*wsize*/ 0); - if (err != GGML_COMPUTE_OK) { - ggml_threading_stop(ctx); - printf("ggml_threading_compute_tensor failed with error: %d.\n", - err); - return 1; - } + ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0); } int t2 = (int)ggml_time_us(); @@ -107,7 +101,7 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) { int expect = 0; for (int i = 0; i < 3; i++) { const struct ggml_task_stage *ts = &stages[i]; - if (ts->backend != GGML_TASK_BACKEND_NONE) { + if (ts->valid) { if (ts->parallel) { expect += n_threads; } else { @@ -144,14 +138,12 @@ static enum ggml_compute_error mock_task_runner_fallback(const struct ggml_compute_params *params, struct ggml_tensor *node) { UNUSED(params); - if (node->backend == GGML_BACKEND_GPU) { - // ... finally failed to compute in GPU. - node->backend = GGML_BACKEND_CPU; + // failed to run ... 
+ if (node->task_profile.id == 2) { return GGML_COMPUTE_FALLBACK; - } else { - return GGML_COMPUTE_OK; } + return GGML_COMPUTE_OK; } // By design, fallback should happen when attempt computing tensor in GPU, @@ -164,6 +156,9 @@ int test_fallback(struct ggml_tensor *node) { enum ggml_compute_error err = ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0); if (err == GGML_COMPUTE_FALLBACK) { + // mock setup new profile ... + node->task_profile.id = 1; + err = ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0); } @@ -214,12 +209,12 @@ int main(void) { struct ggml_tensor node; memset(&node, 0, sizeof(struct ggml_tensor)); + node.task_profile.runner = mock_task_runner; struct ggml_task_stage *stages = node.task_profile.stages; - stages[0].backend = GGML_TASK_BACKEND_CPU; - stages[1].backend = GGML_TASK_BACKEND_CPU; - stages[2].backend = GGML_TASK_BACKEND_NONE; + stages[0].valid = true; + stages[1].valid = true; int n_passed = 0; int n_tests = 0; @@ -277,7 +272,7 @@ int main(void) { struct ggml_threading_context *ctx = ggml_threading_start(n_threads, ggml_threading_graph_compute_thread, - mock_task_runner, 0, /*stages_time*/ NULL); + NULL, 0, /*stages_time*/ NULL); int t1 = (int)ggml_time_us(); @@ -416,8 +411,8 @@ int main(void) { node.src0 = &src0; node.src1 = &src1; - node.backend = GGML_BACKEND_GPU; - stages[1].backend = GGML_TASK_BACKEND_GPU; + node.task_profile.id = 2; + stages[1].valid = true; if (test_fallback(&node) == 0) { ++n_passed; printf("[test-ggml-threading] test fallback: ok\n\n"); diff --git a/tests/test-ggml-tune.c b/tests/test-ggml-tune.c index 4339881e52c2d..97fd6cfbfd07b 100644 --- a/tests/test-ggml-tune.c +++ b/tests/test-ggml-tune.c @@ -46,13 +46,9 @@ int main(void) { } static int bench(void) { - { - enum ggml_task_backend backends[16]; - int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends); - if (n_backends < 2) { - printf("[test-ggml-tune] skipped because no BLAS\n"); - return 0; - } + if (!ggml_cpu_has_blas()) { + printf("[test-ggml-tune] skipped because no BLAS\n"); + return 0; } { @@ -118,10 +114,13 @@ static int ggml_task_profiles_mock_qxx_provider(struct ggml_tensor *node, struct ggml_task_profile *profiles) { UNUSED(node); - profiles[0].stages[0].backend = GGML_TASK_BACKEND_CPU; - profiles[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - profiles[1].stages[0].backend = GGML_TASK_BACKEND_CPU; - profiles[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; + profiles[0].id = 1; + profiles[0].stages[0].valid = true; + profiles[0].stages[1].valid = true; + + profiles[0].id = 2; + profiles[1].stages[0].valid = true; + profiles[1].stages[1].valid = true; return 2; } From 67bb3679625f8b5e0d2443bcc9cd5098d7b3c931 Mon Sep 17 00:00:00 2001 From: mqy Date: Sun, 18 Jun 2023 14:03:09 +0800 Subject: [PATCH 11/24] typos --- Makefile | 2 +- ggml-tune.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 531f62fb01347..8cd253b531d7f 100644 --- a/Makefile +++ b/Makefile @@ -237,7 +237,7 @@ k_quants.o: k_quants.c k_quants.h endif # LLAMA_NO_K_QUANTS ifndef LLAMA_NO_TUNE -CFLAGS += -DGGML_USE_TUNE -DGGML_TUNE_NDEBUG +CFLAGS += -DGGML_USE_TUNE #-DGGML_TUNE_NDEBUG CXXFLAGS += -DGGML_USE_TUNE endif diff --git a/ggml-tune.c b/ggml-tune.c index 444269ae4f55a..2e292e98e7bb6 100644 --- a/ggml-tune.c +++ b/ggml-tune.c @@ -83,7 +83,7 @@ int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M, #ifndef GGML_TUNE_NDEBUG printf("\n[tune] M: %3d, N: %5d, K: %5d, profile id: %d, " - 
"backends: %s %s %s\n", + "profile name: %s\n", M, N, K, prof->id, prof->name); #endif } From 2193ab6281c6c21c76608500ae573c1be7c0d0bb Mon Sep 17 00:00:00 2001 From: mqy Date: Sun, 18 Jun 2023 14:07:33 +0800 Subject: [PATCH 12/24] fix cuda build error --- ggml-cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 5a4c7725a92de..48c7c83dfb4ad 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2544,7 +2544,7 @@ bool ggml_cuda_is_gpu_offloading(struct ggml_tensor * tensor) { bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){ ggml_cuda_func_t func; - const bool any_on_device = is_gpu_offloading(tensor); + const bool any_on_device = ggml_cuda_is_gpu_offloading(tensor); switch (tensor->op) { case GGML_OP_ADD: From 0ec4dab8649c00519df9f04838e28ca2cafb0e57 Mon Sep 17 00:00:00 2001 From: mqy Date: Sun, 18 Jun 2023 14:59:44 +0800 Subject: [PATCH 13/24] fixed break and asssion from select; try fix cuda link error --- ggml-cuda.h | 2 +- ggml-opencl.cpp | 2 +- ggml-opencl.h | 2 +- ggml-threading.c | 8 ++++---- ggml.c | 11 ++++++++--- 5 files changed, 15 insertions(+), 10 deletions(-) diff --git a/ggml-cuda.h b/ggml-cuda.h index 75ea94392ce6b..70bd65e227bf2 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -15,8 +15,8 @@ struct ggml_tensor_extra_gpu { void ggml_init_cublas(void); void ggml_cuda_set_tensor_split(const float * tensor_split); +bool ggml_cuda_is_gpu_offloading(struct ggml_tensor * tensor); void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -bool ggml_cuda_is_gpu_offloading(const struct ggml_tensor * src0); size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 28098793df296..3ed9d1adb0c2b 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1589,7 +1589,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * } } -bool ggml_cl_is_gpu_offloading(struct ggml_tensor * tensor) { +bool ggml_cl_is_gpu_offloading(const struct ggml_tensor * tensor) { GGML_ASSERT(tensor); return (tensor->src0 && tensor->src0->backend == GGML_BACKEND_GPU) || (tensor->src1 && tensor->src1->backend == GGML_BACKEND_GPU); diff --git a/ggml-opencl.h b/ggml-opencl.h index 1de12f55a5c95..6d815bbf07294 100644 --- a/ggml-opencl.h +++ b/ggml-opencl.h @@ -8,8 +8,8 @@ extern "C" { void ggml_cl_init(void); +bool ggml_cl_is_gpu_offloading(const struct ggml_tensor * tensor); void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -bool ggml_cl_is_gpu_offloading(struct ggml_tensor * tensor); size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); diff --git a/ggml-threading.c b/ggml-threading.c index dada9f3fe8466..882639666ab05 100644 --- a/ggml-threading.c +++ b/ggml-threading.c @@ -142,7 +142,7 @@ static int sched_yield(void) { #endif struct ggml_perf_stats { - int runs; + atomic_int runs; // total cycles atomic_int cycles; @@ -211,9 +211,9 @@ static inline void ggml_spin_unlock(volatile atomic_flag *obj) { static inline 
void ggml_perf_collect(struct ggml_perf_stats *st, int64_t c0, int64_t t0) { - st->runs++; - st->cycles += (ggml_cycles() - c0); - st->time_us += (ggml_time_us() - t0); + atomic_fetch_add(&st->runs, 1); + atomic_fetch_add(&st->cycles, (int)(ggml_cycles() - c0)); + atomic_fetch_add(&st->time_us, (int)(ggml_time_us() - t0)); } // A worker thread goes cond waiting. diff --git a/ggml.c b/ggml.c index 62750b20bd127..5a9e0b33e8cea 100644 --- a/ggml.c +++ b/ggml.c @@ -15468,7 +15468,7 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg // ---- task profiles ---- // Check the type and memeory layout for mul_mat on blas(CPU BLAS) -static bool ggml_mul_mat_check_type_mem(struct ggml_tensor *tensor) { +static bool ggml_mul_mat_check_type_mem(const struct ggml_tensor *tensor) { enum ggml_type src0_t = tensor->src0->type; enum ggml_type src1_t = tensor->src1->type; @@ -15669,6 +15669,8 @@ int ggml_get_task_profiles( p[0].stages[0].valid = true; p[0].stages[1].valid = true; p[0].stages[1].parallel = true; + } else { + GGML_ASSERT(false); } } break; case GGML_OP_SCALE: { @@ -15717,7 +15719,7 @@ int ggml_get_task_profiles( case GGML_OP_FLASH_FF: { p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - } + } break; case GGML_OP_FLASH_ATTN_BACK: { p[0].stages[0].valid = true; p[0].stages[1].valid = true; @@ -15727,11 +15729,12 @@ int ggml_get_task_profiles( case GGML_OP_MAP_BINARY: { p[0].stages[1].valid = true; } break; - case GGML_OP_CROSS_ENTROPY_LOSS: + case GGML_OP_CROSS_ENTROPY_LOSS: { p[0].stages[0].valid = true; p[0].stages[1].valid = true; p[0].stages[1].parallel = true; p[0].stages[2].valid = true; + } break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { p[0].stages[1].valid = true; p[0].stages[1].parallel = true; @@ -15764,6 +15767,8 @@ int ggml_get_task_profiles( p[i].stages[0].parallel = true; p[i].stages[1].valid = true; p[i].stages[1].wait = true; + } else { + GGML_ASSERT(false); } ++n_profiles; } From 5abb8aefea8c9c9cc175448ff568f43788ae3718 Mon Sep 17 00:00:00 2001 From: mqy Date: Sun, 18 Jun 2023 18:55:44 +0800 Subject: [PATCH 14/24] fix warning --- ggml-cuda.cu | 2 +- ggml-cuda.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 48c7c83dfb4ad..e31d494b2ac52 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2542,7 +2542,7 @@ bool ggml_cuda_is_gpu_offloading(struct ggml_tensor * tensor) { || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU); } -bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){ +bool ggml_cuda_compute_forward(const struct ggml_compute_params * params, struct ggml_tensor * tensor){ ggml_cuda_func_t func; const bool any_on_device = ggml_cuda_is_gpu_offloading(tensor); diff --git a/ggml-cuda.h b/ggml-cuda.h index 70bd65e227bf2..efb5bb38f2806 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -32,7 +32,7 @@ void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor); void ggml_cuda_set_main_device(int main_device); void ggml_cuda_set_scratch_size(size_t scratch_size); void ggml_cuda_free_scratch(void); -bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); +bool ggml_cuda_compute_forward(const struct ggml_compute_params * params, struct ggml_tensor * tensor); #ifdef __cplusplus } From 5feefb32b3e332a411c2c618d38bfe5f98dbde13 Mon Sep 17 00:00:00 2001 From: mqy Date: Sun, 18 Jun 2023 18:57:33 +0800 Subject: [PATCH 15/24] threading: add suspend/resume APIs, so it's possible to run 
a thread pool at session level --- ggml-threading.c | 81 ++++++++++++------- ggml-threading.h | 12 ++- tests/test-ggml-threading.c | 151 +++++++++++++++++++++++++++++++++--- 3 files changed, 204 insertions(+), 40 deletions(-) diff --git a/ggml-threading.c b/ggml-threading.c index 882639666ab05..fb02b40469116 100644 --- a/ggml-threading.c +++ b/ggml-threading.c @@ -194,6 +194,8 @@ struct ggml_threading_context { struct ggml_perf_stats wait_perf; struct ggml_perf_stats wakeup_perf; + atomic_bool suspending; + int64_t *stages_time; }; @@ -252,6 +254,30 @@ static void ggml_threading_cond_wait(struct ggml_compute_state *state) { } } +// Suspend +void ggml_threading_suspend(struct ggml_threading_context *ctx) { + if (ctx->n_threads == 1) { + return; + } + + struct ggml_compute_state_shared *shared = &ctx->shared; + + ggml_spin_lock(&shared->spin); + ctx->shared.wait_now = true; + ggml_spin_unlock(&shared->spin); + + const int n_worker_threads = ctx->n_threads - 1; + + while (ctx->shared.n_waiting != n_worker_threads) { + ggml_spin_pause(); + } + + ggml_spin_lock(&shared->spin); + ctx->suspending = true; + ggml_spin_unlock(&shared->spin); + PRINT_DEBUG("[main] saw %d workers waiting\n", n_worker_threads); +} + // Wakeup all workers. // // Workers takes some time to wakeup, and has to lock spin after wakeup. Yield @@ -259,8 +285,14 @@ static void ggml_threading_cond_wait(struct ggml_compute_state *state) { // experimental. See tests/test-ggml-threading.c for details. // // NOTE: must be protected by shared->spin -static void -ggml_threading_wakeup_workers(struct ggml_compute_state_shared *shared) { +void ggml_threading_resume(struct ggml_threading_context *ctx) { + if (ctx->n_threads == 1) { + return; + } + + struct ggml_compute_state_shared *shared = &ctx->shared; + ggml_spin_lock(&shared->spin); + int64_t perf_cycles_0 = 0; int64_t perf_time_0 = 0; @@ -269,12 +301,11 @@ ggml_threading_wakeup_workers(struct ggml_compute_state_shared *shared) { perf_time_0 = ggml_time_us(); } - shared->wait_now = false; - int loop_counter = 0; - int notify_counter = 0; int64_t last_signal_time = 0; + shared->wait_now = false; + while (shared->n_waiting != 0) { ggml_spin_unlock(&shared->spin); @@ -294,22 +325,23 @@ ggml_threading_wakeup_workers(struct ggml_compute_state_shared *shared) { GGML_ASSERT(pthread_mutex_lock(&shared->mutex) == 0); GGML_ASSERT(pthread_cond_broadcast(&shared->cond) == 0); GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0); - ++notify_counter; last_signal_time = ggml_time_us(); ggml_spin_lock(&shared->spin); } + ctx->suspending = false; + if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) { ggml_perf_collect(&shared->ctx->wakeup_perf, perf_cycles_0, perf_time_0); } - // if (notify_counter > 1) { - // printf("%s: loop counter: %d, notify counter: %d\n", __func__, - // loop_counter, notify_counter); - // } - UNUSED(notify_counter); + ggml_spin_unlock(&shared->spin); +} + +bool ggml_threading_is_suspending(struct ggml_threading_context *ctx) { + return ctx->suspending; } // Setup workers for a task stage. 
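(Aside, between hunks, for reviewers: the suspend/resume API introduced above is meant to let a caller keep one thread pool alive for a whole session instead of starting and stopping workers per graph. A hedged sketch of that lifecycle follows, using only functions declared in `ggml-threading.h`; the task runner is a placeholder and the tensor is assumed to already carry a valid task profile.)

```c
// Hedged sketch of the intended session-level lifecycle: start the pool once,
// park the workers while the session is idle, wake them for the next graph,
// and join them at shutdown.
#include "ggml.h"
#include "ggml-threading.h"

// Placeholder runner: real code would dispatch on node->op.
static enum ggml_compute_error
my_task_runner(const struct ggml_compute_params * params, struct ggml_tensor * node) {
    (void) params; (void) node;
    return GGML_COMPUTE_OK;
}

static void session_sketch(struct ggml_tensor * node, int n_threads) {
    struct ggml_threading_context * tctx = ggml_threading_start(
        n_threads, /*thread_runner =*/ NULL, my_task_runner,
        GGML_THREADING_FEATURE_WAIT_ON_DONE, /*stages_time =*/ NULL);

    // Compute one or more tensors with the shared pool.
    ggml_threading_compute_tensor(tctx, node, /*wdata =*/ NULL, /*wsize =*/ 0);

    ggml_threading_suspend(tctx); // park workers while the session is idle
    ggml_threading_resume(tctx);  // wake them before the next graph

    ggml_threading_stop(tctx);    // join workers and free the context
}
```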
@@ -329,7 +361,9 @@ static void ggml_threading_setup_workers(struct ggml_threading_context *ctx, if (current->parallel) { if (shared->n_waiting > 0) { - ggml_threading_wakeup_workers(shared); + ggml_spin_unlock(&shared->spin); + ggml_threading_resume(ctx); + ggml_spin_lock(&shared->spin); } if ((ctx->features & GGML_THREADING_FEATURE_WAIT_ON_DONE) > 0) { @@ -351,17 +385,11 @@ static void ggml_threading_setup_workers(struct ggml_threading_context *ctx, } } else if (current->wait) { if (shared->n_waiting < n_worker_threads) { - shared->wait_now = true; - PRINT_DEBUG("[main] wait_now was set, expect %d workers wait\n", + PRINT_DEBUG("[main] wait_now will be set, expect %d workers wait\n", n_worker_threads); - ggml_spin_unlock(&shared->spin); - - while (shared->n_waiting != n_worker_threads) { - ggml_spin_pause(); - } - - ggml_spin_lock(&shared->spin); - PRINT_DEBUG("[main] saw %d workers waiting\n", n_worker_threads); + ggml_spin_unlock(&ctx->shared.spin); + ggml_threading_suspend(ctx); + ggml_spin_lock(&ctx->shared.spin); } } @@ -376,7 +404,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) { struct ggml_compute_state_shared *shared = state->shared; GGML_ASSERT(shared); - //GGML_ASSERT(shared->task_runner); + // GGML_ASSERT(shared->task_runner); shared->n_ready++; @@ -527,7 +555,7 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, GGML_ASSERT(profiles[0].id == 1); memcpy(&node->task_profile, &profiles[0], - sizeof(struct ggml_task_profile)); + sizeof(struct ggml_task_profile)); runner = ctx->shared.task_runner; GGML_ASSERT(runner); @@ -572,6 +600,7 @@ ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread_runner, ctx->n_threads = n_threads; ctx->features = features; ctx->stages_time = stages_time; + ctx->suspending = false; int n_workers = n_threads - 1; if (n_workers > 0) { @@ -633,9 +662,7 @@ void ggml_threading_stop(struct ggml_threading_context *ctx) { PRINT_DEBUG("[main] stopping thread pool ...\n"); ctx->shared.stop = true; - ggml_spin_lock(&ctx->shared.spin); - ggml_threading_wakeup_workers(&ctx->shared); - ggml_spin_unlock(&ctx->shared.spin); + ggml_threading_resume(ctx); for (int j = 0; j < ctx->n_threads - 1; j++) { GGML_ASSERT(pthread_join(ctx->workers[j].thrd, NULL) == 0); diff --git a/ggml-threading.h b/ggml-threading.h index 81192450c6728..012c9cd504308 100644 --- a/ggml-threading.h +++ b/ggml-threading.h @@ -25,11 +25,12 @@ enum ggml_threading_features { typedef ggml_thread_ret_t(ggml_threading_thread_runner)(void *data); // Init and start underlying workers if n_threads > 1. +// n_threads: number of threads (including caller) involving in computing tasks. // // thread: optional OS thread runner, default value: // `ggml_threading_graph_compute_thread`. // -// task_runner: default task runner, nullable wheen tensor.runner is not NULL. +// task_runner: default task runner, nullable when tensor.runner is not NULL. // Overridden by tensor.runner. // features: configure threading behaviour, optional. // threading additional features. see `ggml_threading_feature`, default 0. @@ -41,9 +42,18 @@ ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread, enum ggml_threading_features features, int64_t stages_time[3]); +// Suspend worker threads. +void ggml_threading_suspend(struct ggml_threading_context *ctx); + +// Resume worker threads. +void ggml_threading_resume(struct ggml_threading_context *ctx); + // Stop workers (if exist), free memories (including the ctx). 
void ggml_threading_stop(struct ggml_threading_context *ctx); +// Is all worker threads suspending? +bool ggml_threading_is_suspending(struct ggml_threading_context *ctx); + // The default implementation of `ggml_threading_thread_runner` ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data); diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c index e904fc10dcf67..f941f4dc3c372 100644 --- a/tests/test-ggml-threading.c +++ b/tests/test-ggml-threading.c @@ -41,8 +41,9 @@ static const int n_repeat = 10; // counter with array. static int work_done_arr[MAX_N_THREADS]; -static enum ggml_compute_error mock_task_runner(const struct ggml_compute_params *params, - struct ggml_tensor *node) { +static enum ggml_compute_error +mock_task_runner(const struct ggml_compute_params *params, + struct ggml_tensor *node) { int64_t loops = node->task_profile.dev_flags[1] * 1000 * 1000; if (node->task_profile.stages[params->type].parallel) { loops /= params->nth; @@ -59,7 +60,7 @@ static enum ggml_compute_error mock_task_runner(const struct ggml_compute_params return GGML_COMPUTE_OK; } -int test_driver(int id, struct ggml_tensor *node, int n_threads) { +static int test_driver(int id, struct ggml_tensor *node, int n_threads) { uint8_t loops = node->task_profile.dev_flags[1]; printf( "\n[test-ggml-threading] #%02d, workload: %2d million(s), n_threads: " @@ -81,8 +82,8 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) { node->task_profile.runner = mock_task_runner; - struct ggml_threading_context *ctx = - ggml_threading_start(n_threads, NULL, NULL, features, /*stages_time*/ NULL); + struct ggml_threading_context *ctx = ggml_threading_start( + n_threads, NULL, NULL, features, /*stages_time*/ NULL); int t1 = (int)ggml_time_us(); @@ -148,7 +149,7 @@ mock_task_runner_fallback(const struct ggml_compute_params *params, // By design, fallback should happen when attempt computing tensor in GPU, // thus it is not parallelled. -int test_fallback(struct ggml_tensor *node) { +static int test_fallback(struct ggml_tensor *node) { struct ggml_threading_context *ctx = ggml_threading_start( 1, NULL, mock_task_runner_fallback, /*features*/ GGML_THREADING_FEATURE_NONE, /*stages_time*/ NULL); @@ -182,7 +183,7 @@ customized_node_runner(const struct ggml_compute_params *params, } // Test when node->task_profile.runner is not NULL. -int test_customized_node_runner(struct ggml_tensor *node) { +static int test_customized_node_runner(struct ggml_tensor *node) { struct ggml_threading_context *ctx = ggml_threading_start( 1, NULL, mock_task_runner, /*features*/ GGML_THREADING_FEATURE_NONE, /*stages_time*/ NULL); @@ -204,6 +205,121 @@ int test_customized_node_runner(struct ggml_tensor *node) { return 0; } +static enum ggml_compute_error +lifecycle_runner(const struct ggml_compute_params *params, + struct ggml_tensor *node) { + UNUSED(params); + UNUSED(node); + return GGML_COMPUTE_OK; +} + +// Test thread lifecycle: start -> suspend -> resume -> stop +static int test_lifecycle(void) { + struct ggml_tensor node; + memset(&node, 0, sizeof(struct ggml_tensor)); + + struct ggml_task_stage *stages = node.task_profile.stages; + + stages[0].valid = true; + stages[1].valid = true; + stages[1].parallel = true; + + node.op = GGML_OP_MUL_MAT; + struct ggml_tensor src0 = { + .type = GGML_TYPE_Q4_0, + }; + struct ggml_tensor src1 = { + .type = GGML_TYPE_F32, + }; + node.src0 = &src0; + node.src1 = &src1; + + int t0 = (int)ggml_time_ms(); + // Suppose creating threading when entering session. 
+ + // We have to try affable threads. + struct ggml_threading_context *ctx = NULL; + int threads_arr[] = {4, 2}; + int threads_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]); + int n_threads = 1; + + for (int i = 0; i < threads_arr_len; i++) { + n_threads = threads_arr[i]; + int start_time = (int)ggml_time_ms(); + ctx = ggml_threading_start( + n_threads, NULL, lifecycle_runner, + /*features*/ GGML_THREADING_FEATURE_WAIT_ON_DONE | + GGML_THREADING_FEATURE_PERF, + /*stages_time*/ NULL); + int elapsed = (int)ggml_time_ms() - start_time; + if (elapsed > 5 * n_threads) { + printf("[test-ggml-threading] %s: it took %d ms to start %d worker " + "thread(s), skip\n", + __func__, elapsed, n_threads - 1); + ggml_threading_stop(ctx); + } else { + break; + } + } + + if (n_threads == 1) { + printf("[test-ggml-threading] %s: too slow to start at least 1 worker " + "thread(s), skip\n", + __func__); + return 0; + } + + // Suppose exiting from previous compute graph ... + printf("[test-ggml-threading] %s: %d workers, suspending ...\n", __func__, + n_threads - 1); + ggml_threading_suspend(ctx); + + // Suppose entering new compute graph ... + printf("[test-ggml-threading] test lifecycle: resuming ...\n"); + ggml_threading_resume(ctx); + + const int m = 2; + const int n = 50; + + printf("[test-ggml-threading] %s: computing %d tensors (half wait)...\n", + __func__, m * n); + + for (int i = 0; i < m; i++) { + stages[0].wait = (i == 0); + for (int j = 0; j < n; j++) { + ggml_threading_compute_tensor(ctx, &node, /*wdata*/ NULL, + /*wsize*/ 0); + } + } + + printf("[test-ggml-threading] %s: compute done, resuming...\n", __func__); + ggml_threading_resume(ctx); + + const int loops = 100; + printf("[test-ggml-threading] %s: try %d loops of suspend-resume ...\n", + __func__, loops); + + for (int i = 0; i < loops; i++) { + ggml_threading_suspend(ctx); + if (!ggml_threading_is_suspending(ctx)) { + abort(); + } + + ggml_threading_resume(ctx); + if (ggml_threading_is_suspending(ctx)) { + abort(); + } + } + + printf("[test-ggml-threading] %s: stopping ...\n", __func__); + ggml_threading_stop(ctx); + + int elapsed_ms = (int)ggml_time_ms() - t0; + printf("[test-ggml-threading] %s: elapsed %d ms\n", __func__, elapsed_ms); + + return 0; +} + int main(void) { ggml_time_init(); @@ -268,21 +384,21 @@ int main(void) { } // skip this n_threads when too slow. - int t0 = (int)ggml_time_us(); + int t0 = (int)ggml_time_ms(); struct ggml_threading_context *ctx = ggml_threading_start(n_threads, ggml_threading_graph_compute_thread, NULL, 0, /*stages_time*/ NULL); - int t1 = (int)ggml_time_us(); + int t1 = (int)ggml_time_ms(); ggml_threading_stop(ctx); - int elapsed_us = t1 - t0; - if (elapsed_us > 500 * n_threads) { + int elapsed_ms = t1 - t0; + if (elapsed_ms > 5 * n_threads) { printf("[test-ggml-threading] warning: it took took %7.3f " "ms to start %2d worker thread(s). Too slow, skip.\n", - 1.0 * elapsed_us / 1000, n_threads - 1); + 1.0 * elapsed_ms, n_threads - 1); threads_arr[i] = 0; ++n_slow; } else { @@ -430,6 +546,17 @@ int main(void) { } } + // lifecycle. + { + printf("[test-ggml-threading] test lifecycle ...\n"); + ++n_tests; + + if (test_lifecycle() == 0) { + ++n_passed; + printf("[test-ggml-threading] test lifecycle: ok\n\n"); + } + } + printf("[test-ggml-threading] %d/%d passed.\n", n_passed, n_tests); return (n_passed == n_tests) ? 
0 : 1; From 286c5b30143c7540d947811d37199afde50819a3 Mon Sep 17 00:00:00 2001 From: mqy Date: Sun, 18 Jun 2023 20:01:58 +0800 Subject: [PATCH 16/24] threadng: remove unnecessary spin lock/unlock from suspend/resume; add more tests --- ggml-threading.c | 22 ++++------------------ tests/test-ggml-threading.c | 25 +++++++++++++++---------- 2 files changed, 19 insertions(+), 28 deletions(-) diff --git a/ggml-threading.c b/ggml-threading.c index fb02b40469116..2a5cfa096dd4b 100644 --- a/ggml-threading.c +++ b/ggml-threading.c @@ -260,22 +260,17 @@ void ggml_threading_suspend(struct ggml_threading_context *ctx) { return; } - struct ggml_compute_state_shared *shared = &ctx->shared; - - ggml_spin_lock(&shared->spin); + PRINT_DEBUG("[main] wait_now will be set, expect %d workers wait\n", + n_worker_threads); ctx->shared.wait_now = true; - ggml_spin_unlock(&shared->spin); const int n_worker_threads = ctx->n_threads - 1; - while (ctx->shared.n_waiting != n_worker_threads) { ggml_spin_pause(); } - ggml_spin_lock(&shared->spin); - ctx->suspending = true; - ggml_spin_unlock(&shared->spin); PRINT_DEBUG("[main] saw %d workers waiting\n", n_worker_threads); + ctx->suspending = true; } // Wakeup all workers. @@ -291,7 +286,6 @@ void ggml_threading_resume(struct ggml_threading_context *ctx) { } struct ggml_compute_state_shared *shared = &ctx->shared; - ggml_spin_lock(&shared->spin); int64_t perf_cycles_0 = 0; int64_t perf_time_0 = 0; @@ -307,8 +301,6 @@ void ggml_threading_resume(struct ggml_threading_context *ctx) { shared->wait_now = false; while (shared->n_waiting != 0) { - ggml_spin_unlock(&shared->spin); - if (loop_counter > 0) { ggml_spin_pause(); if (loop_counter > 3) { @@ -326,8 +318,6 @@ void ggml_threading_resume(struct ggml_threading_context *ctx) { GGML_ASSERT(pthread_cond_broadcast(&shared->cond) == 0); GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0); last_signal_time = ggml_time_us(); - - ggml_spin_lock(&shared->spin); } ctx->suspending = false; @@ -335,9 +325,7 @@ void ggml_threading_resume(struct ggml_threading_context *ctx) { if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) { ggml_perf_collect(&shared->ctx->wakeup_perf, perf_cycles_0, perf_time_0); - } - - ggml_spin_unlock(&shared->spin); + }; } bool ggml_threading_is_suspending(struct ggml_threading_context *ctx) { @@ -385,8 +373,6 @@ static void ggml_threading_setup_workers(struct ggml_threading_context *ctx, } } else if (current->wait) { if (shared->n_waiting < n_worker_threads) { - PRINT_DEBUG("[main] wait_now will be set, expect %d workers wait\n", - n_worker_threads); ggml_spin_unlock(&ctx->shared.spin); ggml_threading_suspend(ctx); ggml_spin_lock(&ctx->shared.spin); diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c index f941f4dc3c372..cb2cca163f202 100644 --- a/tests/test-ggml-threading.c +++ b/tests/test-ggml-threading.c @@ -214,7 +214,7 @@ lifecycle_runner(const struct ggml_compute_params *params, } // Test thread lifecycle: start -> suspend -> resume -> stop -static int test_lifecycle(void) { +static int test_lifecycle(bool wait_on_done) { struct ggml_tensor node; memset(&node, 0, sizeof(struct ggml_tensor)); @@ -243,14 +243,15 @@ static int test_lifecycle(void) { int threads_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]); int n_threads = 1; + enum ggml_threading_features features = + wait_on_done ? 
GGML_THREADING_FEATURE_NONE + : GGML_THREADING_FEATURE_WAIT_ON_DONE; for (int i = 0; i < threads_arr_len; i++) { n_threads = threads_arr[i]; int start_time = (int)ggml_time_ms(); - ctx = ggml_threading_start( - n_threads, NULL, lifecycle_runner, - /*features*/ GGML_THREADING_FEATURE_WAIT_ON_DONE | - GGML_THREADING_FEATURE_PERF, - /*stages_time*/ NULL); + ctx = ggml_threading_start(n_threads, NULL, lifecycle_runner, + features | GGML_THREADING_FEATURE_PERF, + /*stages_time*/ NULL); int elapsed = (int)ggml_time_ms() - start_time; if (elapsed > 5 * n_threads) { printf("[test-ggml-threading] %s: it took %d ms to start %d worker " @@ -547,13 +548,17 @@ int main(void) { } // lifecycle. - { - printf("[test-ggml-threading] test lifecycle ...\n"); + for (int i = 0; i < 2; i++) { + bool wait_on_done = (i == 1); + printf("[test-ggml-threading] test lifecycle (want_on_done = %d) ...\n", + wait_on_done); ++n_tests; - if (test_lifecycle() == 0) { + if (test_lifecycle(wait_on_done) == 0) { ++n_passed; - printf("[test-ggml-threading] test lifecycle: ok\n\n"); + printf("[test-ggml-threading] test lifecycle (want_on_done = %d): " + "ok\n\n", + wait_on_done); } } From 98728632c67a4bd1ec3c791b1b92b125c904f8e4 Mon Sep 17 00:00:00 2001 From: mqy Date: Mon, 19 Jun 2023 01:04:32 +0800 Subject: [PATCH 17/24] threading test: less loops to avoid timeout --- tests/test-ggml-threading.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c index cb2cca163f202..d34be3ac6f2b5 100644 --- a/tests/test-ggml-threading.c +++ b/tests/test-ggml-threading.c @@ -280,7 +280,7 @@ static int test_lifecycle(bool wait_on_done) { ggml_threading_resume(ctx); const int m = 2; - const int n = 50; + const int n = 10; printf("[test-ggml-threading] %s: computing %d tensors (half wait)...\n", __func__, m * n); @@ -296,7 +296,7 @@ static int test_lifecycle(bool wait_on_done) { printf("[test-ggml-threading] %s: compute done, resuming...\n", __func__); ggml_threading_resume(ctx); - const int loops = 100; + const int loops = 10; printf("[test-ggml-threading] %s: try %d loops of suspend-resume ...\n", __func__, loops); From 6609c229e84e7cb749a5a3902f0123033c06c523 Mon Sep 17 00:00:00 2001 From: mqy Date: Mon, 19 Jun 2023 01:05:34 +0800 Subject: [PATCH 18/24] fixed OP_OUT_PROD and OP_NONE --- ggml.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/ggml.c b/ggml.c index 5a9e0b33e8cea..75a5624818a19 100644 --- a/ggml.c +++ b/ggml.c @@ -15655,8 +15655,7 @@ int ggml_get_task_profiles( p[0].stages[1].valid = true; p[0].stages[1].parallel = true; } break; - case GGML_OP_MUL_MAT: - case GGML_OP_OUT_PROD: { // FIXME: is this correct? 
+    case GGML_OP_MUL_MAT: {
         enum ggml_type src0_t = tensor->src0->type;
         if (src0_t == GGML_TYPE_F32) {
             p[0].stages[1].valid = true;
@@ -15673,6 +15672,15 @@ int ggml_get_task_profiles(
             GGML_ASSERT(false);
         }
     } break;
+    case GGML_OP_OUT_PROD: {
+        enum ggml_type src0_t = tensor->src0->type;
+        if (src0_t == GGML_TYPE_F32) {
+            p[0].stages[1].valid = true;
+            p[0].stages[1].parallel = true;
+        } else {
+            GGML_ASSERT(false);
+        }
+    } break;
     case GGML_OP_SCALE: {
         p[0].stages[1].valid = true;
         p[0].stages[1].parallel = true;
@@ -15810,13 +15818,12 @@ static void ggml_optimize_tensor_task_profile(
     struct ggml_tensor *tensor, struct ggml_task_profile *profiles,
     int n_profiles, struct ggml_mulmat_tune *tune) {
-    if (tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_OUT_PROD) {
+    if (tensor->op != GGML_OP_MUL_MAT) {
         return;
     }
 
     GGML_ASSERT(tensor);
-    GGML_ASSERT(tensor->op == GGML_OP_MUL_MAT ||
-                tensor->op == GGML_OP_OUT_PROD);
+    GGML_ASSERT(tensor->op == GGML_OP_MUL_MAT);
 
     GGML_ASSERT(tensor->task_profile.id == n_profiles);
     GGML_ASSERT(profiles);
@@ -15949,7 +15956,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * node = cgraph->nodes[i];
 
-        GGML_ASSERT (node->op != GGML_OP_NONE);
+        if (node->op == GGML_OP_NONE) {
+            continue;
+        }
 
         if (node->task_profile.id == 0) {
             ggml_set_tensor_task_profile(node, cgraph->tune);
@@ -16031,7 +16040,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                     {
                     } break;
                 case GGML_OP_MUL_MAT:
-                case GGML_OP_OUT_PROD: // FIXME: is this correct?
+                case GGML_OP_OUT_PROD:
                     {
                         size_t cur = 0;
                         GGML_ASSERT(node->src1->type == GGML_TYPE_F32);

From 65fd65e0c1d74d595a8ce14fca289444fc25e345 Mon Sep 17 00:00:00 2001
From: mqy
Date: Mon, 19 Jun 2023 13:50:35 +0800
Subject: [PATCH 19/24] tune: update readme

---
 examples/mulmat-tune/README.md | 77 ++++++++++++++++++++++------------
 1 file changed, 50 insertions(+), 27 deletions(-)

diff --git a/examples/mulmat-tune/README.md b/examples/mulmat-tune/README.md
index 4e521211d968e..4ba1a1d38adb7 100644
--- a/examples/mulmat-tune/README.md
+++ b/examples/mulmat-tune/README.md
@@ -19,21 +19,60 @@ run bench ahead of time (saving tens of seconds), but there are two
 shortcomings outdated format. So I integrated mulmat tune into `main` and
 `perplexity` as a complementary solution.
 
-## Build into main and perplexity
+The `load` mode tries to validate at least the following fields:
+- version
+- model
+- ftype
+- n_threads
+- n_profiles
+- profiles
+
+`n_threads` is critical to performance, so the best value should be selected.
+When running `main` or `perplexity`, `n_threads` is set automatically and the
+default value generally works well. Example:
+```
+system_info: n_threads = 4 / 12
+```
+This reads as: 4 of the 12 logical cores (6 physical cores) are used.
+
+## Build
+
+Compile options:
+- `LLAMA_TUNE` for CMake (default ON)
+- `LLAMA_NO_TUNE` for Make (default undefined)
+
+`GGML_USE_TUNE` and `GGML_TUNE_NDEBUG` are defined when llama tune is enabled.
+
+When `GGML_USE_TUNE` is defined, the mulmat_tune functionality is compiled into
+main and perplexity:
+- the CLI args `--tune` and `--tune-file` are available.
+- it tries to select the fastest mul_mat task profile according to the tune result.
+
+The standalone tool `mulmat-tune` is always built; it needs no compile options.
+
+**Makefile**
+
+To use tune, at least one of the following vendors has to be built:
+- BLAS (Accelerate, OpenBLAS, BLIS)
+- CLBlast
+- CUDA (may not run)
+
+To enable debug output, comment out `-DGGML_TUNE_NDEBUG` in the Makefile.
 
-Makefile:
 ```
 make clean && make
 ```
 
-CMake (with BLAS):
+**CMake**
+
 ```
-cmake --build . --target clean
-cmake .. -DLLAMA_BLAS=ON
+rm -rf build/*
+cd build
+cmake ..
 cmake --build . --config Release
 ```
 
-Run examples:
+## Run main or perplexity
 
 ```
 # bench and run:
@@ -48,21 +87,7 @@ Run examples:
 ./main -m ./models/3B/open-llama-3b-q4-0.bin -c 512 -b 1024 -n 256 --keep 48 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt -t 4 --tune-file
 ```
 
-# Build the standalone `mulmat-tune`
-
-Makefile:
-```
-make clean && make
-```
-
-CMake (with BLAS)
-```
-cmake --build . --target clean
-cmake .. -DLLAMA_BLAS=ON
-cmake --build . --config Release
-```
-
-Run examples:
+## Run mulmat-tune tool
 
 ```
 ./mulmat-tune -h
@@ -82,8 +107,8 @@ Run examples:
 # customized n_pass: run 1 pass only instead of the default 3.
 ./mulmat-tune --n_pass 1
 
-# customized n_threads instead of the default 1.
-./mulmat-tune --n_threads 4
+# customized n_threads instead of the default 4.
+./mulmat-tune --n_threads 6
 
 # save to file
 ./mulmat-tune --file
 
 ```
 
-# End to End Test
-
-## Compare With Master
+## Example: Compare With Master
 
 You may want to run the following commands. Make sure the tune result file is
 setup properly.
@@ -103,7 +126,7 @@ setup properly.
 
 General steps:
 
 1. run `./mulmat-tune -h` to see how to build for misc vendors.
-   To enable the debug, comment out `-DGGML_TUNE_NDEBUG` from makefile then run:
+   then run:
 
    ```
    make clean; make

From 44b831dc59f8d6e7b3bbfc4ccd9cc2a121684339 Mon Sep 17 00:00:00 2001
From: mqy
Date: Mon, 19 Jun 2023 13:54:20 +0800
Subject: [PATCH 20/24] tune: extract ggml_mulmat_tune_bench_wrapper

---
 ggml-tune.c | 45 ++++++++++++++++++++++++++++++++++++++++++++
 ggml-tune.h |  6 ++++++
 llama.cpp   | 54 +++--------------------------------------------------
 3 files changed, 54 insertions(+), 51 deletions(-)

diff --git a/ggml-tune.c b/ggml-tune.c
index 2e292e98e7bb6..36c44e1dc53d2 100644
--- a/ggml-tune.c
+++ b/ggml-tune.c
@@ -935,3 +935,48 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
 
     return true;
 }
+
+bool ggml_mulmat_tune_bench_wrapper(struct ggml_mulmat_tune *mulmat_tune,
+                                    struct ggml_mulmat_tune_params *params,
+                                    bool run_bench) {
+    printf("\n");
+    bool empty_fname = !params->fname || strcmp(params->fname, "") == 0;
+
+    if (!ggml_cpu_has_blas()) {
+        fprintf(stderr, "[tune] this program is not built with BLAS, abort.\n");
+        return false;
+    }
+
+    if (run_bench) {
+        return ggml_mulmat_tune_bench(mulmat_tune, params);
+    }
+
+    if (!empty_fname) {
+        FILE *fp = fopen(params->fname, "r");
+        if (!fp) {
+            fprintf(stderr, "[tune] failed to open file %s.\n", params->fname);
+            return false;
+        } else {
+            int rc = ggml_mulmat_tune_read_data(mulmat_tune, fp);
+            fclose(fp);
+
+            if (rc != 0) {
+                fprintf(stderr,
+                        "[tune] failed to read data from %s, error code: %d\n",
+                        params->fname, rc);
+                return false;
+            }
+
+            fprintf(stderr, "[tune] loaded data from %s\n", params->fname);
+
+            bool ok = ggml_mulmat_tune_validate(mulmat_tune, mulmat_tune->model,
+                                                params->model.ftype,
+                                                params->n_threads);
+            if (!ok) {
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
diff --git a/ggml-tune.h b/ggml-tune.h
index addcd34dbbd62..633f92697050a 100644
--- a/ggml-tune.h
+++ b/ggml-tune.h
@@ -132,6 +132,12 @@ void ggml_mulmat_tune_estimate_time(const
struct ggml_mulmat_tune_shape *shape, bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, struct ggml_mulmat_tune_params *params); +// This API is intended to be called by llama, etc. +// Three modes: bench and run; bench(save) then exit; load and run +bool ggml_mulmat_tune_bench_wrapper(struct ggml_mulmat_tune *mulmat_tune, + struct ggml_mulmat_tune_params *params, + bool run_bench); + #ifdef __cplusplus } #endif diff --git a/llama.cpp b/llama.cpp index e6bddffd5edaa..a3c3586e399f2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2748,8 +2748,6 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) { GGML_ASSERT(ctx->model.n_gpu_layers == 0); - printf("\n"); - const char *model_name = llama_model_type_name(ctx->model.type); llama_hparams *hparams = &ctx->model.hparams; @@ -2820,71 +2818,25 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, /* .m_num =*/8, /* .n_pass =*/1, /* .n_threads =*/n_threads, - /* .prrogress =*/true, + /* .progress =*/true, /* .output_console =*/false, /* .fname =*/fname, }; - bool empty_fname = !fname || strcmp(fname, "") == 0; - ctx->tune = new (struct ggml_mulmat_tune); if (!ctx->tune) { fprintf(stderr, "[tune] failed to allocate memory for tune\n"); return false; } - if (!ggml_cpu_has_blas()) { - fprintf(stderr, "[tune] this program is not built with BLAS, abort.\n"); - return false; - } - - if (tune) { - bool ok = ggml_mulmat_tune_bench(ctx->tune, ¶ms); - if (!ok) { - ggml_mulmat_tune_free(ctx->tune); - return false; - } - if (!empty_fname) { - ggml_mulmat_tune_free(ctx->tune); - return true; - } - } else if (empty_fname) { - return false; - } - - if (!empty_fname) { - FILE *fp = fopen(fname, "r"); - if (!fp) { - fprintf(stderr, "[tune] failed to open file %s.\n", fname); - return false; - } else { - int rc = ggml_mulmat_tune_read_data(ctx->tune, fp); - fclose(fp); - - if (rc != 0) { - fprintf(stderr, - "[tune] failed to read data from %s, error code: %d\n", - fname, rc); - return false; - } - - fprintf(stderr, "[tune] loaded data from %s\n", fname); - - bool ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype, - params.n_threads); - if (!ok) { - return false; - } - } - } - - return true; + return ggml_mulmat_tune_bench_wrapper(ctx->tune, ¶ms, tune); } #endif void llama_free(struct llama_context * ctx) { #ifdef GGML_USE_TUNE if (ctx->tune) { + ggml_mulmat_tune_free(ctx->tune); delete(ctx->tune); } #endif From 4d32b4088e10ad37733cda7979706a3b8ffa1cb2 Mon Sep 17 00:00:00 2001 From: mqy Date: Mon, 19 Jun 2023 14:05:30 +0800 Subject: [PATCH 21/24] threading test: decrease a threshold value to avoid timeout --- tests/test-ggml-threading.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c index d34be3ac6f2b5..30dddbeabf288 100644 --- a/tests/test-ggml-threading.c +++ b/tests/test-ggml-threading.c @@ -253,7 +253,7 @@ static int test_lifecycle(bool wait_on_done) { features | GGML_THREADING_FEATURE_PERF, /*stages_time*/ NULL); int elapsed = (int)ggml_time_ms() - start_time; - if (elapsed > 5 * n_threads) { + if (elapsed > 1 * n_threads) { printf("[test-ggml-threading] %s: it took %d ms to start %d worker " "thread(s), skip\n", __func__, elapsed, n_threads - 1); @@ -396,7 +396,7 @@ int main(void) { ggml_threading_stop(ctx); int elapsed_ms = t1 - t0; - if (elapsed_ms > 5 * n_threads) { + if (elapsed_ms > 1 * n_threads) { printf("[test-ggml-threading] warning: it took took %7.3f " "ms to start %2d worker 
thread(s). Too slow, skip.\n", 1.0 * elapsed_ms, n_threads - 1); From cc8a375bc411c153ec0771faf852316aa5e42f83 Mon Sep 17 00:00:00 2001 From: mqy Date: Mon, 19 Jun 2023 16:17:48 +0800 Subject: [PATCH 22/24] threading: fix deadlock by reverting part of changes from commit 286c5b30 --- ggml-threading.c | 16 ++++++++++++---- tests/test-ggml-threading.c | 4 ++-- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/ggml-threading.c b/ggml-threading.c index 2a5cfa096dd4b..4c1bc0714fede 100644 --- a/ggml-threading.c +++ b/ggml-threading.c @@ -199,8 +199,6 @@ struct ggml_threading_context { int64_t *stages_time; }; -// NOTE: ggml_spin_lock and ggml_spin_unlock may can be noop if -// feature wait_on_done is off. static inline void ggml_spin_lock(volatile atomic_flag *obj) { while (atomic_flag_test_and_set(obj)) { ggml_spin_pause(); @@ -262,7 +260,10 @@ void ggml_threading_suspend(struct ggml_threading_context *ctx) { PRINT_DEBUG("[main] wait_now will be set, expect %d workers wait\n", n_worker_threads); + + ggml_spin_lock(&ctx->shared.spin); ctx->shared.wait_now = true; + ggml_spin_unlock(&ctx->shared.spin); const int n_worker_threads = ctx->n_threads - 1; while (ctx->shared.n_waiting != n_worker_threads) { @@ -270,7 +271,9 @@ void ggml_threading_suspend(struct ggml_threading_context *ctx) { } PRINT_DEBUG("[main] saw %d workers waiting\n", n_worker_threads); + ggml_spin_lock(&ctx->shared.spin); ctx->suspending = true; + ggml_spin_unlock(&ctx->shared.spin); } // Wakeup all workers. @@ -278,8 +281,6 @@ void ggml_threading_suspend(struct ggml_threading_context *ctx) { // Workers takes some time to wakeup, and has to lock spin after wakeup. Yield // is used to avoid signal frequently. Current implementation is highly // experimental. See tests/test-ggml-threading.c for details. -// -// NOTE: must be protected by shared->spin void ggml_threading_resume(struct ggml_threading_context *ctx) { if (ctx->n_threads == 1) { return; @@ -298,9 +299,12 @@ void ggml_threading_resume(struct ggml_threading_context *ctx) { int loop_counter = 0; int64_t last_signal_time = 0; + ggml_spin_lock(&shared->spin); shared->wait_now = false; while (shared->n_waiting != 0) { + ggml_spin_unlock(&shared->spin); + if (loop_counter > 0) { ggml_spin_pause(); if (loop_counter > 3) { @@ -318,6 +322,8 @@ void ggml_threading_resume(struct ggml_threading_context *ctx) { GGML_ASSERT(pthread_cond_broadcast(&shared->cond) == 0); GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0); last_signal_time = ggml_time_us(); + + ggml_spin_lock(&shared->spin); } ctx->suspending = false; @@ -326,6 +332,8 @@ void ggml_threading_resume(struct ggml_threading_context *ctx) { ggml_perf_collect(&shared->ctx->wakeup_perf, perf_cycles_0, perf_time_0); }; + + ggml_spin_unlock(&shared->spin); } bool ggml_threading_is_suspending(struct ggml_threading_context *ctx) { diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c index 30dddbeabf288..886c5ee672794 100644 --- a/tests/test-ggml-threading.c +++ b/tests/test-ggml-threading.c @@ -550,13 +550,13 @@ int main(void) { // lifecycle. 
for (int i = 0; i < 2; i++) { bool wait_on_done = (i == 1); - printf("[test-ggml-threading] test lifecycle (want_on_done = %d) ...\n", + printf("[test-ggml-threading] test lifecycle (wait_on_done = %d) ...\n", wait_on_done); ++n_tests; if (test_lifecycle(wait_on_done) == 0) { ++n_passed; - printf("[test-ggml-threading] test lifecycle (want_on_done = %d): " + printf("[test-ggml-threading] test lifecycle (wait_on_done = %d): " "ok\n\n", wait_on_done); } From aac7f7cc0407a6eb830079377a34e885d0a0e1e2 Mon Sep 17 00:00:00 2001 From: mqy Date: Mon, 19 Jun 2023 18:15:32 +0800 Subject: [PATCH 23/24] threading: try to fix a deadlock, also added critical deadlock detection --- ggml-threading.c | 55 ++++++++++++++++++++++--------------- tests/test-ggml-threading.c | 4 +-- 2 files changed, 35 insertions(+), 24 deletions(-) diff --git a/ggml-threading.c b/ggml-threading.c index 4c1bc0714fede..4a9cf622fafb0 100644 --- a/ggml-threading.c +++ b/ggml-threading.c @@ -261,19 +261,22 @@ void ggml_threading_suspend(struct ggml_threading_context *ctx) { PRINT_DEBUG("[main] wait_now will be set, expect %d workers wait\n", n_worker_threads); - ggml_spin_lock(&ctx->shared.spin); - ctx->shared.wait_now = true; + struct ggml_compute_state_shared *shared = &ctx->shared; + + ggml_spin_lock(&shared->spin); + shared->wait_now = true; ggml_spin_unlock(&ctx->shared.spin); const int n_worker_threads = ctx->n_threads - 1; - while (ctx->shared.n_waiting != n_worker_threads) { + while (shared->n_waiting != n_worker_threads) { ggml_spin_pause(); } PRINT_DEBUG("[main] saw %d workers waiting\n", n_worker_threads); - ggml_spin_lock(&ctx->shared.spin); + + ggml_spin_lock(&shared->spin); ctx->suspending = true; - ggml_spin_unlock(&ctx->shared.spin); + ggml_spin_unlock(&shared->spin); } // Wakeup all workers. @@ -296,44 +299,52 @@ void ggml_threading_resume(struct ggml_threading_context *ctx) { perf_time_0 = ggml_time_us(); } - int loop_counter = 0; - int64_t last_signal_time = 0; + // Dead lock detection. + int counter = 0; + int64_t last_notify_ms = 0; + const int max_notify_count = ctx->n_threads - 1; + const int max_duration_ms = 100 * max_notify_count; ggml_spin_lock(&shared->spin); shared->wait_now = false; while (shared->n_waiting != 0) { - ggml_spin_unlock(&shared->spin); - if (loop_counter > 0) { - ggml_spin_pause(); - if (loop_counter > 3) { - sched_yield(); - } + GGML_ASSERT(pthread_mutex_lock(&shared->mutex) == 0); + if (shared->n_waiting == 0) { + GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0); + ggml_spin_unlock(&shared->spin); + break; } - ++loop_counter; - // TODO: should bench actual average wait/wakeup time. - if (last_signal_time > 0 && (ggml_time_us() - last_signal_time) < 10) { - continue; - } + ggml_spin_unlock(&shared->spin); - GGML_ASSERT(pthread_mutex_lock(&shared->mutex) == 0); GGML_ASSERT(pthread_cond_broadcast(&shared->cond) == 0); GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0); - last_signal_time = ggml_time_us(); + last_notify_ms = ggml_time_ms(); + + sched_yield(); + + int elapsed = last_notify_ms > 0 ? 
ggml_time_ms() - last_notify_ms : 0; + + if ((counter > max_notify_count) || elapsed > max_duration_ms) { + fprintf(stderr, + "[ggml-threading] potential dead lock detected, notified " + "for %d times, elapsed time: %d ms, abort\n", + counter, elapsed); + abort(); + } ggml_spin_lock(&shared->spin); } ctx->suspending = false; + ggml_spin_unlock(&shared->spin); if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) { ggml_perf_collect(&shared->ctx->wakeup_perf, perf_cycles_0, perf_time_0); }; - - ggml_spin_unlock(&shared->spin); } bool ggml_threading_is_suspending(struct ggml_threading_context *ctx) { diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c index 886c5ee672794..8fc705a6bd197 100644 --- a/tests/test-ggml-threading.c +++ b/tests/test-ggml-threading.c @@ -33,7 +33,7 @@ #define UNUSED(x) (void)(x) -#define MAX_N_THREADS 16 +#define MAX_N_THREADS 64 static const int n_repeat = 10; @@ -353,7 +353,7 @@ int main(void) { // average time, thus greatly punishes those small workloads. // - wait_on_done is general faster than wait_now, can be 10x faster. - int threads_arr[] = {1, 2, 4, 6, 8, 16}; + int threads_arr[] = {1, 2, 4, 6, 8, 16, 32, 64}; int threads_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]); // millions of loops. From 08972d2aee2021461b589b8e71bb29275bff7436 Mon Sep 17 00:00:00 2001 From: mqy Date: Mon, 19 Jun 2023 19:15:00 +0800 Subject: [PATCH 24/24] threading: removed feature wait_on_done to figure out causes of deadlock in windows AVX --- ggml-threading.c | 56 +++++++++++++------------------------ ggml-threading.h | 3 +- ggml-tune.c | 2 +- ggml.c | 4 +-- tests/test-ggml-threading.c | 55 ++++++++++-------------------------- 5 files changed, 38 insertions(+), 82 deletions(-) diff --git a/ggml-threading.c b/ggml-threading.c index 4a9cf622fafb0..1468318599ac9 100644 --- a/ggml-threading.c +++ b/ggml-threading.c @@ -167,7 +167,6 @@ struct ggml_compute_state_shared { // commands. atomic_bool wait_now; - atomic_bool wait_on_done; atomic_bool stop; // Default task runner, can be overriden by node.task_profile.runner. @@ -263,11 +262,12 @@ void ggml_threading_suspend(struct ggml_threading_context *ctx) { struct ggml_compute_state_shared *shared = &ctx->shared; + const int n_worker_threads = ctx->n_threads - 1; + ggml_spin_lock(&shared->spin); shared->wait_now = true; ggml_spin_unlock(&ctx->shared.spin); - const int n_worker_threads = ctx->n_threads - 1; while (shared->n_waiting != n_worker_threads) { ggml_spin_pause(); } @@ -281,8 +281,8 @@ void ggml_threading_suspend(struct ggml_threading_context *ctx) { // Wakeup all workers. // -// Workers takes some time to wakeup, and has to lock spin after wakeup. Yield -// is used to avoid signal frequently. Current implementation is highly +// Workers takes some time to wakeup. +// Yield is used to avoid notify frequently. Current implementation is highly // experimental. See tests/test-ggml-threading.c for details. void ggml_threading_resume(struct ggml_threading_context *ctx) { if (ctx->n_threads == 1) { @@ -302,14 +302,20 @@ void ggml_threading_resume(struct ggml_threading_context *ctx) { // Dead lock detection. 
int counter = 0; int64_t last_notify_ms = 0; - const int max_notify_count = ctx->n_threads - 1; - const int max_duration_ms = 100 * max_notify_count; + const int max_notify_count = 50; + const int max_duration_ms = 1000; + + if (shared->n_waiting != 0 && shared->n_waiting != ctx->n_threads - 1) { + fprintf(stderr, + "[ggml-threading] expected n_waiting is 0 or %d, actual %d, abort\n", + ctx->n_threads - 1, shared->n_waiting); + abort(); + } ggml_spin_lock(&shared->spin); shared->wait_now = false; while (shared->n_waiting != 0) { - GGML_ASSERT(pthread_mutex_lock(&shared->mutex) == 0); if (shared->n_waiting == 0) { GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0); @@ -323,7 +329,11 @@ void ggml_threading_resume(struct ggml_threading_context *ctx) { GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0); last_notify_ms = ggml_time_ms(); - sched_yield(); + if (counter % 1 == 0) { + ggml_spin_pause(); + } else { + sched_yield(); + } int elapsed = last_notify_ms > 0 ? ggml_time_ms() - last_notify_ms : 0; @@ -372,24 +382,6 @@ static void ggml_threading_setup_workers(struct ggml_threading_context *ctx, ggml_threading_resume(ctx); ggml_spin_lock(&shared->spin); } - - if ((ctx->features & GGML_THREADING_FEATURE_WAIT_ON_DONE) > 0) { - // Optimize energy: wait_on_done. We MAY also check following nodes, - // but that's a bit complicated. - shared->wait_on_done = false; - for (int i = type + 1; i <= GGML_TASK_FINALIZE; i++) { - struct ggml_task_stage *next = &profile->stages[i]; - if (next->parallel) { - break; - } - if (next->wait) { - shared->wait_on_done = true; - PRINT_DEBUG("[main] wait_on_done is enabled for " - "current task stage\n"); - break; - } - } - } } else if (current->wait) { if (shared->n_waiting < n_worker_threads) { ggml_spin_unlock(&ctx->shared.spin); @@ -437,17 +429,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) { state->has_work = false; shared->n_tasks--; - bool wait = shared->wait_on_done && !state->has_work; - if (wait) { - ggml_threading_cond_wait(state); - } - ggml_spin_unlock(&shared->spin); - - // no need to pause. - if (wait) { - continue; - } } ggml_spin_pause(); @@ -594,11 +576,11 @@ ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread_runner, .n_tasks = 0, .n_waiting = 0, .wait_now = false, - .wait_on_done = false, .stop = false, .task_runner = task_runner, .ctx = ctx, }; + atomic_flag_clear(&ctx->shared.spin); PRINT_DEBUG("[main] thread start, features: %d\n", features); diff --git a/ggml-threading.h b/ggml-threading.h index 012c9cd504308..f226b00196ff6 100644 --- a/ggml-threading.h +++ b/ggml-threading.h @@ -17,8 +17,7 @@ struct ggml_threading_context; // Optional (experimental) features. enum ggml_threading_features { GGML_THREADING_FEATURE_NONE = 0, - GGML_THREADING_FEATURE_WAIT_ON_DONE = 1 << 0, - GGML_THREADING_FEATURE_PERF = 1 << 1, + GGML_THREADING_FEATURE_PERF = 1, }; // The thread runner to feed into OS threads. 
diff --git a/ggml-tune.c b/ggml-tune.c index 36c44e1dc53d2..7aa9c217c7b29 100644 --- a/ggml-tune.c +++ b/ggml-tune.c @@ -802,7 +802,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, struct ggml_threading_context *thrd_ctx = ggml_threading_start(tune->n_threads, NULL, NULL, - GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time); + GGML_THREADING_FEATURE_NONE, stages_time); for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape]; diff --git a/ggml.c b/ggml.c index 75a5624818a19..614212b677226 100644 --- a/ggml.c +++ b/ggml.c @@ -16203,14 +16203,14 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } // ~ 50 us - //printf("=== prepare computing took %d us\n", (int)(ggml_time_us() - t0)); + // printf("=== prepare computing took %d us\n", (int)(ggml_time_us() - t0)); } const int64_t perf_start_cycles = ggml_perf_cycles(); const int64_t perf_start_time_us = ggml_perf_time_us(); struct ggml_threading_context *thrd_ctx = ggml_threading_start(n_threads, - NULL, ggml_compute_forward, GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL); + NULL, ggml_compute_forward, GGML_THREADING_FEATURE_NONE, NULL); for (int i = 0; i < cgraph->n_nodes; i++) { GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes); diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c index 8fc705a6bd197..cfb5b7a8e7223 100644 --- a/tests/test-ggml-threading.c +++ b/tests/test-ggml-threading.c @@ -71,12 +71,7 @@ static int test_driver(int id, struct ggml_tensor *node, int n_threads) { work_done_arr[i] = 0; } - bool wait_on_done = (node->task_profile.dev_flags[0] > 0u); - enum ggml_threading_features features = GGML_THREADING_FEATURE_PERF; - if (wait_on_done) { - features |= GGML_THREADING_FEATURE_WAIT_ON_DONE; - } int t0 = (int)ggml_time_us(); @@ -118,9 +113,9 @@ static int test_driver(int id, struct ggml_tensor *node, int n_threads) { } printf("\tstage-0: parallel: %d, wait: %d\n\tstage-1: parallel: %d, wait: " - "%d, wait_on_done: %d %s\n", + "%d\n", stages[0].parallel, stages[0].wait, stages[1].parallel, - stages[1].wait, wait_on_done, stages[1].wait ? "<--------" : ""); + stages[1].wait); if (actual == expect) { printf("\tthreading: init %6.3f ms, compute %6.3f ms, cleanup %6.3f " @@ -214,7 +209,7 @@ lifecycle_runner(const struct ggml_compute_params *params, } // Test thread lifecycle: start -> suspend -> resume -> stop -static int test_lifecycle(bool wait_on_done) { +static int test_lifecycle() { struct ggml_tensor node; memset(&node, 0, sizeof(struct ggml_tensor)); @@ -243,9 +238,7 @@ static int test_lifecycle(bool wait_on_done) { int threads_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]); int n_threads = 1; - enum ggml_threading_features features = - wait_on_done ? GGML_THREADING_FEATURE_NONE - : GGML_THREADING_FEATURE_WAIT_ON_DONE; + enum ggml_threading_features features = GGML_THREADING_FEATURE_NONE; for (int i = 0; i < threads_arr_len; i++) { n_threads = threads_arr[i]; int start_time = (int)ggml_time_ms(); @@ -351,7 +344,6 @@ int main(void) { // physical cores: // - the wait/wakeup time varies much: can be up to tens or hundreds of the // average time, thus greatly punishes those small workloads. - // - wait_on_done is general faster than wait_now, can be 10x faster. 
int threads_arr[] = {1, 2, 4, 6, 8, 16, 32, 64}; int threads_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]); @@ -408,7 +400,7 @@ int main(void) { } } - // node.task_profile.dev_flags: byte 0 for wait_on_done, byte 1 for loops. + // node.task_profile.dev_flags: byte 0 unused, byte 1 for loops. for (int x = 0; x < workload_arr_len; x++) { node.task_profile.dev_flags[1] = workload_arr[x]; @@ -423,7 +415,7 @@ int main(void) { "n_threads: %2d ====\n", workload_arr[x], n_threads); - // multi-threads: parallel + wait_now/wait_on_done + // multi-threads: parallel + wait if (n_threads == 1) { stages[0].parallel = false; @@ -489,22 +481,11 @@ int main(void) { abort(); } - { // disable wait_on_done - node.task_profile.dev_flags[0] = 0u; // wait now. - - n_tests++; - if (test_driver(n_tests, &node, n_threads) == 0) { - n_passed++; - } - } - - { // enable wait_on_done - node.task_profile.dev_flags[0] = 1u; // wait on done + node.task_profile.dev_flags[0] = 0u; // wait now. - n_tests++; - if (test_driver(n_tests, &node, n_threads) == 0) { - n_passed++; - } + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; } } } @@ -548,18 +529,12 @@ int main(void) { } // lifecycle. - for (int i = 0; i < 2; i++) { - bool wait_on_done = (i == 1); - printf("[test-ggml-threading] test lifecycle (wait_on_done = %d) ...\n", - wait_on_done); - ++n_tests; + printf("[test-ggml-threading] test lifecycle ...\n"); + ++n_tests; - if (test_lifecycle(wait_on_done) == 0) { - ++n_passed; - printf("[test-ggml-threading] test lifecycle (wait_on_done = %d): " - "ok\n\n", - wait_on_done); - } + if (test_lifecycle() == 0) { + ++n_passed; + printf("[test-ggml-threading] test lifecycle: ok\n\n"); } printf("[test-ggml-threading] %d/%d passed.\n", n_passed, n_tests);
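For reference, a minimal sketch of how the start/suspend/resume/stop lifecycle exercised by these tests is meant to be driven from caller code; it assumes that passing NULL for both the thread runner and the task runner selects the built-in defaults, as ggml-tune.c does, and that `ggml_time_init()` has already been called:

```c
#include <stdio.h>

#include "ggml.h"
#include "ggml-threading.h"

// Lifecycle sketch (assumption: NULL runners select the defaults, as in
// ggml-tune.c). Mirrors test_lifecycle in tests/test-ggml-threading.c.
static void threading_lifecycle_sketch(int n_threads) {
    struct ggml_threading_context *ctx = ggml_threading_start(
        n_threads, /*thread_runner*/ NULL, /*task_runner*/ NULL,
        GGML_THREADING_FEATURE_NONE, /*stages_time*/ NULL);

    // Park the workers between compute graphs.
    ggml_threading_suspend(ctx);
    if (!ggml_threading_is_suspending(ctx)) {
        fprintf(stderr, "workers did not suspend\n");
    }

    // Wake the workers before the next graph; tensors would then be fed
    // through ggml_threading_compute_tensor().
    ggml_threading_resume(ctx);

    ggml_threading_stop(ctx);
}
```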