From 213f133701e947a94bdd070315cc0528429bd27d Mon Sep 17 00:00:00 2001 From: mqy Date: Wed, 14 Jun 2023 18:33:14 +0800 Subject: [PATCH 01/24] initial --- .gitignore | 1 + CMakeLists.txt | 9 + Makefile | 31 +- examples/CMakeLists.txt | 1 + examples/common.cpp | 14 + examples/common.h | 2 + examples/main/main.cpp | 10 + examples/mulmat-tune/CMakeLists.txt | 14 + examples/mulmat-tune/README.md | 272 ++++++++ examples/mulmat-tune/mulmat-tune.cpp | 277 ++++++++ examples/perplexity/perplexity.cpp | 10 + ggml-cuda.cu | 2 +- ggml-opencl.cpp | 2 +- ggml-threading.c | 620 +++++++++++++++++ ggml-threading.h | 68 ++ ggml-tune.c | 897 ++++++++++++++++++++++++ ggml-tune.h | 137 ++++ ggml.c | 996 ++++++++++++++------------- ggml.h | 71 +- llama.cpp | 160 ++++- llama.h | 3 + tests/.gitignore | 2 + tests/CMakeLists.txt | 2 + tests/test-ggml-threading.c | 345 ++++++++++ tests/test-ggml-tune.c | 200 ++++++ 25 files changed, 3646 insertions(+), 500 deletions(-) create mode 100644 examples/mulmat-tune/CMakeLists.txt create mode 100644 examples/mulmat-tune/README.md create mode 100644 examples/mulmat-tune/mulmat-tune.cpp create mode 100644 ggml-threading.c create mode 100644 ggml-threading.h create mode 100644 ggml-tune.c create mode 100644 ggml-tune.h create mode 100644 tests/.gitignore create mode 100644 tests/test-ggml-threading.c create mode 100644 tests/test-ggml-tune.c diff --git a/.gitignore b/.gitignore index e7bfd52e3d63c..c2e2a0ab0ca32 100644 --- a/.gitignore +++ b/.gitignore @@ -40,6 +40,7 @@ models/* /server /Pipfile /libllama.so +/mulmat-tune build-info.h arm_neon.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 73677195404e3..832c1e986a6eb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,6 +78,7 @@ option(LLAMA_K_QUANTS "llama: use k-quants" option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" OFF) +option(LLAMA_MULMAT_TUNE "llama: mulmat tune" OFF) # # Build info header @@ -214,6 +215,7 @@ if (LLAMA_BLAS) message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}") add_compile_options(${BLAS_LINKER_FLAGS}) add_compile_definitions(GGML_USE_OPENBLAS) + add_compile_definitions(GGML_BLAS_VENDOR="${LLAMA_BLAS_VENDOR}") set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES}) set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS}) @@ -276,6 +278,11 @@ if (LLAMA_METAL) ) endif() +if (LLAMA_MULMAT_TUNE) + add_compile_definitions(GGML_USE_MULMAT_TUNE) + add_compile_definitions(GGML_MULMAT_TUNE_NDEBUG) +endif() + if (LLAMA_K_QUANTS) set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h) add_compile_definitions(GGML_USE_K_QUANTS) @@ -450,6 +457,8 @@ endif() add_library(ggml OBJECT ggml.c + ggml-threading.c + ggml-tune.c ggml.h ${GGML_SOURCES_CUDA} ${GGML_SOURCES_OPENCL} diff --git a/Makefile b/Makefile index afd06e0a60282..a8d1bdc0991ae 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple mulmat-tune ifdef LLAMA_BUILD_SERVER BUILD_TARGETS += server @@ -47,7 +47,8 @@ endif OPT = -O3 CFLAGS = -I. $(OPT) -std=c11 -fPIC CXXFLAGS = -I. 
-I./examples $(OPT) -std=c++11 -fPIC -LDFLAGS = +# -lm fixed error: ggml.o: undefined reference to symbol 'tanhf@@GLIBC_2.2.5' from ubuntu 22.04 +LDFLAGS = -lm ifdef LLAMA_DEBUG CFLAGS += -O0 -g @@ -134,8 +135,7 @@ ifndef LLAMA_NO_K_QUANTS endif ifndef LLAMA_NO_ACCELERATE - # Mac M1 - include Accelerate framework. - # `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time). + # Mac Intel & M1 - include Accelerate framework. ifeq ($(UNAME_S),Darwin) CFLAGS += -DGGML_USE_ACCELERATE LDFLAGS += -framework Accelerate @@ -145,10 +145,16 @@ endif # LLAMA_NO_ACCELERATE ifdef LLAMA_OPENBLAS CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas LDFLAGS += -lopenblas + ifeq ($(UNAME_S),Darwin) + # openblas installed with Homebew on macOS. + CFLAGS += -I/usr/local/opt/openblas/include + LDFLAGS += -L/usr/local/opt/openblas/lib + endif endif # LLAMA_OPENBLAS ifdef LLAMA_BLIS CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis + CFLAGS += -DGGML_BLAS_VENDOR="\"BLIS\"" LDFLAGS += -lblis -L/usr/local/lib endif # LLAMA_BLIS @@ -230,6 +236,11 @@ k_quants.o: k_quants.c k_quants.h $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_NO_K_QUANTS +ifdef LLAMA_MULMAT_TUNE + CFLAGS += -DGGML_USE_MULMAT_TUNE -DGGML_MULMAT_TUNE_NDEBUG + CXXFLAGS += -DGGML_USE_MULMAT_TUNE +endif + # # Print build information # @@ -245,6 +256,8 @@ $(info I CC: $(CCV)) $(info I CXX: $(CXXV)) $(info ) +OBJS += ggml-tune.o ggml-threading.o + # # Build library # @@ -253,7 +266,12 @@ ggml.o: ggml.c ggml.h ggml-cuda.h $(CC) $(CFLAGS) -c $< -o $@ llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h - $(CXX) $(CXXFLAGS) -c $< -o $@ + +ggml-threading.o: ggml-threading.c ggml.h + $(CC) $(CFLAGS) -c $< -o $@ + +ggml-tune.o: ggml-tune.c ggml.h + $(CC) $(CFLAGS) -c $< -o $@ common.o: examples/common.cpp examples/common.h $(CXX) $(CXXFLAGS) -c $< -o $@ @@ -298,6 +316,9 @@ server: examples/server/server.cpp examples/server/httplib.h examples/server/jso train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +mulmat-tune: examples/mulmat-tune/mulmat-tune.cpp build-info.h ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o mulmat-tune $(LDFLAGS) + build-info.h: $(wildcard .git/index) scripts/build-info.sh @sh scripts/build-info.sh > $@.tmp @if ! 
cmp -s $@.tmp $@; then \ diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index cf9c4a2231337..cf01b8a2adb90 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -39,6 +39,7 @@ else() add_subdirectory(baby-llama) add_subdirectory(train-text-from-scratch) add_subdirectory(simple) + add_subdirectory(mulmat-tune) if (LLAMA_METAL) add_subdirectory(metal) endif() diff --git a/examples/common.cpp b/examples/common.cpp index fed24e027d8a8..882e90c9c3649 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -345,6 +345,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.mem_test = true; } else if (arg == "--export") { params.export_cgraph = true; +#ifdef GGML_USE_MULMAT_TUNE + } else if (arg == "--tune") { + params.tune = true; + } else if (arg == "--tune-file") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.tune_file = argv[i]; +#endif // GGML_USE_MULMAT_TUNE } else if (arg == "--verbose-prompt") { params.verbose_prompt = true; } else if (arg == "-r" || arg == "--reverse-prompt") { @@ -498,6 +508,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { #endif fprintf(stderr, " --mtest compute maximum memory usage\n"); fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n"); +#ifdef GGML_USE_MULMAT_TUNE + fprintf(stderr, " --tune mulmat tune enable. If tune-file is set then exit after bench\n"); + fprintf(stderr, " --tune-file FILE mulmat tune data file. If tune is true, then write bench result to this file, else load the file and run\n"); +#endif fprintf(stderr, " --verbose-prompt print prompt before generation\n"); fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); diff --git a/examples/common.h b/examples/common.h index 6c2953cb2a7c6..5e394b2186c8d 100644 --- a/examples/common.h +++ b/examples/common.h @@ -77,6 +77,8 @@ struct gpt_params { bool mem_test = false; // compute maximum memory usage bool export_cgraph = false; // export the computation graph bool verbose_prompt = false; // print prompt tokens before generation + bool tune = false; // mulmat tune: enable + std::string tune_file = ""; // mulmat tune: data file }; bool gpt_params_parse(int argc, char ** argv, gpt_params & params); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 941312f9cc756..542e463bfe84e 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -117,6 +117,16 @@ int main(int argc, char ** argv) { return 1; } +#ifdef GGML_USE_MULMAT_TUNE + if (params.tune || !params.tune_file.empty()) { + bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str()); + if (!ok || (params.tune && !params.tune_file.empty())) { + llama_free(ctx); + return ok? 
0: 1;
+        }
+    }
+#endif
+
     // print system information
     {
         fprintf(stderr, "\n");
diff --git a/examples/mulmat-tune/CMakeLists.txt b/examples/mulmat-tune/CMakeLists.txt
new file mode 100644
index 0000000000000..51e1053e87e07
--- /dev/null
+++ b/examples/mulmat-tune/CMakeLists.txt
@@ -0,0 +1,14 @@
+set(TARGET mulmat-tune)
+add_executable(${TARGET} mulmat-tune.cpp)
+
+if (XCODE OR MSVC)
+    set(MULMAT_TUNE_LIBS ggml)
+else()
+    set(MULMAT_TUNE_LIBS ggml m)
+endif()
+
+target_link_libraries(${TARGET} PRIVATE ${MULMAT_TUNE_LIBS} ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
diff --git a/examples/mulmat-tune/README.md b/examples/mulmat-tune/README.md
new file mode 100644
index 0000000000000..cff8a3d6467ea
--- /dev/null
+++ b/examples/mulmat-tune/README.md
@@ -0,0 +1,272 @@
+# Mulmat Benchmark and Tuning
+
+Apart from the standalone tool `mulmat-tune`, mulmat tune is also integrated into
+`main` and `perplexity`. To avoid adding too many new cli options, I added just
+two. To make it run faster, `m_num` is set to 8 (so the max M is 128) and
+`n_pass` is set to 1.
+
+With the newly added cli options, we can use `main` and `perplexity` in the
+following three ways:
+
+* bench and run: --tune
+* bench and exit: --tune --tune-file
+* load and run: --tune-file
+
+The `load` mode reads an existing data file. This is fine because we can run the
+bench ahead of time (saving tens of seconds), but it has two shortcomings:
+- we have to re-run the bench whenever the file format changes; at least we are
+  aware of that when it happens.
+- more subtly, the algorithm may change silently while we keep loading an
+  outdated file. That is why I also integrated mulmat tune into `main` and
+  `perplexity` as a complementary solution.
+
+## Build into main and perplexity
+
+Makefile:
+```
+make clean && LLAMA_MULMAT_TUNE=1 make
+```
+
+CMake (with BLAS):
+```
+cmake --build . --target clean
+cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON
+cmake --build . --config Release
+```
+
+Run examples:
+
+```
+# bench and run:
+
+./main -m ./models/3B/open-llama-3b-q4-0.bin -c 512 -b 1024 -n 256 --keep 48 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt -t 4 --tune
+
+# bench then exit:
+./main -m ./models/3B/open-llama-3b-q4-0.bin --tune --tune-file
+
+# load and run
+
+./main -m ./models/3B/open-llama-3b-q4-0.bin -c 512 -b 1024 -n 256 --keep 48 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt -t 4 --tune-file
+```
+
+# Build the standalone `mulmat-tune`
+
+Makefile:
+```
+make clean && LLAMA_MULMAT_TUNE=1 make
+```
+
+CMake (with BLAS):
+```
+cmake --build . --target clean
+cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON
+cmake --build . --config Release
+```
+
+Run examples:
+
+```
+./mulmat-tune -h
+
+# run with default params (7B, Q4_0, ...)
+./mulmat-tune
+
+# set model
+./mulmat-tune --model 13B
+
+# set ggml ftype, 2 for Q4_0, 3 for Q4_1, run `mulmat-tune -h` for help.
+./mulmat-tune --ftype 3
+
+# customized m_num
+./mulmat-tune --m_num 8
+
+# customized n_pass: number of passes to run, between 1 and 3.
+./mulmat-tune --n_pass 1
+
+# customized n_threads instead of the default 1.
+./mulmat-tune --n_threads 4
+
+# save to file
+./mulmat-tune --file
+
+# save to file, overwrite without asking if the file exists (CAUTION!)
+./mulmat-tune --file -y
+
+```
+
+# End to End Test
+
+## Compare With Master
+
+You may want to run the following commands. Make sure the tune result file is
+set up properly.
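+For example, one way to prepare it ahead of time is shown below. The file name
+`mulmat-tune-7b-q4_0.txt` is only a placeholder; keep the model, ftype and
+thread count consistent with the later runs, since the loader validates them:
+
+```
+# bench with the standalone tool and write the result file (placeholder name)
+./mulmat-tune --model 7B --ftype 2 --n_threads 4 --file mulmat-tune-7b-q4_0.txt
+
+# or bench with main itself, write the file and exit
+./main -m models/7B/ggml-model-q4_0.bin -t 4 --tune --tune-file mulmat-tune-7b-q4_0.txt
+
+# later: load the file and run
+./main -m models/7B/ggml-model-q4_0.bin -t 4 --tune-file mulmat-tune-7b-q4_0.txt
+```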
+ +General steps: + +1. run `./mulmat-tune -h` to see how to build for misc vendors. + you can build with `GGML_MULMAT_TUNE_NDEBUG=` to enable the the debug, e.g: + ``` + make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1 LLAMA_NO_ACCELERATE=1 LLAMA_CLBLAST=1 make + ``` + On `macOS`, `ACCELERATE` is enabled by default. When `ACCELERATE` is built along + with `CUDA` or `CL`, you may not see `CUDA` or `CL` from debug because `CPU` + or `CPU_BLAS` is more faster (as of the estimation from mulmat tune). +2. create a small prompt file: + ``` + head -n 5 ./models/wikitext-2-raw/wiki.valid.raw > ./models/wiki.valid-5.raw + ``` +3. run any of the following example commands. + ``` + ./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 1 -b 32 + ./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 4 -b 64 + ``` + * `--mlock` is recommended for `macOS`, you may not want to use it. + * don't change `-c 128`: too large `context size` causes 0 perplexity trunk. + * `-t` is the number of threads, recommend `1`, `2`, `4` or `6`. + * you can change the batch size (`-b`) between `1` and `128`. + * you may want to add other cli options. + +The following results are generated with Accelerate compiled. + +### 1 thread + +**Master (2d43387d)** + +``` +| M | perplexity (seconds per pass) | prompt eval time (ms per token) | +| --- | --------------- | +| 8 | 43.53 | 339.95 | +| 16 | 44.31 | 346.12 | +| 24 | 43.14 | 336.90 | +| 32 | 33.59 | 262.25 | +| 40 | 27.64 | 215.77 | +| 48 | 24.52 | 191.42 | +``` + +**This branch (tune)** + +``` +| M | perplexity (seconds per pass) | prompt eval time (ms per token) | +| --- | --------------- | +| 8 | 43.78 | 341.96 | +| 16 | 42.88 | 334.93 | +| 24 | 42.06 | 328.42 | +| 32 | 33.07 | 258.25 | +| 40 | 28.69 | 223.98 | +| 48 | 25.65 | 200.19 | +``` + +### 4 threads + +**Master (2d43387d)** + +``` +| M | perplexity (seconds per pass) | prompt eval time (ms per token) | +| --- | --------------- | +| 8 | 12.43 | 96.99 | +| 16 | 12.10 | 94.44 | +| 24 | 12.81 | 99.95 | +| 32 | 31.64 | 247.04 | +| 48 | 24.55 | 191.63 | +| 64 | 17.56 | 137.09 | +| 96 | 17.59 | 137.25 | +| 128 | 10.73 | 83.74 | +``` + +**This branch (no tune)** + +``` +| M | perplexity (seconds per pass) | prompt eval time (ms per token) | +| --- | --------------- | +| 8 | 12.31 | 96.07 | +| 16 | 12.00 | 93.63 | +| 24 | 12.07 | 94.15 | +| 32 | 20.34 | 158.76 | +| 48 | 15.86 | 123.73 | +| 64 | 10.98 | 85.69 | +| 96 | 11.24 | 87.66 | +| 128 | 7.53 | 58.77 | +``` + +**This branch (tune)** + +``` +| M | perplexity (seconds per pass) | prompt eval time (ms per token) | +| --- | --------------- | +| 8 | 12.48 | 97.37 | +| 16 | 12.26 | 95.70 | +| 24 | 12.25 | 95.53 | +| 32 | 11.98 | 93.58 | +| 48 | 12.57 | 98.12 | +| 64 | 11.28 | 88.05 | +| 96 | 9.55 | 74.53 | +| 128 | 7.51 | 58.61 | +``` + +# Bench Data Format + +**Example** + +``` +5 3B 2 6 1 + +3200 3200 2 0 3 10 +16 0 0 0 16 1 0 1 0 0 0 0 +16 1 0 2 17 0 1 0 0 0 0 0 + 0 0 0 0 34 0 1 0 0 0 0 0 + 1 1 793 0 9103 2102 0 0 6014 0 + 2 2 1591 0 8034 2305 0 0 30982 0 + 4 4 2236 0 6476 2484 0 0 31388 0 + 8 7 4161 0 6623 2389 0 0 29204 0 + 16 15 8339 0 6434 2752 0 0 34303 0 + 32 32 16919 0 6915 3651 0 0 42511 0 + 64 200 34270 0 6574 4528 0 0 68212 0 + 128 188 69400 0 6325 6839 0 0 74437 0 + 256 303 134597 0 6168 11544 0 0 110180 0 + 512 687 279685 0 6337 29712 0 0 159728 0 + +3200 8640 2 0 2 10 + + ... 
+ + ``` + +**Informal Explanation** + +``` +head +groups+ + +head := version model ggml_ftype n_shapes n_threads +shape+ + +# head +version: 1 +model: "3B" | "7B" | "13B" | "30B" | "65B" +ggml_ftype: 0 - 4, 7 - 14 +n_shapes: number of shapes +n_threads: number of threads + +shape := N K m_num n_profiles +task_conf_profile+ +bench_item+ + +task_conf_profile: stage_conf(init) stage_conf(compute) stage_conf(finalize) +stage_conf: backend parallel wait +backend: 0 (NONE) | 16 (CPU) | 17 (CPU_BLAS) | 32 (GPU) | 33 (GPU_CUDA) | 34 (GPU_CL) +parallel: 0 (false) | 1 (true) +wait: 0 (false) | 1 (true) + +bench_item: M profile_time+ +profile_time := stage_time[3] +stage_time[3]: init_time, compute_time, finalize_time +``` + +A task stage is invalid if it's backend equals to `GGML_TASK_BACKEND_NONE`. +Time unit is `us`. A column is all zeros when that stage does not exist. + +# NOTE + +1. "3B" is [open-llama 3B](https://github.com/ggerganov/llama.cpp/pull/1588). +2. Model names are subject to change: we may support something like X-3B, Y-4B, ... +3. As of Jun 1, this tool is still in early stage, will be changed frequently in + recent couple of days (or weeks). diff --git a/examples/mulmat-tune/mulmat-tune.cpp b/examples/mulmat-tune/mulmat-tune.cpp new file mode 100644 index 0000000000000..62f1da27764b9 --- /dev/null +++ b/examples/mulmat-tune/mulmat-tune.cpp @@ -0,0 +1,277 @@ +#include +#include +#include +#include +#include + +#include "build-info.h" +#include "ggml-tune.h" +#include "ggml.h" +#include "llama.h" + +#define UNUSED(x) (void)(x) + +static void print_build_tips(void) { + const char *a = "LLAMA_NO_ACCELERATE"; + fprintf(stderr, "Tips on how to build with various backend vendors:\n\n"); + fprintf(stderr, "CUDA: make clean; LLAMA_CUBLAS=1 make\n"); + fprintf(stderr, "CL: make clean; LLAMA_CLBLAST=1 make\n"); + fprintf(stderr, "Accelerate: make clean; %s= make\n", a); + fprintf(stderr, "OpenBLAS: make clean; %s=1 LLAMA_OPENBLAS=1 make\n", a); + fprintf(stderr, "BLIS: make clean; %s=1 LLAMA_BLIS=1 make\n", a); + fprintf(stderr, "\n"); + fprintf(stderr, "NOTE: for CUDA/CL, use %s=1 to disable ACCELERATE\n", a); +} + +static bool prompt_yes_no(const char *prompt) { + char buf[2]; + while (true) { + fprintf(stderr, "%s (Y|n)\n", prompt); + buf[0] = 0; + buf[1] = 0; + int i = 0; + int c = 0; + + while (c != '\n') { + c = fgetc(stdin); + buf[i % 2] = c; + i++; + } + if (i == 1) { + if (buf[0] == '\n') { + return true; + } + } else if (i == 2) { + if (buf[0] == 'Y' || buf[0] == 'y') { + return true; + } + if (buf[0] == 'N' || buf[0] == 'n') { + return false; + } + } + } +} + +static void cmd_analyze(struct ggml_mulmat_tune *tune); + +static void usage(char *prog) { + const char *usage_lines[] = { + "usage: %s args", + "", + "bench [-m MODEL] [-t TYPE] [-f FILE] [-y]", + "--model MODEL 3B | 7B | 13B | 30B | 65B", + " default 7B", + "--ftype FTYPE ggml ftype:", + " 0: all F32", + " 1: mostly F16", + " 2: mostly Q4_0", + " 3: mostly Q4_1", + " 4: mostly Q4_1, some F16", + " 7: mostly Q8_0", + " 8: mostly Q5_0", + " 9: mostly Q5_1", + " 10: mostly Q2_K", + " 11: mostly Q3_K", + " 12: mostly Q4_K", + " 13: mostly Q5_K", + " 14: mostly Q6_K", + " default 2 (mostly Q4_0)", + "--m_num M_NUM number of M, the max M = 2^(M_NUM-1)", + " requires between [6, 12]", + " default 10", + "--n_pass PASS number of passes to run", + " default 1", + " requires: between [1, 3]", + "--n_threads NTH bench with this number of threads", + " requires: between [1, 16]", + " default 1", + "--file FILE data file to write", + " 
default stdout", + "-y always answer \"yes\" to all prompts", + }; + + int len = (int)(sizeof(usage_lines) / sizeof(char *)); + for (int i = 0; i < len; i++) { + const char *line = usage_lines[i]; + if (i == 0) { + fprintf(stderr, line, prog); + } else { + fprintf(stderr, "%s\n", line); + } + } + + printf("\n"); + print_build_tips(); + printf("\n"); +} + +int main(int argc, char **argv) { + if (argc == 2) { + if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) { + usage(argv[0]); + return 0; + } + } + + int arg_start = 1; + + const char *arg_model = NULL; + const char *arg_ftype = NULL; + const char *arg_m_num = NULL; + const char *arg_n_threads = NULL; + const char *arg_n_pass = NULL; + const char *arg_file = NULL; + bool always_yes = false; + + for (int i = arg_start; i < argc; i++) { + if (strcmp(argv[i], "--model") == 0) { + if (i + 1 < argc) { + arg_model = argv[i + 1]; + ++i; + } + } else if (strcmp(argv[i], "--ftype") == 0) { + if (i + 1 < argc) { + arg_ftype = argv[i + 1]; + ++i; + } + } else if (strcmp(argv[i], "--m_num") == 0) { + if (i + 1 < argc) { + arg_m_num = argv[i + 1]; + ++i; + } + } else if (strcmp(argv[i], "--n_pass") == 0) { + if (i + 1 < argc) { + arg_n_pass = argv[i + 1]; + ++i; + } + } else if (strcmp(argv[i], "--n_threads") == 0) { + if (i + 1 < argc) { + arg_n_threads = argv[i + 1]; + ++i; + } + } else if (strcmp(argv[i], "--file") == 0) { + if (i + 1 < argc) { + arg_file = argv[i + 1]; + ++i; + } + } else if (strcmp(argv[i], "-y") == 0) { + always_yes = true; + } else { + fprintf(stderr, "invalid arg: %s\n", argv[i]); + usage(argv[0]); + return 1; + } + } + + enum ggml_ftype ftype = GGML_FTYPE_MOSTLY_Q4_0; + { + if (arg_ftype != NULL) { + int v = atoi(arg_ftype); + ftype = (enum ggml_ftype)v; + } + + if (ftype > GGML_FTYPE_MOSTLY_Q5_1) { + fprintf(stderr, "k_quants type %d is not implemented\n", ftype); + return 1; + } + } + + if (arg_file != NULL && !always_yes) { + struct stat st; + int rc = stat(arg_file, &st); + UNUSED(st); + if (rc == 0) { // prompt + size_t len = strlen(arg_file) + 50; + char *prompt = (char *)malloc(len); + GGML_ASSERT(prompt); + snprintf(prompt, len, "data file '%s' exists, override?", arg_file); + + if (!prompt_yes_no(prompt)) { + printf("Aborted.\n"); + return 1; + } + free(prompt); + } + } + + int m_num = 10; + { + if (arg_m_num != NULL) { + int v = atoi(arg_m_num); + m_num = v; + } + + if (m_num < 6 || m_num > 12) { + fprintf(stderr, "invalid m_num: %d, expect between [6, 12]\n", + m_num); + usage(argv[0]); + return 1; + } + } + + int n_pass = 1; + { + if (arg_n_pass != NULL) { + int v = atoi(arg_n_pass); + n_pass = v; + } + if (n_pass < 1 || n_pass > GGML_MULMAT_MAX_PASS) { + fprintf(stderr, "invalid n_pass: %d, expect between [1, %d]\n", + n_pass, GGML_MULMAT_MAX_PASS); + usage(argv[0]); + return 1; + } + } + + int n_threads = 1; + { + if (arg_n_threads != NULL) { + int v = atoi(arg_n_threads); + n_threads = v; + if (n_threads < 1 || n_threads > 16) { + fprintf(stderr, + "invalid n_threads: %d, expect between [1, 16]\n", + n_threads); + usage(argv[0]); + return 1; + } + } + } + + const char *model_name = "7B"; + { + if (arg_model != NULL) { + model_name = arg_model; + } + } + + // Let init message print earlier. 
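+    // (The 1-byte context below is created and freed right away; it only
+    // forces ggml's one-time initialization so that any init-time output
+    // shows up before the bench progress lines.)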
+ { + struct ggml_init_params init_params = { + /*.mem_size =*/1, + /*.mem_buffer =*/NULL, + /*.no_alloc =*/0, + }; + struct ggml_context *ctx = ggml_init(init_params); + GGML_ASSERT(ctx); + ggml_free(ctx); + } + + struct ggml_mulmat_tune tune; + + struct ggml_mulmat_tune_params params; + memset(¶ms, 0, sizeof(struct ggml_mulmat_tune_params)); + + ggml_mulmat_init_task_profiles(); + + ggml_mulmat_tune_model_init(¶ms.model, model_name, ftype); + params.m_num = m_num; + params.n_pass = n_pass; + params.n_threads = n_threads; + params.progress = true; + params.output_console = true; + params.fname = arg_file; + + bool ok = ggml_mulmat_tune_bench(&tune, ¶ms); + return ok ? 0 : 1; +} diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index ae8cfe0afc0b7..1f14c18def3a3 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -158,6 +158,16 @@ int main(int argc, char ** argv) { return 1; } +#ifdef GGML_USE_MULMAT_TUNE + if (params.tune || !params.tune_file.empty()){ + bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str()); + if (!ok || (params.tune && !params.tune_file.empty())) { + llama_free(ctx); + return ok? 0: 1; + } + } +#endif + // print system information { fprintf(stderr, "\n"); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 16488b9f9067f..cf52109bce96e 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2571,7 +2571,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ func = ggml_cuda_rms_norm; break; case GGML_OP_MUL_MAT: - if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) { + if (!any_on_device/* && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)*/) { return false; } func = ggml_cuda_mul_mat; diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 95f4cec6dd59c..b2300a104ddb2 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1628,7 +1628,7 @@ bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_ } void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) { - GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst)); + // GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst)); if (src0->type == GGML_TYPE_F32) { ggml_cl_mul_mat_f32(src0, src1, dst); diff --git a/ggml-threading.c b/ggml-threading.c new file mode 100644 index 0000000000000..cf17793f6be61 --- /dev/null +++ b/ggml-threading.c @@ -0,0 +1,620 @@ + +#include +#include +#include + +#include "ggml-threading.h" +#include "ggml.h" + +#define UNUSED(x) (void)(x) + +// see https://github.com/ggerganov/llama.cpp/pull/1314 +#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) +#include +static inline void ggml_spin_pause(void) { _mm_pause(); } +#else +static inline void ggml_spin_pause(void) {} +#endif + +#if defined(_WIN32) + +#include + +typedef volatile LONG atomic_int; +typedef atomic_int atomic_bool; +typedef LONG atomic_flag; + +typedef CRITICAL_SECTION pthread_mutex_t; +typedef CONDITION_VARIABLE pthread_cond_t; +typedef void pthread_mutexattr_t; +typedef void pthread_condattr_t; + +typedef HANDLE pthread_t; + +static void atomic_store(atomic_int *ptr, LONG val) { + InterlockedExchange(ptr, val); +} + +static LONG atomic_load(atomic_int *ptr) { + return InterlockedCompareExchange(ptr, 0, 0); +} + +static LONG atomic_fetch_add(atomic_int *ptr, LONG inc) { + return InterlockedExchangeAdd(ptr, inc); +} + +static LONG atomic_fetch_sub(atomic_int 
*ptr, LONG dec) { + return atomic_fetch_add(ptr, -(dec)); +} + +static inline LONG atomic_flag_test_and_set(volatile atomic_flag *ptr) { + return InterlockedCompareExchange(ptr, 1, 0); +} +static inline LONG atomic_flag_clear(volatile atomic_flag *ptr) { + return InterlockedExchange(ptr, 0); +} +static int pthread_create(pthread_t *out, void *unused, + ggml_thread_ret_t (*func)(void *), void *arg) { + (void)unused; + HANDLE handle = + CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, arg, 0, NULL); + if (handle == NULL) { + return EAGAIN; + } + + *out = handle; + return 0; +} + +static int pthread_join(pthread_t thread, void *unused) { + (void)unused; + return (int)WaitForSingleObject(thread, INFINITE); +} + +static int pthread_mutex_init(pthread_mutex_t *mutex, + pthread_mutexattr_t *attr) { + (void)attr; + InitializeCriticalSection(mutex); + return 0; +} + +static int pthread_mutex_destroy(pthread_mutex_t *mutex) { + DeleteCriticalSection(mutex); + return 0; +} + +static int pthread_mutex_lock(pthread_mutex_t *mutex) { + EnterCriticalSection(mutex); + return 0; +} + +static int pthread_mutex_unlock(pthread_mutex_t *mutex) { + LeaveCriticalSection(mutex); + return 0; +} + +static int pthread_cond_init(pthread_cond_t *cond, pthread_condattr_t *attr) { + (void)attr; + InitializeConditionVariable(cond); + return 0; +} + +static int pthread_cond_destroy(pthread_cond_t *cond) { + (void)cond; + return 0; +} + +static int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex) { + SleepConditionVariableCS(cond, mutex, INFINITE); + return 0; +} + +static int pthread_cond_signal(pthread_cond_t *cond) { + WakeConditionVariable(cond); + return 0; +} + +static int pthread_cond_broadcast(pthread_cond_t *cond) { + WakeAllConditionVariable(cond); + return 0; +} + +static int sched_yield(void) { + // https://learn.microsoft.com/en-us/windows/win32/api/winnt/nf-winnt-yieldprocessor + YieldProcessor(); + return 0; +} + +#else // ! _WIN32 + +#include +#include +#include + +#endif + +// #define GGML_THREADING_DEBUG 1 + +#ifdef GGML_THREADING_DEBUG +#define PRINT_DEBUG(...) fprintf(stdout, __VA_ARGS__) +#else +#define PRINT_DEBUG(...) +#endif + +struct ggml_perf_stats { + int runs; + + // total cycles + atomic_int cycles; + + // total time in us. + atomic_int time_us; +}; + +struct ggml_compute_state_shared { + atomic_flag spin; + pthread_mutex_t mutex; + pthread_cond_t cond; + + // number of threads that has entered thread runner. + atomic_int n_ready; + + // number of assigned but unfinished tasks, workers decrease it. + atomic_int n_tasks; + + // number of waiting workers, workers increase it. + atomic_int n_waiting; + + // commands. + atomic_bool wait_now; + atomic_bool wait_on_done; + atomic_bool stop; + + ggml_threading_task_runner *task_runner; + + struct ggml_threading_context *ctx; +}; +struct ggml_compute_state { + pthread_t thrd; + + atomic_bool has_work; + struct ggml_compute_params params; + struct ggml_tensor *node; + + struct ggml_compute_state_shared *shared; +}; +struct ggml_threading_context { + int n_threads; + struct ggml_compute_state_shared shared; + struct ggml_compute_state *workers; + + enum ggml_threading_features features; + + struct ggml_perf_stats wait_perf; + struct ggml_perf_stats wakeup_perf; + + int64_t *stages_time; +}; + +// NOTE: ggml_spin_lock and ggml_spin_unlock may can be noop if +// feature wait_on_done is off. 
+static inline void ggml_spin_lock(volatile atomic_flag *obj) { + while (atomic_flag_test_and_set(obj)) { + ggml_spin_pause(); + } +} + +static inline void ggml_spin_unlock(volatile atomic_flag *obj) { + atomic_flag_clear(obj); +} + +static inline void ggml_perf_collect(struct ggml_perf_stats *st, int64_t c0, + int64_t t0) { + st->runs++; + st->cycles += (ggml_cycles() - c0); + st->time_us += (ggml_time_us() - t0); +} + +// A worker thread goes cond waiting. +// NOTE: must be protected by shared->spin +static void ggml_threading_cond_wait(struct ggml_compute_state *state) { + struct ggml_compute_state_shared *shared = state->shared; + + int64_t perf_cycles_0 = 0; + int64_t perf_time_0 = 0; + + if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) { + perf_cycles_0 = ggml_cycles(); + perf_time_0 = ggml_time_us(); + } + + GGML_ASSERT(pthread_mutex_lock(&shared->mutex) == 0); + + if (!shared->wait_now) { + GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0); + ggml_spin_unlock(&shared->spin); + return; + } + + shared->n_waiting++; + ggml_spin_unlock(&shared->spin); + + GGML_ASSERT(pthread_cond_wait(&shared->cond, &shared->mutex) == 0); + GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0); + + ggml_spin_lock(&shared->spin); + + shared->n_waiting--; + + if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) { + ggml_perf_collect(&shared->ctx->wait_perf, perf_cycles_0, perf_time_0); + } +} + +// Wakeup all workers. +// +// Workers takes some time to wakeup, and has to lock spin after wakeup. Yield +// is used to avoid signal frequently. Current implementation is highly +// experimental. See tests/test-ggml-threading.c for details. +// +// NOTE: must be protected by shared->spin +static void +ggml_threading_wakeup_workers(struct ggml_compute_state_shared *shared) { + int64_t perf_cycles_0 = 0; + int64_t perf_time_0 = 0; + + if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) { + perf_cycles_0 = ggml_cycles(); + perf_time_0 = ggml_time_us(); + } + + shared->wait_now = false; + + int loop_counter = 0; + int notify_counter = 0; + int64_t last_signal_time = 0; + + while (shared->n_waiting != 0) { + ggml_spin_unlock(&shared->spin); + + if (loop_counter > 0) { + ggml_spin_pause(); + if (loop_counter > 3) { + sched_yield(); + } + } + ++loop_counter; + + // TODO: should bench actual average wait/wakeup time. + if (last_signal_time > 0 && (ggml_time_us() - last_signal_time) < 10) { + continue; + } + + GGML_ASSERT(pthread_mutex_lock(&shared->mutex) == 0); + GGML_ASSERT(pthread_cond_broadcast(&shared->cond) == 0); + GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0); + ++notify_counter; + last_signal_time = ggml_time_us(); + + ggml_spin_lock(&shared->spin); + } + + if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) { + ggml_perf_collect(&shared->ctx->wakeup_perf, perf_cycles_0, + perf_time_0); + } + + // if (notify_counter > 1) { + // printf("%s: loop counter: %d, notify counter: %d\n", __func__, + // loop_counter, notify_counter); + // } + UNUSED(notify_counter); +} + +// Setup workers for a task stage. 
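+// For a parallel stage it wakes up any waiting workers (and, with the
+// wait-on-done feature, may ask them to wait again right after finishing);
+// for a serial stage marked `wait` it parks all workers in cond wait.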
+// NOTE: must be protected by shared->spin +static void ggml_threading_setup_workers(struct ggml_threading_context *ctx, + struct ggml_task_profile *profile, + enum ggml_task_type type) { + PRINT_DEBUG("[main] setup workers for task ...\n"); + +#ifdef GGML_THREADING_DEBUG + int64_t t0 = ggml_time_us(); +#endif + + const int n_worker_threads = ctx->n_threads - 1; + struct ggml_task_stage *current = &profile->stages[type]; + struct ggml_compute_state_shared *shared = &ctx->shared; + + if (current->parallel) { + if (shared->n_waiting > 0) { + ggml_threading_wakeup_workers(shared); + } + + if ((ctx->features & GGML_THREADING_FEATURE_WAIT_ON_DONE) > 0) { + // Optimize energy: wait_on_done. We MAY also check following nodes, + // but that's a bit complicated. + shared->wait_on_done = false; + for (int i = type + 1; i <= GGML_TASK_FINALIZE; i++) { + struct ggml_task_stage *next = &profile->stages[i]; + if (next->parallel) { + break; + } + if (next->wait) { + shared->wait_on_done = true; + PRINT_DEBUG("[main] wait_on_done is enabled for " + "current task stage\n"); + break; + } + } + } + } else if (current->wait) { + if (shared->n_waiting < n_worker_threads) { + shared->wait_now = true; + PRINT_DEBUG("[main] wait_now was set, expect %d workers wait\n", + n_worker_threads); + ggml_spin_unlock(&shared->spin); + + while (shared->n_waiting != n_worker_threads) { + ggml_spin_pause(); + } + + ggml_spin_lock(&shared->spin); + PRINT_DEBUG("[main] saw %d workers waiting\n", n_worker_threads); + } + } + + PRINT_DEBUG("[main] setup workers for task took %d us\n", + (int)(ggml_time_us() - t0)); +} + +ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) { + GGML_ASSERT(data); + struct ggml_compute_state *state = (struct ggml_compute_state *)data; + GGML_ASSERT(state); + + struct ggml_compute_state_shared *shared = state->shared; + GGML_ASSERT(shared); + GGML_ASSERT(shared->task_runner); + + shared->n_ready++; + + PRINT_DEBUG("[%d-th] running\n", state->params.ith); + + while (!shared->stop) { + if (shared->wait_now) { + ggml_spin_lock(&shared->spin); + if (!state->has_work) { + ggml_threading_cond_wait(state); + } + ggml_spin_unlock(&shared->spin); + } + + if (shared->n_tasks > 0 && state->has_work) { + enum ggml_compute_error err = + shared->task_runner(&state->params, state->node); + + GGML_ASSERT(err == GGML_COMPUTE_OK || err == GGML_COMPUTE_FALLBACK); + + ggml_spin_lock(&shared->spin); + + state->has_work = false; + shared->n_tasks--; + + bool wait = shared->wait_on_done && !state->has_work; + if (wait) { + ggml_threading_cond_wait(state); + } + + ggml_spin_unlock(&shared->spin); + + // no need to pause. + if (wait) { + continue; + } + } + + ggml_spin_pause(); + } + + PRINT_DEBUG("[%d-th] exited\n", state->params.ith); + return 0; +} + +enum ggml_compute_error +ggml_threading_compute_tensor(struct ggml_threading_context *ctx, + struct ggml_tensor *node, void *wdata, + size_t wsize) { + GGML_ASSERT(ctx); + GGML_ASSERT(node); + + GGML_ASSERT(ctx->shared.task_runner); + struct ggml_compute_state_shared *state_shared = &ctx->shared; + + // This is the params for main thread. 
+ struct ggml_compute_params params; + enum ggml_compute_error err; + + for (int type = GGML_TASK_INIT; type <= GGML_TASK_FINALIZE; type++) { + if (node->task_profile.stages[type].backend == GGML_TASK_BACKEND_NONE) { + continue; + } + + PRINT_DEBUG("[main] stage: %d\n", type); + + int64_t t_stage = 0; + if (ctx->stages_time) { + t_stage = ggml_time_us(); + } + + // n_tasks is the total number of parallel computing tasks + // (including main thread). + int n_tasks = + node->task_profile.stages[type].parallel ? ctx->n_threads : 1; + + ggml_spin_lock(&state_shared->spin); + + if (ctx->n_threads > 1) { + ggml_threading_setup_workers(ctx, &node->task_profile, type); + } + + if (n_tasks > 1) { + // setup compute task parameters. + for (int j = 0; j < n_tasks - 1; j++) { + ctx->workers[j].params = (struct ggml_compute_params){ + .type = type, + .ith = j + 1, + .nth = n_tasks, + .wsize = wsize, + .wdata = wdata, + }; + ctx->workers[j].node = node; + ctx->workers[j].has_work = true; + } + state_shared->n_tasks = n_tasks - 1; + PRINT_DEBUG("[main] assigned %d tasks\n", state_shared->n_tasks); + } + + ggml_spin_unlock(&state_shared->spin); + + // main thread always run the 0-th task. + // TODO: assert(params->nth == 1) instead of + // assert(params->ith == 0) + { + params.type = type; + params.ith = 0; + params.nth = n_tasks; + params.wsize = wsize; + params.wdata = wdata; + + err = state_shared->task_runner(¶ms, node); + } + + // wait for tasks done. + if (n_tasks > 1) { + while (state_shared->n_tasks != 0) { + ggml_spin_pause(); + } + } + + PRINT_DEBUG("[main] all tasks finished\n\n"); + + if (ctx->stages_time) { + ctx->stages_time[type] = ggml_time_us() - t_stage; + } + + if (err != GGML_COMPUTE_OK) { + return err; + } + } + + return GGML_COMPUTE_OK; +} + +struct ggml_threading_context * +ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread_runner, + ggml_threading_task_runner *task_stage_runner, + enum ggml_threading_features features, + int64_t stages_time[3]) { + GGML_ASSERT(n_threads > 0); + GGML_ASSERT(thread_runner); + GGML_ASSERT(task_stage_runner); + + size_t ctx_sz = sizeof(struct ggml_threading_context); + struct ggml_threading_context *ctx = malloc(ctx_sz); + GGML_ASSERT(ctx); + memset(ctx, 0, ctx_sz); + + ctx->shared = (struct ggml_compute_state_shared){ + .spin = {0}, + .n_ready = 0, + .n_tasks = 0, + .n_waiting = 0, + .wait_now = false, + .wait_on_done = false, + .stop = false, + .task_runner = task_stage_runner, + .ctx = ctx, + }; + + PRINT_DEBUG("[main] thread start, features: %d\n", features); + + ctx->n_threads = n_threads; + ctx->features = features; + ctx->stages_time = stages_time; + + int n_workers = n_threads - 1; + if (n_workers > 0) { + GGML_ASSERT(pthread_mutex_init(&ctx->shared.mutex, NULL) == 0); + GGML_ASSERT(pthread_cond_init(&ctx->shared.cond, NULL) == 0); + + size_t workers_sz = sizeof(struct ggml_compute_state) * n_workers; + struct ggml_compute_state *workers = malloc(workers_sz); + GGML_ASSERT(workers); + memset(workers, 0, workers_sz); + + for (int j = 0; j < n_workers; j++) { + workers[j].shared = &ctx->shared; + GGML_ASSERT(pthread_create(&workers[j].thrd, NULL, thread_runner, + &workers[j]) == 0); + } + + ctx->workers = workers; + + while (ctx->shared.n_ready != n_workers) { + ggml_spin_pause(); + } + } + + return ctx; +} + +static void +ggml_threading_print_perf_stats(struct ggml_threading_context *ctx) { + bool print_stats = ctx->features & GGML_THREADING_FEATURE_PERF; +#ifdef GGML_THREADING_DEBUG + print_stats = true; +#endif + + if 
(!print_stats) { + return; + } + + const char *prefix_arr[2] = {"[threading wait ]", "[threading wakeup]"}; + struct ggml_perf_stats *st_arr[2] = {&ctx->wait_perf, &ctx->wakeup_perf}; + for (int i = 0; i < 2; i++) { + struct ggml_perf_stats *st = st_arr[i]; + if (st->runs == 0) { + continue; + } + fprintf(stdout, + "%s runs: %4d, avg cycles: %8.3f ms, avg time: " + "%8.3f ms\n", + prefix_arr[i], st->runs, + 1.0 * st->cycles / (st->runs * ggml_cycles_per_ms()), + 1.0 * st->time_us / (st->runs * 1000)); + } +} + +void ggml_threading_stop(struct ggml_threading_context *ctx) { + GGML_ASSERT(ctx); + + if (ctx->workers) { + PRINT_DEBUG("[main] stopping thread pool ...\n"); + ctx->shared.stop = true; + + ggml_spin_lock(&ctx->shared.spin); + ggml_threading_wakeup_workers(&ctx->shared); + ggml_spin_unlock(&ctx->shared.spin); + + for (int j = 0; j < ctx->n_threads - 1; j++) { + GGML_ASSERT(pthread_join(ctx->workers[j].thrd, NULL) == 0); + } + free(ctx->workers); + PRINT_DEBUG("[main] thread pool stopped\n"); + } + + ggml_threading_print_perf_stats(ctx); + + free(ctx); +} diff --git a/ggml-threading.h b/ggml-threading.h new file mode 100644 index 0000000000000..f3214efc7cb7d --- /dev/null +++ b/ggml-threading.h @@ -0,0 +1,68 @@ +#pragma once + +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_WIN32) +typedef int ggml_thread_ret_t; +#else +typedef void *ggml_thread_ret_t; +#endif + +struct ggml_threading_context; + +// Optional (experimental) features. +enum ggml_threading_features { + GGML_THREADING_FEATURE_NONE = 0, + GGML_THREADING_FEATURE_WAIT_ON_DONE = 1 << 0, + GGML_THREADING_FEATURE_PERF = 1 << 1, +}; + +// Compute errors. +enum ggml_compute_error { + GGML_COMPUTE_OK = 0, + GGML_COMPUTE_FALLBACK = 1, +}; + +// The task runner to be called by main thread and workers. +typedef enum ggml_compute_error(ggml_threading_task_runner)( + struct ggml_compute_params *params, struct ggml_tensor *node); + +// The thread runner to feed into OS threads. +typedef ggml_thread_ret_t(ggml_threading_thread_runner)(void *data); + +// Init and start underlying workers if n_threads > 1. +// +// features: optional for configure threading additional features. +// see `ggml_threading_feature`, default 0. +// stages_time: optional for collecting per-stage wall clock time. +struct ggml_threading_context * +ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread, + ggml_threading_task_runner *task_stage_runner, + enum ggml_threading_features features, + int64_t stages_time[3]); + +// Stop workers (if exist), free memories (including the ctx). +void ggml_threading_stop(struct ggml_threading_context *ctx); + +// The default implementation of `ggml_threading_thread_runner` +ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data); + +// Compute a tensor. It computes the enabled task stages one by one. +// Caller should take care of the return error: retry for fallback error. +enum ggml_compute_error +ggml_threading_compute_tensor(struct ggml_threading_context *ctx, + struct ggml_tensor *node, void *wdata, + size_t wsize); + +// This is an experimental functionality for mulmat tune, as a thin wrapper. 
+enum ggml_compute_error +ggml_compute_forward_wrapper(struct ggml_compute_params *params, + struct ggml_tensor *tensor); + +#ifdef __cplusplus +} +#endif diff --git a/ggml-tune.c b/ggml-tune.c new file mode 100644 index 0000000000000..fbca953ed469e --- /dev/null +++ b/ggml-tune.c @@ -0,0 +1,897 @@ +#include + +#include "ggml-threading.h" +#include "ggml-tune.h" +#include "ggml.h" + +// MUL_MAT fine tunning for non-GPU-offloading cases. + +#define GGML_MULMAT_CACHE_LEN 16 +static struct mm_cache_element default_mm_cache[GGML_MULMAT_CACHE_LEN] = {0}; + +#define FNV_OFFSET 14695981039346656037UL +#define FNV_PRIME 1099511628211UL +static uint64_t ggml_mulmat_tune_cache_hash(int M, int N, int K) { + char buf[30]; + snprintf(buf, 30, "%d%d%d", M, N, K); + + uint64_t hash = FNV_OFFSET; + for (const char *p = buf; *p; p++) { + hash ^= (uint64_t)(unsigned char)(*p); + hash *= FNV_PRIME; + } + return hash; +} + +static const char * +ggml_mulmat_tune_task_backend_name(enum ggml_task_backend backend) { + switch (backend) { + case GGML_TASK_BACKEND_NONE: + return ""; + case GGML_TASK_BACKEND_CPU: + return "CPU"; + case GGML_TASK_BACKEND_CPU_BLAS: + return "BLAS"; + case GGML_TASK_BACKEND_GPU: + return "GPU"; + case GGML_TASK_BACKEND_GPU_CUDA: + return "CUDA"; + case GGML_TASK_BACKEND_GPU_CL: + return "CL"; + default: + GGML_ASSERT(false); + } +} + +const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile( + struct ggml_mulmat_tune *tune, int M, int N, int K, enum ggml_type src0_t, + enum ggml_type src1_t, int stages_time[3]) { + GGML_ASSERT(tune); + + // TODO: default_mm_cache is thread-unsafe. + struct mm_cache_element *mm_cache = default_mm_cache; + int slot = ggml_mulmat_tune_cache_hash(M, N, K) % GGML_MULMAT_CACHE_LEN; + struct mm_cache_element *e = &mm_cache[slot]; + + struct ggml_mulmat_tune_time profiles_time[GGML_MAX_TASK_PROFILES] = {0}; + + struct ggml_task_profile *prof = NULL; + + if (e->M == M && e->N == N && e->K == K) { + prof = e->profile; + if (stages_time) { + for (int i = 0; i < 3; i++) { + stages_time[i] = e->stages_time[i]; + } + } + } else { + const struct ggml_mulmat_tune_shape *shape = NULL; + shape = ggml_mulmat_tune_get_shape(tune, N, K, src0_t, src1_t); + if (shape) { + ggml_mulmat_tune_estimate_time(shape, M, profiles_time); + + int min = INT32_MAX; + int index = -1; + for (int i = 0; i < shape->n_profiles; i++) { + int total = profiles_time[i].total_time; + if (total < min) { + min = total; + index = i; + } + } + + if (index >= 0) { + prof = profiles_time[index].profile; + for (int i = 0; i < 3; i++) { + int t = profiles_time[index].stage_time[i]; + if (stages_time) { + stages_time[i] = t; + } + e->stages_time[i] = t; + } + + GGML_ASSERT(prof); + + e->profile = prof; + e->M = M; + e->N = N; + e->K = K; + + // to disable this, build with + // `make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1 + // make` +#if !defined(GGML_MULMAT_TUNE_NDEBUG) + const char *names[3]; + for (int i = 0; i < 3; i++) { + names[i] = ggml_mulmat_tune_task_backend_name( + prof->stages[i].backend); + } + printf( + "\n[mulmat tune] M: %3d, N: %5d, K: %5d, backends of the " + "fastest profile: %s %s %s\n", + M, N, K, names[0], names[1], names[2]); +#endif + } + } + } + + return prof; +} + +void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model, + const char *name, enum ggml_ftype ftype) { + const int n_vocab = 32000; + int n_embd; + // n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult + int n_ff; + // n_rot = n_embd/n_head; + int n_rot; + + if (strcmp(name, "3B") 
== 0) { + // n_head=32, n_mult=216, n_layer=26 + // https://github.com/ggerganov/llama.cpp/pull/1588 + n_embd = 3200; + n_ff = 8640; + n_rot = 100; + } else if (strcmp(name, "7B") == 0) { + n_embd = 4096; + n_ff = 11008; + n_rot = 128; + } else if (strcmp(name, "13B") == 0) { + n_embd = 5120; + n_ff = 13824; + n_rot = 128; + } else if (strcmp(name, "30B") == 0) { + n_embd = 6656; + n_ff = 17920; + n_rot = 128; + } else if (strcmp(name, "65B") == 0) { + n_embd = 8192; + n_ff = 22016; + n_rot = 128; + } else { + GGML_ASSERT(false); + } + + model->name = name; + model->ftype = ftype; + model->n_vocab = n_vocab; + model->n_embd = n_embd; + model->n_ff = n_ff; + model->n_rot = n_rot; +} + +bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune, + struct ggml_mulmat_tune_params *params, + struct ggml_task_profile_factory *pf) { + + struct ggml_mulmat_tune_model *model = ¶ms->model; + + memset(tune, 0, sizeof(struct ggml_mulmat_tune)); + + tune->version = GGML_MULMAT_TUNE_VERSION; + tune->n_threads = params->n_threads; + tune->ftype = model->ftype; + + size_t name_len = strlen(model->name); + GGML_ASSERT(name_len > 0); + strncpy(tune->model, model->name, sizeof(tune->model) - 1); + + const enum ggml_type rot_src0_type = GGML_TYPE_F16; + const enum ggml_type src1_type = GGML_TYPE_F32; + + int n_vocab = model->n_vocab; + int n_embd = model->n_embd; + int n_ff = model->n_ff; + int n_rot = model->n_rot; + + enum ggml_type type = ggml_ftype_to_ggml_type(model->ftype); + + GGML_ASSERT(GGML_MULMAT_N_SHAPES >= 6); + tune->n_shapes = GGML_MULMAT_N_SHAPES; + + // Attention layers + tune->shapes[0] = (struct ggml_mulmat_tune_shape){ + .N = n_embd, .K = n_embd, .src0_type = type, .src1_type = src1_type}; + // Feed forward layers + tune->shapes[1] = (struct ggml_mulmat_tune_shape){ + .N = n_embd, .K = n_ff, .src0_type = type, .src1_type = src1_type}; + tune->shapes[2] = (struct ggml_mulmat_tune_shape){ + .N = n_ff, .K = n_embd, .src0_type = type, .src1_type = src1_type}; + tune->shapes[3] = (struct ggml_mulmat_tune_shape){ + .N = n_vocab, .K = n_embd, .src0_type = type, .src1_type = src1_type}; + // RoPE + tune->shapes[4] = (struct ggml_mulmat_tune_shape){ + .N = n_rot, .K = 0, .src0_type = rot_src0_type, .src1_type = src1_type}; + tune->shapes[5] = (struct ggml_mulmat_tune_shape){ + .N = 0, .K = n_rot, .src0_type = rot_src0_type, .src1_type = src1_type}; + + for (int i = 0; i < tune->n_shapes; i++) { + struct ggml_mulmat_tune_shape *shape = &tune->shapes[i]; + shape->n_profiles = ggml_mulmat_get_task_profiles( + pf, shape->src0_type, shape->src1_type, &shape->profiles); + if (shape->n_profiles == 0) { + // allowed for testing. + continue; + } + + shape->m_num = params->m_num; + shape->arr_m = malloc(shape->m_num * sizeof(int)); + for (int j = 0; j < shape->m_num; j++) { + shape->arr_m[j] = 1 << j; + } + + size_t sz = sizeof(struct ggml_mulmat_tune_m) * + (shape->n_profiles * shape->m_num); + shape->items = malloc(sz); + GGML_ASSERT(shape->items); + memset(shape->items, 0, sz); + } + + return true; +} + +void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune) { + for (int i = 0; i < tune->n_shapes; i++) { + struct ggml_mulmat_tune_shape *shape = &tune->shapes[i]; + GGML_ASSERT(shape); + + // arr_m and items can be NULL only when testing. 
+ if (shape->arr_m) { + free(shape->arr_m); + } + if (shape->items) { + free(shape->items); + } + } +} + +static bool ggml_mulmat_tune_write_profiles( + FILE *fp, const struct ggml_task_profile *profiles, int n_profiles) { + int rc; + for (int i = 0; i < n_profiles; i++) { + const struct ggml_task_profile *profile = &profiles[i]; + for (int j = 0; j < 3; j++) { + const struct ggml_task_stage *ts = &profile->stages[j]; + rc = fprintf(fp, "%2d %d %d", ts->backend, ts->parallel ? 1 : 0, + ts->wait ? 1 : 0); + if (rc <= 0) { + return false; + } + if (j < 2) { + rc = fprintf(fp, " "); + if (rc <= 0) { + return false; + } + } + } + rc = fprintf(fp, "\n"); + if (rc <= 0) { + return false; + } + } + + return true; +} + +static bool +ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune, + const char *model, int ftype, int n_threads, + char *errbuf, int errbuf_len) { + + if (tune->version != GGML_MULMAT_TUNE_VERSION) { + snprintf(errbuf, errbuf_len - 1, + "version mismatch, built-in: %d, " + "yours: %d", + GGML_MULMAT_TUNE_VERSION, tune->version); + return false; + } else if (strcmp(model, tune->model) != 0) { + snprintf(errbuf, errbuf_len - 1, + "model mismatch. built-in: %s, yours: %s", model, tune->model); + return false; + } else if (ftype != tune->ftype) { + snprintf(errbuf, errbuf_len - 1, + "ftype mismatch. built-in: %d, yours: %d\n", ftype, + tune->ftype); + return false; + } else if (n_threads != tune->n_threads) { + snprintf(errbuf, errbuf_len - 1, + "n_threads mismatch. run-time: %d, yours: %d\n", n_threads, + tune->n_threads); + return false; + } + + for (int i = 0; i < tune->n_shapes; i++) { + const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i]; + + struct ggml_task_profile *builtin_profiles = NULL; + int n_profiles = ggml_mulmat_get_task_profiles( + NULL, shape->src0_type, shape->src1_type, &builtin_profiles); + + if (n_profiles != shape->n_profiles) { + snprintf(errbuf, errbuf_len - 1, "task profiles mismatch"); + return false; + } + + // TODO: profiles order is relevant, too strict. + size_t sz = sizeof(struct ggml_task_profile) * n_profiles; + if (memcmp(builtin_profiles, shape->profiles, sz) != 0) { + snprintf(errbuf, errbuf_len - 1, "task profiles mismatch"); + + printf("=== built-in profiles:\n"); + ggml_mulmat_tune_write_profiles(stderr, builtin_profiles, + n_profiles); + + printf("=== incoming profiles:\n"); + ggml_mulmat_tune_write_profiles(stderr, shape->profiles, + shape->n_profiles); + return false; + } + } + + return true; +} + +bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune, + const char *model, int ftype, int n_threads) { + char errbuf[128]; + bool ok = ggml_mulmat_tune_validate_internal(tune, model, ftype, n_threads, + errbuf, sizeof(errbuf)); + if (!ok) { + fprintf(stderr, "[mulmat tune] error: %s. 
run bench again.\n", errbuf); + } + + return ok; +} + +bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { + int rc = fscanf(fp, "%d", &tune->version); + if (rc <= 0) { + return false; + } + + if (tune->version != GGML_MULMAT_TUNE_VERSION) { + fprintf(stderr, "[mulmat tune] version mismatch, run bench again\n"); + return false; + } + + rc = fscanf(fp, "%s %d %d %d", tune->model, (int *)&tune->ftype, + &tune->n_shapes, &tune->n_threads); + if (rc <= 0) { + return false; + } + + for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { + struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape]; + + rc = fscanf(fp, "%d %d %d %d %d %d", &shape->N, &shape->K, + (int *)&shape->src0_type, (int *)&shape->src1_type, + &shape->n_profiles, &shape->m_num); + if (rc <= 0) { + return false; + } + + { + size_t item_size = sizeof(struct ggml_mulmat_tune_m) * + (shape->n_profiles * shape->m_num); + shape->items = malloc(item_size); + if (shape->items == NULL) { + fprintf(stderr, "[mulmat tune] failed to allocate memory\n"); + return false; + } + memset(shape->items, 0, item_size); + } + + { + size_t sz = sizeof(struct ggml_task_profile) * shape->n_profiles; + shape->profiles = malloc(sz); + GGML_ASSERT(shape->profiles); + memset(shape->profiles, 0, sz); + } + + for (int ip = 0; ip < shape->n_profiles; ip++) { + struct ggml_task_profile *profile = &shape->profiles[ip]; + for (int j = 0; j < 3; j++) { + struct ggml_task_stage *ts = &profile->stages[j]; + int backend; + int parallel; + int wait; + rc = fscanf(fp, "%d %d %d", &backend, ¶llel, &wait); + if (rc <= 0) { + return false; + } + ts->backend = (enum ggml_task_backend)backend; + ts->parallel = parallel ? true : false; + ts->wait = wait ? true : false; + } + } + + for (int i_m = 0; i_m < shape->m_num; i_m++) { + int M; + for (int ip = 0; ip < shape->n_profiles; ip++) { + if (ip == 0) { + rc = fscanf(fp, "%d", &M); + if (rc <= 0) { + return false; + } + } + struct ggml_mulmat_tune_m *item = + &shape->items[ip * shape->m_num + i_m]; + item->M = M; + rc = fscanf(fp, "%d %d %d", &item->stages_time[0], + &item->stages_time[1], &item->stages_time[2]); + if (rc <= 0) { + return false; + } + } + } + } + + return true; +} + +bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, + FILE *fp) { + int rc; + rc = fprintf(fp, "%d %s %d %d %d\n\n", tune->version, tune->model, + tune->ftype, tune->n_shapes, tune->n_threads); + if (rc <= 0) { + return false; + } + + for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { + if (i_shape > 0) { + fprintf(fp, "\n"); + } + const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape]; + rc = fprintf(fp, "%d %d %d %d %d %d\n", shape->N, shape->K, + shape->src0_type, shape->src1_type, shape->n_profiles, + shape->m_num); + if (rc <= 0) { + return false; + } + + rc = ggml_mulmat_tune_write_profiles(fp, shape->profiles, + shape->n_profiles); + if (rc <= 0) { + return false; + } + + for (int i_m = 0; i_m < shape->m_num; i_m++) { + for (int ip = 0; ip < shape->n_profiles; ip++) { + struct ggml_mulmat_tune_m *item = + &shape->items[ip * shape->m_num + i_m]; + if (ip == 0) { + rc = fprintf(fp, "%4d", item->M); + if (rc <= 0) { + return false; + } + } + + struct ggml_task_profile *profile = &shape->profiles[ip]; + for (int k = 0; k < 3; k++) { + if (profile->stages[k].backend != GGML_TASK_BACKEND_NONE) { + rc = fprintf(fp, "%9d", item->stages_time[k]); + if (rc <= 0) { + return false; + } + } else { + rc = fprintf(fp, " 0"); + if (rc <= 0) { + return false; + } + } + } + } + rc 
= fprintf(fp, "\n"); + if (rc <= 0) { + return false; + } + } + } + + return true; +} + +const struct ggml_mulmat_tune_shape * +ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, const int N, + const int K, enum ggml_type src0_type, + enum ggml_type src1_type) { + GGML_ASSERT(N > 0 && K > 0); + + for (int i = 0; i < tune->n_shapes; i++) { + const struct ggml_mulmat_tune_shape *s = &tune->shapes[i]; + if (s->src0_type != src0_type || s->src1_type != src1_type) { + continue; + } + + if (s->N > 0 && s->K > 0) { + if (s->N == N && s->K == K) { + return s; + } + } else if (s->N > 0 && s->K == 0) { + if (s->N == N) { + return s; + } + } else if (s->N == 0 && s->K > 0) { + if (s->K == K) { + return s; + } + } + } + + return NULL; +} + +// This is the experimental reference implementation. +// Requires both n_threads are same at bench time and runtime. +void ggml_mulmat_tune_estimate_time( + const struct ggml_mulmat_tune_shape *shape, int M, + struct ggml_mulmat_tune_time *profile_time) { + + GGML_ASSERT(shape); + GGML_ASSERT(profile_time); + + const int m_num = shape->m_num; + const int min_m = shape->items[0].M; + const int max_m = shape->items[m_num - 1].M; + + for (int ip = 0; ip < shape->n_profiles; ip++) { + struct ggml_task_profile *profile = &shape->profiles[ip]; + profile_time[ip].total_time = 0; + profile_time[ip].profile = profile; + + const int items_offset = ip * m_num; + + struct ggml_mulmat_tune_m *p0 = NULL; + struct ggml_mulmat_tune_m *p1 = NULL; + if (M < min_m) { + // first two. + p0 = &shape->items[items_offset]; + p1 = &shape->items[items_offset + 1]; + } else if (M > max_m) { + // last two + p0 = &shape->items[items_offset + m_num - 2]; + p1 = &shape->items[items_offset + m_num - 1]; + } else { + for (int i = 0; i < m_num; i++) { + p1 = &shape->items[items_offset + i]; + if (p1->M == M) { + p0 = p1; + break; + } + + if (i > 0) { + p0 = (struct ggml_mulmat_tune_m *)(p1 - 1); + if (M > p0->M && M < p1->M) { + break; + } + } + } + } + + GGML_ASSERT(p0 && p1); + + for (int i_stage = 0; i_stage < 3; i_stage++) { + struct ggml_task_stage *stage = &profile->stages[i_stage]; + if (stage->backend == GGML_TASK_BACKEND_NONE) { + continue; + } + + int p0_v = p0->stages_time[i_stage]; + int p1_v = p1->stages_time[i_stage]; + + GGML_ASSERT(p0_v >= 0); + GGML_ASSERT(p1_v >= 0); + + // t = aM + b + double a; + double b; + + if (p0 == p1) { + a = 0.0; + b = p1_v; + } else { + a = 1.0 * (p1_v - p0_v) / (p1->M - p0->M); + b = p1_v - a * p1->M; + } + int t = (int)(a * M + b); + + profile_time[ip].stage_time[i_stage] = t; + profile_time[ip].total_time += t; + } + } +} + +// Experimental: create mul_mat tensor. +static struct ggml_tensor *ggml_mulmat_new_tensor(int M, int N, int K, + enum ggml_type src0_type, + struct ggml_context **ctx) { + // At most 256, because in `ggml_quantize_qx_x`, the index type of hist is + // either int8_t or uint8_t. + // Use 1024 to avoid suddenly broken. 
+ int64_t hist[1024]; + + bool src0_is_quantized = ggml_is_quantized(src0_type); + + size_t ctx_size = 0; + ctx_size += (size_t)(M * N * ggml_type_sizef(GGML_TYPE_F32)); // src1 + ctx_size += (size_t)(N * K * ggml_type_sizef(src0_type)); // src0 + ctx_size += (size_t)(1024 * 1024 * 64); // experimental + + if (src0_is_quantized) { + // quantize F32 to Qx_x + ctx_size += (size_t)(N * K * ggml_type_sizef(GGML_TYPE_F32)); + } + + struct ggml_init_params init_params = { + .mem_size = ctx_size, + .mem_buffer = NULL, + .no_alloc = 0, + }; + + *ctx = ggml_init(init_params); + GGML_ASSERT(*ctx); + + // src0: N x K + struct ggml_tensor *src0 = + ggml_new_tensor_2d(*ctx, src0_type, (int64_t)K, (int64_t)N); + + // src1: M x K + struct ggml_tensor *src1 = + ggml_new_tensor_2d(*ctx, GGML_TYPE_F32, (int64_t)K, (int64_t)M); + ggml_set_f32(src1, 0.5f); + + if (src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_F16) { + ggml_set_f32(src0, 0.1f); + } else if (src0_is_quantized) { + struct ggml_tensor *src0_f32 = + ggml_new_tensor_2d(*ctx, GGML_TYPE_F32, (int64_t)K, (int64_t)N); + ggml_set_f32(src0_f32, 0.1f); + + switch (src0_type) { + case GGML_TYPE_Q4_0: + ggml_quantize_q4_0((const float *)src0_f32->data, src0->data, N * K, + K, hist); + break; + case GGML_TYPE_Q4_1: + ggml_quantize_q4_1((const float *)src0_f32->data, src0->data, N * K, + K, hist); + break; + case GGML_TYPE_Q5_0: + ggml_quantize_q5_0((const float *)src0_f32->data, src0->data, N * K, + K, hist); + break; + case GGML_TYPE_Q5_1: + ggml_quantize_q5_1((const float *)src0_f32->data, src0->data, N * K, + K, hist); + break; + case GGML_TYPE_Q8_0: + ggml_quantize_q8_0((const float *)src0_f32->data, src0->data, N * K, + K, hist); + break; + default: + GGML_ASSERT(false); + } + } else { + GGML_ASSERT(false); + } + + // node: M x N + // Will compute z = y * xT, z: node, y: src1, x: src0 + return ggml_mul_mat(*ctx, src0, src1); +} + +// Experimental: allocate memory for wdata with max possible size. +// This part of code is actually belongs to ggml compute graph. +static size_t ggml_mulmat_allocate_wdata(int N, int K, char **wdata) { + // The size is actually determined by cgraph before computing. + // Apart from the src0_type, wsize is affected by backend, cache line size, + // n_threads etc. 
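+    // We simply over-allocate here: room for a full f32 dequantization of
+    // src0 (N*K floats) plus 1 MiB of slack.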
+ + const size_t extra = 1024 * 1024; + size_t sz = (size_t)(N * K * ggml_type_sizef(GGML_TYPE_F32)) + extra; + void *buf = malloc(sz); + + if (!buf) { + fprintf(stderr, + "[mulmat tune] error: failed to allocate %zu MiB memory", + sz / 1024 / 1024); + return 0; + } + + memset(buf, 0, sz); + *wdata = buf; + return sz; +} + +int ggml_mulmat_tune_get_builtin_task_backends( + enum ggml_task_backend *backends) { + int i = 0; + backends[i++] = GGML_TASK_BACKEND_CPU; + + if (ggml_cpu_has_cpublas()) { + backends[i++] = GGML_TASK_BACKEND_CPU_BLAS; + } + + if (ggml_cpu_has_cublas()) { + backends[i++] = GGML_TASK_BACKEND_GPU_CUDA; + } else if (ggml_cpu_has_clblast()) { + backends[i++] = GGML_TASK_BACKEND_GPU_CL; + } + return i; +} + +bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, + struct ggml_mulmat_tune_params *params) { + GGML_ASSERT(tune); + GGML_ASSERT(params); + GGML_ASSERT(params->model.name); + + enum ggml_task_backend backends[16]; + int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends); + if (n_backends < 2) { + fprintf(stderr, + "[mulmat tune] error: this program was not built with BLAS.\n"); + return false; + } + + bool ok = ggml_mulmat_tune_init(tune, params, NULL); + if (!ok) { + return false; + } + + { + char buf[128] = {0}; + int offset = 0; + + for (int i = 0; i < n_backends; i++) { + if (i > 0) { + buf[offset++] = ','; + buf[offset++] = ' '; + } + const char *name = ggml_mulmat_tune_task_backend_name(backends[i]); + size_t len = strlen(name); + memcpy(&buf[offset], name, len); + offset += (int)len; + } + + fprintf(stdout, + "[mulmat tune] model: %s, ggml ftype: %d, " + "n_pass: %d, n_threads: %d, n_shapes: %d, backends: %s\n", + params->model.name, params->model.ftype, params->n_pass, + params->n_threads, tune->n_shapes, buf); + } + + int64_t stages_time[3]; + int64_t t0 = ggml_time_ms(); + + struct ggml_threading_context *thrd_ctx = ggml_threading_start( + tune->n_threads, ggml_threading_graph_compute_thread, + ggml_compute_forward_wrapper, GGML_THREADING_FEATURE_WAIT_ON_DONE, + stages_time); + + for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { + const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape]; + int M; + int N = shape->N; + int K = shape->K; + + char buf[20] = {0}; + int buf_len = sizeof(buf) - 1; + int line_len = 0; + + for (int i_m = 0; i_m < shape->m_num; i_m++) { + M = shape->arr_m[i_m]; + if (shape->N == 0) { + N = M; + } else if (shape->K == 0) { + K = M; + } + + if (params->progress) { + line_len = snprintf(buf, buf_len, "%d %d %d ", N, K, M); + fprintf(stdout, "%s", buf); + fflush(stdout); + } + + char *wdata = NULL; + size_t wsize = ggml_mulmat_allocate_wdata(N, K, &wdata); + if (wsize == 0) { + return false; + } + + struct ggml_context *ctx = NULL; + struct ggml_tensor *node = + ggml_mulmat_new_tensor(M, N, K, shape->src0_type, &ctx); + + for (int ip = 0; ip < shape->n_profiles; ip++) { + const struct ggml_task_profile *profile = &shape->profiles[ip]; + + memcpy(&node->task_profile, profile, + sizeof(struct ggml_task_profile)); + + struct ggml_mulmat_tune_m *item = + &shape->items[ip * shape->m_num + i_m]; + item->M = M; + + int min[3] = {INT32_MAX, INT32_MAX, INT32_MAX}; + + for (int k = 0; k < params->n_pass; k++) { + for (int j = 0; j < 3; j++) { + stages_time[j] = 0; + } + + /*enum ggml_compute_error err = */ + ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize); + + for (int i = 0; i < 3; i++) { + int v = (int)stages_time[i]; + if (v < min[i]) { + min[i] = v; + } + } + + if (params->progress) { + 
fprintf(stdout, "."); + fflush(stdout); + line_len++; + } + } + for (int i = 0; i < 3; i++) { + item->stages_time[i] = min[i]; + } + } + + ggml_free(ctx); + free(wdata); + + if (params->progress) { + line_len += 10; + for (int j = 0; j < line_len; j++) { + fprintf(stdout, "\b \b"); + } + fflush(stdout); + } + } + } + + ggml_threading_stop(thrd_ctx); + + fprintf(stdout, "[mulmat tune] done, elapsed time: %d seconds.\n", + (int)(ggml_time_ms() - t0) / 1000); + + // output + + if (params->fname && strcmp(params->fname, "") != 0) { + FILE *fp = fopen(params->fname, "w"); + if (!fp) { + fprintf(stderr, + "[mulmat tune] warn: failed to open file `%s`, print to " + "console instead\n\n", + params->fname); + params->output_console = 1; + } else { + ok = ggml_mulmat_tune_write_data(tune, fp); + fclose(fp); + + if (ok) { + fprintf(stdout, "[mulmat tune] data was written to `%s`\n", + params->fname); + } else { + fprintf( + stderr, + "[mulmat tune] warn: failed to write file `%s`, print to " + "console instead\n\n", + params->fname); + params->output_console = 1; + } + } + } + + if (params->output_console) { + return ggml_mulmat_tune_write_data(tune, stdout); + } + + return true; +} diff --git a/ggml-tune.h b/ggml-tune.h new file mode 100644 index 0000000000000..404f1f1c4a53f --- /dev/null +++ b/ggml-tune.h @@ -0,0 +1,137 @@ +#pragma once + +#include +#include +#include + +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_MULMAT_TUNE_VERSION 8 +#define GGML_MULMAT_N_SHAPES 6 + +#define GGML_MULMAT_MAX_PASS 3 + +struct ggml_mulmat_tune_m { + int M; + + int stages_time[3]; +}; + +struct ggml_mulmat_tune_model { + const char *name; + + enum ggml_ftype ftype; + + int n_vocab; + + int n_embd; + + // n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult + int n_ff; + + // n_rot = n_embd/n_head; + int n_rot; +}; + +struct ggml_mulmat_tune_shape { + // For RoPE, one of N / K is 0. + int N; + int K; + + enum ggml_type src0_type; + enum ggml_type src1_type; + + int n_profiles; + struct ggml_task_profile *profiles; + + int m_num; + int *arr_m; + + struct ggml_mulmat_tune_m *items; +}; + +struct ggml_mulmat_tune { + int version; + + char model[16]; + + enum ggml_ftype ftype; + + int n_shapes; + // Given N/K, we bench for mul_mat [M,K] x [K,N]. + struct ggml_mulmat_tune_shape shapes[GGML_MULMAT_N_SHAPES]; + + int n_threads; +}; + +struct ggml_mulmat_tune_time { + struct ggml_task_profile *profile; + int stage_time[3]; + int total_time; +}; + +struct mm_cache_element { + int M; + int N; + int K; + struct ggml_task_profile *profile; + int stages_time[3]; +}; + +// params for tune/bench. +struct ggml_mulmat_tune_params { + struct ggml_mulmat_tune_model model; + int m_num; + int n_pass; + int n_threads; + bool progress; // print and clear '.' + bool output_console; // also print result to console + const char *fname; +}; + +// NOTE: stages_time is filled if not null. 
+const struct ggml_task_profile * +ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M, + int N, int K, enum ggml_type src0_t, + enum ggml_type src1_t, int stages_time[3]); + +bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune, + const char *model_name, int ftype, + int n_threads); + +void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model, + const char *name, enum ggml_ftype ftype); + +bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune, + struct ggml_mulmat_tune_params *params, + struct ggml_task_profile_factory *profile_factory); + +void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune); + +bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, FILE *fp); + +bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp); + +const struct ggml_mulmat_tune_shape * +ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, int N, int K, + enum ggml_type src0_type, enum ggml_type src1_type); + +void ggml_mulmat_tune_estimate_time(const struct ggml_mulmat_tune_shape *shape, + int M, + struct ggml_mulmat_tune_time *profile_time); + +const char *ggml_task_backend_name(enum ggml_task_backend backend); + +int ggml_mulmat_tune_get_builtin_task_backends( + enum ggml_task_backend *backends); + +bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, + struct ggml_mulmat_tune_params *params); + +#ifdef __cplusplus +} +#endif diff --git a/ggml.c b/ggml.c index 78c3653543c88..5d0b83b1de198 100644 --- a/ggml.c +++ b/ggml.c @@ -61,26 +61,6 @@ static LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) { return atomic_fetch_add(ptr, -(dec)); } -typedef HANDLE pthread_t; - -typedef DWORD thread_ret_t; -static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) { - (void) unused; - HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL); - if (handle == NULL) - { - return EAGAIN; - } - - *out = handle; - return 0; -} - -static int pthread_join(pthread_t thread, void* unused) { - (void) unused; - return (int) WaitForSingleObject(thread, INFINITE); -} - static int sched_yield (void) { Sleep (0); return 0; @@ -88,8 +68,6 @@ static int sched_yield (void) { #else #include #include - -typedef void* thread_ret_t; #endif // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 @@ -166,6 +144,12 @@ inline static void* ggml_aligned_malloc(size_t size) { #include "ggml-opencl.h" #endif +#if defined(GGML_USE_MULMAT_TUNE) + #include "ggml-tune.h" +#endif + +#include "ggml-threading.h" + #undef MIN #undef MAX #define MIN(a, b) ((a) < (b) ? 
(a) : (b)) @@ -4059,6 +4043,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { ggml_cl_init(); #endif + ggml_mulmat_init_task_profiles(); + is_first_call = false; } @@ -4302,7 +4288,7 @@ struct ggml_tensor * ggml_new_tensor_impl( /*.src0 =*/ NULL, /*.src1 =*/ NULL, /*.opt =*/ { NULL }, - /*.n_tasks =*/ 0, + /*.task_profile =*/ { 0 }, /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, /*.perf_time_us =*/ 0, @@ -8516,14 +8502,19 @@ static void ggml_compute_forward_mul_f32( const int ith = params->ith; const int nth = params->nth; + enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; + if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { #ifdef GGML_USE_CLBLAST - if (src1->backend == GGML_BACKEND_GPU) { - if (ith == 0) { - ggml_cl_mul(src0, src1, dst); + if (src1->backend == GGML_BACKEND_GPU) { + if (ith == 0) { + ggml_cl_mul(src0, src1, dst); + } + return; } - return; - } +#else + GGML_ASSERT(false); #endif + }; const int64_t nr = ggml_nrows(src0); @@ -9950,36 +9941,6 @@ static void ggml_compute_forward_rms_norm_back( } -// ggml_compute_forward_mul_mat - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) -// helper function to determine if it is better to use BLAS or not -// for large matrices, BLAS is faster -static bool ggml_compute_forward_mul_mat_use_blas( - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { - //const int64_t ne00 = src0->ne[0]; - //const int64_t ne01 = src0->ne[1]; - - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - - // TODO: find the optimal values for these - if (ggml_is_contiguous(src0) && - ggml_is_contiguous(src1) && - (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { - - /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ - return true; - } - - return false; -} -#endif - static void ggml_compute_forward_mul_mat_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -10050,28 +10011,25 @@ static void ggml_compute_forward_mul_mat_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows + enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; + + if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { #if defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - } + GGML_ASSERT(params->nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); return; - } +#else + GGML_ASSERT(false); #endif + } -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { - if (params->ith != 0) { - return; - } - - if (params->type == GGML_TASK_INIT) { - return; - } + GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); - if (params->type == GGML_TASK_FINALIZE) { - return; - } + if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + GGML_ASSERT(params->nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { @@ -10089,16 +10047,13 @@ static void ggml_compute_forward_mul_mat_f32( //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); return; - } +#else + GGML_ASSERT(false); 
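+        // a CPU_BLAS task profile must never be selected in a build without BLAS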
#endif - - if (params->type == GGML_TASK_INIT) { - return; } - if (params->type == GGML_TASK_FINALIZE) { - return; - } + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + GGML_ASSERT(comp_backend == GGML_TASK_BACKEND_CPU); // parallelize by src0 rows using ggml_vec_dot_f32 @@ -10215,30 +10170,26 @@ static void ggml_compute_forward_mul_mat_f16_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows + enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; + + if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { #if defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - } + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); return; - } +#else + GGML_ASSERT(false); #endif + } + + enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend; + GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); + if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { GGML_ASSERT(nb10 == sizeof(float)); - - if (params->ith != 0) { - return; - } - - if (params->type == GGML_TASK_INIT) { - return; - } - - if (params->type == GGML_TASK_FINALIZE) { - return; - } + GGML_ASSERT(params->nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { @@ -10271,8 +10222,14 @@ static void ggml_compute_forward_mul_mat_f16_f32( /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/ return; - } +#else + GGML_ASSERT(false); #endif + } + + GGML_ASSERT(params->type == GGML_TASK_INIT || params->type == GGML_TASK_COMPUTE); + GGML_ASSERT(init_backend == GGML_TASK_BACKEND_CPU); + GGML_ASSERT(comp_backend == GGML_TASK_BACKEND_CPU); if (params->type == GGML_TASK_INIT) { ggml_fp16_t * const wdata = params->wdata; @@ -10293,9 +10250,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( return; } - if (params->type == GGML_TASK_FINALIZE) { - return; - } + GGML_ASSERT (params->type == GGML_TASK_COMPUTE); // fp16 -> half the size, so divide by 2 // TODO: do not support transposed src1 @@ -10420,50 +10375,62 @@ static void ggml_compute_forward_mul_mat_q_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows + enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; + + if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { #if defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - } + GGML_ASSERT(params->nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); return; - } +#else + GGML_ASSERT(false); #endif + } + + enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend; + GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); + if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { - if (params->ith != 0) { - return; - } + GGML_ASSERT (init_backend == GGML_TASK_BACKEND_CPU); + GGML_ASSERT(params->type == 
GGML_TASK_INIT || params->type == GGML_TASK_COMPUTE); + GGML_ASSERT(src0->data); + GGML_ASSERT(params->wdata); - if (params->type == GGML_TASK_INIT) { - return; - } + float * const wdata = params->wdata; + dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_INIT) { + // rows per thread + const int dr = (ne01 + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + int ir1 = MIN(ir0 + dr, ne01); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + char * data0_offset = (char *) src0->data + i03*nb03 + i02*nb02; + float * wdata_offset = wdata + i03*ne03 + i02*ne02; + for (int64_t i = ir0; i < ir1; ++i) { + dequantize_row_q(data0_offset + i*nb01, wdata_offset + i*ne00, ne00); + } + } + } return; } - float * const wdata = params->wdata; - dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; + GGML_ASSERT(nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - { - size_t id = 0; - for (int64_t i01 = 0; i01 < ne01; ++i01) { - dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00); - id += ne00; - } - - assert(id*sizeof(float) <= params->wsize); - } - + // zT = y * xT const float * x = wdata; - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, ne11, ne01, ne10, 1.0f, y, ne10, @@ -10472,13 +10439,19 @@ static void ggml_compute_forward_mul_mat_q_f32( } } - //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); - return; - } +#else + GGML_ASSERT(false); #endif + } + + GGML_ASSERT(params->type == GGML_TASK_INIT || params->type == GGML_TASK_COMPUTE); + GGML_ASSERT(init_backend == GGML_TASK_BACKEND_CPU); + GGML_ASSERT(comp_backend == GGML_TASK_BACKEND_CPU); if (params->type == GGML_TASK_INIT) { + GGML_ASSERT(params->nth == 1); + char * wdata = params->wdata; const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type]; @@ -10490,13 +10463,10 @@ static void ggml_compute_forward_mul_mat_q_f32( } } } - return; } - if (params->type == GGML_TASK_FINALIZE) { - return; - } + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); // parallelize by src0 rows using ggml_vec_dot_q @@ -14324,20 +14294,31 @@ static void ggml_compute_forward_cross_entropy_loss_back( } } - ///////////////////////////////// -static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { +static enum ggml_compute_error ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { GGML_ASSERT(params); -#ifdef GGML_USE_CUBLAS - bool skip_cpu = ggml_cuda_compute_forward(params, tensor); - if (skip_cpu) { - return; + enum ggml_task_backend comp_backend = tensor->task_profile.stages[GGML_TASK_COMPUTE].backend; + + if (comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { +#if defined(GGML_USE_CUBLAS) + bool skip_cpu = ggml_cuda_compute_forward(params, tensor); + if (skip_cpu) { + return GGML_COMPUTE_OK; + } + GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU); + return GGML_COMPUTE_FALLBACK; +#else + GGML_ASSERT(false); +#endif } - GGML_ASSERT(tensor->src0->backend == 
GGML_BACKEND_CPU); - GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU); -#endif // GGML_USE_CUBLAS + + // if (tensor->task_profile.stages[params->type].backend > GGML_TASK_BACKEND_CPU) { + // printf("mulmat: test fallback\n"); + // return GGML_COMPUTE_FALLBACK; + // } switch (tensor->op) { case GGML_OP_DUP: @@ -14585,6 +14566,15 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm GGML_ASSERT(false); } break; } + + return GGML_COMPUTE_OK; +} + +enum ggml_compute_error ggml_compute_forward_wrapper(struct ggml_compute_params *params, + struct ggml_tensor *tensor) { + // We call ggml_compute_forward because the CUDA mul_mat entry point + // was moved out of `ggml_compute_forward_mul_mat`. + return ggml_compute_forward(params, tensor); } //////////////////////////////////////////////////////////////////////////////// @@ -15480,6 +15470,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { /*.n_nodes =*/ 0, /*.n_leafs =*/ 0, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, + /*.tune =*/ NULL, /*.work_size =*/ 0, /*.work =*/ NULL, /*.nodes =*/ { NULL }, @@ -15533,175 +15524,288 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg return result; } -// -// thread data -// -// synchronization is done via busy loops -// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops -// +// ---- mulmat task profiles ---- -#ifdef __APPLE__ +static struct ggml_task_profile_factory default_task_profile_factory = {0}; -//#include -// -//typedef os_unfair_lock ggml_lock_t; -// -//#define ggml_lock_init(x) UNUSED(x) -//#define ggml_lock_destroy(x) UNUSED(x) -//#define ggml_lock_lock os_unfair_lock_lock -//#define ggml_lock_unlock os_unfair_lock_unlock -// -//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT +// TODO: thread unsafe. Should be initialized once. 
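+// For each src0/src1 type class, the entry at index 0 is the plain CPU
+// profile; BLAS and GPU profiles are appended only when the corresponding
+// backend is compiled in, so the number of profiles varies per build.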
+void ggml_mulmat_init_task_profiles(void) { + const size_t sz = sizeof(struct ggml_task_profile_factory); + memset(&default_task_profile_factory, 0, sz); -typedef int ggml_lock_t; + // f32 + { + struct ggml_task_profile *p = default_task_profile_factory.f32_f32; + int i = 0; -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) -#define ggml_lock_lock(x) UNUSED(x) -#define ggml_lock_unlock(x) UNUSED(x) + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].parallel = true; + i++; -#define GGML_LOCK_INITIALIZER 0 +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; + p[i].stages[1].wait = true; + i++; +#endif -typedef pthread_t ggml_thread_t; +#if defined(GGML_USE_CUBLAS) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; + p[i].stages[1].wait = true; + i++; +#elif defined(GGML_USE_CLBLAST) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; + p[i].stages[1].wait = true; + i++; +#endif + default_task_profile_factory.n_f32_f32 = i; + } -#define ggml_thread_create pthread_create -#define ggml_thread_join pthread_join + // f16 + { + struct ggml_task_profile *p = default_task_profile_factory.f16_f32; + int i = 0; -#else + p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].parallel = true; + i++; -//typedef pthread_spinlock_t ggml_lock_t; +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; + p[i].stages[1].wait = true; + i++; +#endif + +#if defined(GGML_USE_CUBLAS) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; + p[i].stages[1].wait = true; + i++; +#elif defined(GGML_USE_CLBLAST) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; + p[i].stages[1].wait = true; + i++; +#endif + default_task_profile_factory.n_f16_f32 = i; + } -//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE) -//#define ggml_lock_destroy pthread_spin_destroy -//#define ggml_lock_lock pthread_spin_lock -//#define ggml_lock_unlock pthread_spin_unlock + // qxx + { + struct ggml_task_profile *p = default_task_profile_factory.qxx_f32; + int i = 0; -typedef int ggml_lock_t; + p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].parallel = true; + i++; -#define ggml_lock_init(x) UNUSED(x) -#define ggml_lock_destroy(x) UNUSED(x) -#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64)) -#define ggml_lock_lock(x) _mm_pause() -#else -#define ggml_lock_lock(x) UNUSED(x) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[0].parallel = true; + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; + p[i].stages[1].wait = true; + i++; +#endif + +#if defined(GGML_USE_CUBLAS) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; + p[i].stages[1].wait = true; + i++; +#elif defined(GGML_USE_CLBLAST) + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; + p[i].stages[1].wait = true; + i++; #endif -#define ggml_lock_unlock(x) UNUSED(x) + default_task_profile_factory.n_qxx_f32 = i; + } +} -#define GGML_LOCK_INITIALIZER 0 +int ggml_mulmat_get_task_profiles(struct ggml_task_profile_factory *pf, + enum ggml_type src0_t, enum ggml_type src1_t, + struct ggml_task_profile **profiles) { + GGML_ASSERT(profiles); -typedef pthread_t ggml_thread_t; + if (pf == NULL) { + pf = &default_task_profile_factory; + } -#define ggml_thread_create pthread_create 
-#define ggml_thread_join pthread_join + GGML_ASSERT(src1_t == GGML_TYPE_F32); -#endif + if (src0_t == GGML_TYPE_F32) { + *profiles = pf->f32_f32; + return pf->n_f32_f32; + } -struct ggml_compute_state_shared { - ggml_lock_t spin; + if (src0_t == GGML_TYPE_F16) { + *profiles = pf->f16_f32; + return pf->n_f16_f32; + } - int n_threads; + if (ggml_is_quantized(src0_t)) { + *profiles = pf->qxx_f32; + return pf->n_qxx_f32; + } - // synchronization primitives - atomic_int n_ready; - atomic_bool has_work; - atomic_bool stop; // stop all threads -}; + GGML_ASSERT(false); +} + +static const struct ggml_task_profile * +ggml_mulmat_get_default_task_profile(struct ggml_task_profile_factory *pf, + enum ggml_type src0_type, + enum ggml_type src1_type) { + GGML_ASSERT(src1_type == GGML_TYPE_F32); + if (pf == NULL) { + pf = &default_task_profile_factory; + } -struct ggml_compute_state { - ggml_thread_t thrd; + struct ggml_task_profile *p = NULL; - struct ggml_compute_params params; - struct ggml_tensor * node; + if (src0_type == GGML_TYPE_F32) { + p = &pf->f32_f32[0]; + } else if (src0_type == GGML_TYPE_F16) { + p = &pf->f16_f32[0]; + } else if (ggml_is_quantized(src0_type)) { + p = &pf->qxx_f32[0]; + } else { + GGML_ASSERT(false); + } - struct ggml_compute_state_shared * shared; -}; + for (int i = 0; i < 3; i++) { + GGML_ASSERT(p->stages[i].backend == GGML_TASK_BACKEND_CPU || + p->stages[i].backend == GGML_TASK_BACKEND_NONE); + } -static thread_ret_t ggml_graph_compute_thread(void * data) { - struct ggml_compute_state * state = (struct ggml_compute_state *) data; + return p; +} - const int n_threads = state->shared->n_threads; +// Set task profile for GGML_OP_MUL_MAT or GGML_OP_OUT_PROD. +static void ggml_mulmat_set_tensor_task_profile(struct ggml_tensor *node, + struct ggml_mulmat_tune *tune) { + GGML_ASSERT(node); + GGML_ASSERT(node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_OUT_PROD); - while (true) { - if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) { - atomic_store(&state->shared->has_work, false); - } else { - while (atomic_load(&state->shared->has_work)) { - if (atomic_load(&state->shared->stop)) { - return 0; - } - ggml_lock_lock (&state->shared->spin); - ggml_lock_unlock(&state->shared->spin); - } - } + enum ggml_type src0_t = node->src0->type; + enum ggml_type src1_t = node->src1->type; + + // Type and memory layout requirements for computing mul_mat with BLAS. + bool cond_match = (src0_t == GGML_TYPE_F32 || src0_t == GGML_TYPE_F16 || + ggml_is_quantized(src0_t)) && + src1_t == GGML_TYPE_F32 && node->type == GGML_TYPE_F32 && + ggml_is_contiguous(node->src0) && + ggml_is_contiguous(node->src1); + + int M = (int)node->ne[1]; + int N = (int)node->ne[0]; + int K = (int)node->src1->ne[0]; + + struct ggml_task_profile *profiles = NULL; + int n_profiles = ggml_mulmat_get_task_profiles(NULL, src0_t, src1_t, &profiles); + GGML_ASSERT(n_profiles >= 2); + GGML_ASSERT(profiles); - atomic_fetch_sub(&state->shared->n_ready, 1); + const struct ggml_task_profile *prof = NULL; - // wait for work - while (!atomic_load(&state->shared->has_work)) { - if (atomic_load(&state->shared->stop)) { - return 0; + if (cond_match) { +#if defined(GGML_USE_MULMAT_TUNE) + if (tune != NULL) { + int stages_time_us[3]; + prof = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, src1_t, stages_time_us); + if (prof != NULL) { + GGML_ASSERT(prof); + memcpy(&node->task_profile, prof, sizeof(struct ggml_task_profile)); + // Do not wait if the estimated execution time is too small (e.g. 
less than 0.1 ms) + // TODO: need bench actual wait/notify time, see ggml-threading.c + for (int i = 0; i < 3; i++) { + if (node->task_profile.stages[i].wait) { + if (stages_time_us[i] < 100) { + node->task_profile.stages[i].wait = false; + } + } + } + return; } - ggml_lock_lock (&state->shared->spin); - ggml_lock_unlock(&state->shared->spin); } +#else + UNUSED(tune); +#endif - // check if we should stop - if (atomic_load(&state->shared->stop)) { - break; - } + if (prof == NULL && M >= 32 && N >= 32 && K >= 32) { + for (int j = 0; j < n_profiles; j++) { + enum ggml_task_backend comp_be = + profiles[j].stages[GGML_TASK_COMPUTE].backend; - if (state->node) { - if (state->params.ith < state->params.nth) { - ggml_compute_forward(&state->params, state->node); + switch (comp_be) { + case GGML_TASK_BACKEND_GPU_CUDA: { + GGML_ASSERT(ggml_cpu_has_cublas()); + prof = &profiles[j]; + break; + } + case GGML_TASK_BACKEND_GPU_CL: { + GGML_ASSERT(ggml_cpu_has_clblast()); + prof = &profiles[j]; + break; + } + case GGML_TASK_BACKEND_CPU_BLAS: { + GGML_ASSERT(ggml_cpu_has_cpublas()); + prof = &profiles[j]; + break; + } + default: { + break; + } + } } - - state->node = NULL; - } else { - break; } } - return 0; + if (prof == NULL) { + prof = ggml_mulmat_get_default_task_profile(NULL, src0_t, src1_t); + } + + GGML_ASSERT(prof); + memcpy(&node->task_profile, prof, sizeof(struct ggml_task_profile)); } void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { - const int n_threads = cgraph->n_threads; - - struct ggml_compute_state_shared state_shared = { - /*.spin =*/ GGML_LOCK_INITIALIZER, - /*.n_threads =*/ n_threads, - /*.n_ready =*/ 0, - /*.has_work =*/ false, - /*.stop =*/ false, - }; - struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL; - - // create thread pool - if (n_threads > 1) { - ggml_lock_init(&state_shared.spin); - - atomic_store(&state_shared.has_work, true); - - for (int j = 0; j < n_threads - 1; j++) { - workers[j] = (struct ggml_compute_state) { - .thrd = 0, - .params = { - .type = GGML_TASK_COMPUTE, - .ith = j + 1, - .nth = n_threads, - .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0, - .wdata = cgraph->work ? cgraph->work->data : NULL, - }, - .node = NULL, - .shared = &state_shared, - }; + int n_threads = cgraph->n_threads; - int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); - GGML_ASSERT(rc == 0); - UNUSED(rc); + if (ggml_cpu_has_blas()) { + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor *node = cgraph->nodes[i]; + + memset(&node->task_profile, 0, sizeof(struct ggml_task_profile)); + struct ggml_task_stage *stages = node->task_profile.stages; + + // Adapt node->backend: assume GPU at COMPUTE stage. 
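+            // Offloaded tensors get a single, non-parallel GPU COMPUTE stage;
+            // INIT/FINALIZE are left empty, and idle workers are hinted to
+            // wait only for MUL and MUL_MAT.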
+ if (node->backend > GGML_BACKEND_CPU) { + stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_NONE; + stages[GGML_TASK_FINALIZE].backend = GGML_TASK_BACKEND_NONE; + + stages[GGML_TASK_COMPUTE].parallel = false; + bool wait = (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL); + stages[GGML_TASK_COMPUTE].wait = wait; + if (ggml_cpu_has_cublas()) { + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_GPU_CUDA; + } else if (ggml_cpu_has_clblast()) { + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_GPU_CL; + } else { + GGML_ASSERT(false); + } + } else if (node->op == GGML_OP_MUL_MAT) { + struct ggml_mulmat_tune * tune = NULL; +#if defined(GGML_USE_MULMAT_TUNE) + tune = cgraph->tune; +#endif + ggml_mulmat_set_tensor_task_profile(node, tune); + } else if (node->op == GGML_OP_OUT_PROD) { + ggml_mulmat_set_tensor_task_profile(node, NULL); + } } } + struct ggml_threading_context *thrd_ctx = ggml_threading_start( + n_threads, ggml_threading_graph_compute_thread, ggml_compute_forward, + GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL); + // initialize tasks + work buffer { size_t work_size = 0; @@ -15709,13 +15813,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) // thread scheduling for the different operations for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; + struct ggml_task_stage *stages = node->task_profile.stages; switch (node->op) { case GGML_OP_CPY: case GGML_OP_DUP: { - node->n_tasks = n_threads; - + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; size_t cur = 0; if (ggml_is_quantized(node->type)) { cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads; @@ -15726,7 +15830,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_ADD: case GGML_OP_ADD1: { - node->n_tasks = n_threads; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; size_t cur = 0; @@ -15738,7 +15843,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_ACC: { - node->n_tasks = n_threads; + stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; size_t cur = 0; @@ -15764,9 +15871,15 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_STEP: case GGML_OP_RELU: { - node->n_tasks = 1; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_MUL: + { + if (stages[GGML_TASK_COMPUTE].backend == GGML_TASK_BACKEND_NONE) { + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; + } + } break; case GGML_OP_GELU: case GGML_OP_SILU: case GGML_OP_SILU_BACK: @@ -15774,66 +15887,65 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM_BACK: { - node->n_tasks = n_threads; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; } break; case GGML_OP_MUL_MAT: case GGML_OP_OUT_PROD: { - node->n_tasks = n_threads; - - // TODO: use different scheduling for different matrix sizes - //const int nr0 = ggml_nrows(node->src0); - //const int nr1 = ggml_nrows(node->src1); - - //node->n_tasks = MIN(n_threads, MAX(1, nr0/128)); - //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, node->n_tasks); - size_t cur = 0; - -#if 
defined(GGML_USE_CUBLAS) - if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning + enum ggml_task_backend comp_backend = stages[GGML_TASK_COMPUTE].backend; + GGML_ASSERT(comp_backend != GGML_TASK_BACKEND_NONE); + + // TODO: remove this check once we are sure `ggml_mulmat_set_tensor_task_profile()` is correct. + if ((comp_backend & GGML_TASK_BACKEND_GPU) || comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { + enum ggml_type src0_t = node->src0->type; + enum ggml_type src1_t = node->src1->type; + bool cond_match = (src0_t == GGML_TYPE_F32 || src0_t == GGML_TYPE_F16 || + ggml_is_quantized(src0_t)) && + src1_t == GGML_TYPE_F32 && node->type == GGML_TYPE_F32 && + ggml_is_contiguous(node->src0) && + ggml_is_contiguous(node->src1); + GGML_ASSERT(cond_match); } - else -#elif defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning + + if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { +#if defined(GGML_USE_CLBLAST) cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); - } - else +#else + GGML_ASSERT(false); #endif - if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning + } else if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { + GGML_ASSERT(ggml_cpu_has_cpublas()); + GGML_ASSERT(node->src1->type == GGML_TYPE_F32); + + if (node->src0->type == GGML_TYPE_F32) { + cur = 0; + } else if (node->src0->type == GGML_TYPE_F16) { // here we need memory just for single 2D matrix from src0 cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); + } else if (ggml_is_quantized(node->src0->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); } else { - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); + GGML_ASSERT(false); } -#else - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); -#endif - } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) { - cur = 0; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; + } else if (comp_backend == GGML_TASK_BACKEND_CPU || comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { + // We have to reseve buffer for CUDA because it may fallback to CPU. 
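+                        // (the fallback path re-runs the node with the default
+                        // CPU profile; see the GGML_COMPUTE_FALLBACK handling
+                        // later in this function)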
+ if (comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { + GGML_ASSERT(ggml_cpu_has_cublas()); } -#endif - } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } else -#endif - { + + GGML_ASSERT(node->src1->type == GGML_TYPE_F32); + + if (node->src0->type == GGML_TYPE_F32) { + cur = 0; + } else if (node->src0->type == GGML_TYPE_F16) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); + } else if (ggml_is_quantized(node->src0->type)) { const enum ggml_type type_q = quantize_fns[node->src0->type].vec_dot_type; cur = GGML_TYPE_SIZE[type_q]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[type_q]; + } else { + GGML_ASSERT(false); } } else { GGML_ASSERT(false); @@ -15843,9 +15955,14 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_SCALE: { - node->n_tasks = n_threads; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; } break; case GGML_OP_SET: + { + stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + } break; case GGML_OP_CONT: case GGML_OP_RESHAPE: case GGML_OP_VIEW: @@ -15856,7 +15973,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_DIAG: case GGML_OP_DIAG_MASK_ZERO: { - node->n_tasks = 1; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: @@ -15864,20 +15981,23 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: { - node->n_tasks = n_threads; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; } break; case GGML_OP_ALIBI: { - node->n_tasks = 1; //TODO + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_CLAMP: { - node->n_tasks = 1; //TODO + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_CONV_1D_1S: case GGML_OP_CONV_1D_2S: { - node->n_tasks = n_threads; + stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; GGML_ASSERT(node->src0->ne[3] == 1); GGML_ASSERT(node->src1->ne[2] == 1); @@ -15906,45 +16026,48 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_FLASH_ATTN: { - node->n_tasks = n_threads; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; size_t cur = 0; const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*ne11*n_threads; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_threads; // this is overestimated by x2 } if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*ne11*n_threads; // TODO: this can become 
(n_tasks-1) + cur += sizeof(float)*ne11*n_threads; // this is overestimated by x2 } work_size = MAX(work_size, cur); } break; case GGML_OP_FLASH_FF: { - node->n_tasks = n_threads; - + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; size_t cur = 0; if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*node->src1->ne[1]*n_threads; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*n_threads; // this is overestimated by x2 } if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*node->src1->ne[1]*n_threads; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*n_threads; // this is overestimated by x2 } work_size = MAX(work_size, cur); } break; case GGML_OP_FLASH_ATTN_BACK: { - node->n_tasks = n_threads; + stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; size_t cur = 0; @@ -15952,13 +16075,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*mxDn*n_threads; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_threads; // this is overestimated by x2 } if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*mxDn*n_threads; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_threads; // this is overestimated by x2 } work_size = MAX(work_size, cur); @@ -15966,32 +16089,38 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: { - node->n_tasks = 1; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_CROSS_ENTROPY_LOSS: { - node->n_tasks = n_threads; + stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; + stages[GGML_TASK_FINALIZE].backend = GGML_TASK_BACKEND_CPU; - size_t cur = ggml_type_size(node->type)*(node->n_tasks + node->src0->ne[0]*node->n_tasks); + size_t cur = ggml_type_size(node->type)*(n_threads + node->src0->ne[0]*n_threads); work_size = MAX(work_size, cur); } break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { - node->n_tasks = n_threads; + stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; - size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*node->n_tasks; + size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_threads; work_size = MAX(work_size, cur); } break; case GGML_OP_NONE: { - node->n_tasks = 1; + 
stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_COUNT: { GGML_ASSERT(false); } break; + default: + GGML_ASSERT(false); } } @@ -16023,126 +16152,27 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) const int64_t perf_node_start_cycles = ggml_perf_cycles(); const int64_t perf_node_start_time_us = ggml_perf_time_us(); - // INIT - struct ggml_compute_params params = { - /*.type =*/ GGML_TASK_INIT, - /*.ith =*/ 0, - /*.nth =*/ node->n_tasks, - /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, - /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, - }; - - ggml_compute_forward(¶ms, node); - - // COMPUTE - if (node->n_tasks > 1) { - if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { - atomic_store(&state_shared.has_work, false); - } - - while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - // launch thread pool - for (int j = 0; j < n_threads - 1; j++) { - workers[j].params = (struct ggml_compute_params) { - .type = GGML_TASK_COMPUTE, - .ith = j + 1, - .nth = node->n_tasks, - .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0, - .wdata = cgraph->work ? cgraph->work->data : NULL, - }; - workers[j].node = node; - } - - atomic_fetch_sub(&state_shared.n_ready, 1); - - while (atomic_load(&state_shared.n_ready) > 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - atomic_store(&state_shared.has_work, true); + // TODO: can be moved out of loop? + void *wdata = NULL; + size_t wsize = 0; + if (cgraph->work) { + wdata = cgraph->work->data; + wsize = ggml_nbytes(cgraph->work); } - params.type = GGML_TASK_COMPUTE; - ggml_compute_forward(¶ms, node); - - // wait for thread pool - if (node->n_tasks > 1) { - if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { - atomic_store(&state_shared.has_work, false); - } - - while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - atomic_fetch_sub(&state_shared.n_ready, 1); - - while (atomic_load(&state_shared.n_ready) != 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - } - - // FINALIZE - if (node->n_tasks > 1) { - if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { - atomic_store(&state_shared.has_work, false); - } - - while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - // launch thread pool - for (int j = 0; j < n_threads - 1; j++) { - workers[j].params = (struct ggml_compute_params) { - .type = GGML_TASK_FINALIZE, - .ith = j + 1, - .nth = node->n_tasks, - .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0, - .wdata = cgraph->work ? 
cgraph->work->data : NULL, - }; - workers[j].node = node; - } - - atomic_fetch_sub(&state_shared.n_ready, 1); - - while (atomic_load(&state_shared.n_ready) > 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - atomic_store(&state_shared.has_work, true); - } - - params.type = GGML_TASK_FINALIZE; - ggml_compute_forward(¶ms, node); - - // wait for thread pool - if (node->n_tasks > 1) { - if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { - atomic_store(&state_shared.has_work, false); - } - - while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - atomic_fetch_sub(&state_shared.n_ready, 1); - - while (atomic_load(&state_shared.n_ready) != 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); + enum ggml_compute_error err = + ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize); + if (err == GGML_COMPUTE_FALLBACK) { + if (node->op == GGML_OP_MUL_MAT) { + const struct ggml_task_profile *p = + ggml_mulmat_get_default_task_profile( + NULL, node->src0->type, node->src1->type); + memcpy(&node->task_profile, p, + sizeof(struct ggml_task_profile)); } + err = ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize); } + GGML_ASSERT(err == GGML_COMPUTE_OK); // performance stats (node) { @@ -16155,19 +16185,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } } - // join thread pool - if (n_threads > 1) { - atomic_store(&state_shared.stop, true); - atomic_store(&state_shared.has_work, true); - - for (int j = 0; j < n_threads - 1; j++) { - int rc = ggml_thread_join(workers[j].thrd, NULL); - GGML_ASSERT(rc == 0); - UNUSED(rc); - } - - ggml_lock_destroy(&state_shared.spin); - } + ggml_threading_stop(thrd_ctx); // performance stats (graph) { @@ -16242,7 +16260,7 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char tensor->n_dims, ne[0], ne[1], ne[2], ne[3], nb[0], nb[1], nb[2], nb[3], - tensor->n_tasks, + tensor->task_profile.stages[0].parallel, // replaceed n_tasks. 
tensor->data, tensor->name); } @@ -18024,24 +18042,24 @@ int ggml_cpu_has_wasm_simd(void) { #endif } -int ggml_cpu_has_blas(void) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +int ggml_cpu_has_cublas(void) { +#if defined(GGML_USE_CUBLAS) return 1; #else return 0; #endif } -int ggml_cpu_has_cublas(void) { -#if defined(GGML_USE_CUBLAS) +int ggml_cpu_has_clblast(void) { +#if defined(GGML_USE_CLBLAST) return 1; #else return 0; #endif } -int ggml_cpu_has_clblast(void) { -#if defined(GGML_USE_CLBLAST) +int ggml_cpu_has_cpublas(void) { +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) return 1; #else return 0; @@ -18052,6 +18070,10 @@ int ggml_cpu_has_gpublas(void) { return ggml_cpu_has_cublas() || ggml_cpu_has_clblast(); } +int ggml_cpu_has_blas(void) { + return ggml_cpu_has_cpublas() || ggml_cpu_has_gpublas(); +} + int ggml_cpu_has_sse3(void) { #if defined(__SSE3__) return 1; diff --git a/ggml.h b/ggml.h index 1380c530fdae8..f51b658fd3abe 100644 --- a/ggml.h +++ b/ggml.h @@ -1,5 +1,7 @@ #pragma once + + // // GGML Tensor Library // @@ -200,6 +202,7 @@ #define GGML_MAX_OPT 4 #define GGML_MAX_NAME 32 #define GGML_DEFAULT_N_THREADS 4 +#define GGML_MAX_TASK_PROFILES 8 #define GGML_ASSERT(x) \ do { \ @@ -347,7 +350,6 @@ extern "C" { GGML_OP_COUNT, }; - // ggml object struct ggml_object { size_t offs; @@ -360,6 +362,54 @@ extern "C" { static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); + // As part of task config profile solution, `ggml_task_backend` defines + // backends for each task stage. Similar to `ggml_tensor.backend`, + // `ggml_tensor.task_profile` generalizes how to configure tensor computing + // at per task-stage level. + // + // The following enum values are designed as combination of hardware and + // optional software interface. + enum ggml_task_backend { + GGML_TASK_BACKEND_NONE = 0, + + // [0x10, 0x1F]: CPU + GGML_TASK_BACKEND_CPU = 0x10, + GGML_TASK_BACKEND_CPU_BLAS = 0x11, + + // [0x20 - 0x2F]: GPU + GGML_TASK_BACKEND_GPU = 0x20, + GGML_TASK_BACKEND_GPU_CUDA = 0x21, + GGML_TASK_BACKEND_GPU_CL = 0x22, + }; + + // config for computing one of the 3 task stages of a tensor. + struct ggml_task_stage { + enum ggml_task_backend backend; + bool parallel; + // hint idle workers go waiting, valid only when parallel is false. + bool wait; + }; + + // config for computing a tensor. + struct ggml_task_profile { + // index 0: INIT, 1: COMPUTE, 2: FINALIZE + struct ggml_task_stage stages[3]; + + // MUST be used only in testing codes. + uint8_t dev_flags[4]; + }; + + struct ggml_task_profile_factory { + struct ggml_task_profile f32_f32[GGML_MAX_TASK_PROFILES]; + int n_f32_f32; + + struct ggml_task_profile f16_f32[GGML_MAX_TASK_PROFILES]; + int n_f16_f32; + + struct ggml_task_profile qxx_f32[GGML_MAX_TASK_PROFILES]; + int n_qxx_f32; + }; + // n-dimensional tensor struct ggml_tensor { enum ggml_type type; @@ -383,7 +433,8 @@ extern "C" { struct ggml_tensor * opt[GGML_MAX_OPT]; // thread scheduling - int n_tasks; + + struct ggml_task_profile task_profile; // performance int perf_runs; @@ -396,7 +447,7 @@ extern "C" { void * extra; // extra things e.g. 
for ggml-cuda.cu - char padding[4]; + char padding[12]; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); @@ -407,6 +458,8 @@ extern "C" { int n_leafs; int n_threads; + struct ggml_mulmat_tune *tune; + size_t work_size; struct ggml_tensor * work; @@ -1287,9 +1340,21 @@ extern "C" { GGML_API int ggml_cpu_has_cublas (void); GGML_API int ggml_cpu_has_clblast (void); GGML_API int ggml_cpu_has_gpublas (void); + GGML_API int ggml_cpu_has_cpublas (void); GGML_API int ggml_cpu_has_sse3 (void); GGML_API int ggml_cpu_has_vsx (void); + // + // mulmat task profiles + // + GGML_API void ggml_mulmat_init_task_profiles(void); + + GGML_API int ggml_mulmat_get_task_profiles( + struct ggml_task_profile_factory *pf, + enum ggml_type src0_t, + enum ggml_type src1_t, + struct ggml_task_profile **profiles); + // // Internal types and functions exposed for tests and benchmarks // diff --git a/llama.cpp b/llama.cpp index c165d3239e63f..fa5a94e21f0ab 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #endif #include "llama-util.h" @@ -20,6 +21,10 @@ #include "ggml-metal.h" #endif +#ifdef GGML_USE_MULMAT_TUNE +#include "ggml-tune.h" +#endif + #include #include #include @@ -280,6 +285,10 @@ struct llama_context { int buf_last = 0; size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; +#ifdef GGML_USE_MULMAT_TUNE + struct ggml_mulmat_tune *tune = nullptr; +#endif + void use_buf(struct ggml_context * ctx, int i) { #if defined(LLAMA_USE_SCRATCH) size_t last_size = 0; @@ -1396,10 +1405,12 @@ static bool llama_eval_internal( struct ggml_context * ctx0 = ggml_init(params); - // for big prompts, if BLAS is enabled, it is better to use only one thread - // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance ggml_cgraph gf = {}; - gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 
1 : n_threads; + gf.n_threads = n_threads; + +#ifdef GGML_USE_MULMAT_TUNE + gf.tune =lctx.tune; +#endif struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); ggml_set_name(embd, "embd"); @@ -2732,7 +2743,150 @@ struct llama_context * llama_init_from_file( return ctx; } +#ifdef GGML_USE_MULMAT_TUNE +bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) { + printf("\n"); + if (ctx->model.n_gpu_layers != 0) { + fprintf(stderr, "[mulmat tune] error: is disabled by GPU offloading\n"); + return false; + } + + const char *model_name = llama_model_type_name(ctx->model.type); + + llama_hparams *hparams = &ctx->model.hparams; + + enum ggml_ftype ggml_ftype; + switch (hparams->ftype) { + case LLAMA_FTYPE_ALL_F32: + ggml_ftype = GGML_FTYPE_ALL_F32; + break; + case LLAMA_FTYPE_MOSTLY_F16: + ggml_ftype = GGML_FTYPE_MOSTLY_F16; + break; + case LLAMA_FTYPE_MOSTLY_Q4_0: + ggml_ftype = GGML_FTYPE_MOSTLY_Q4_0; + break; + case LLAMA_FTYPE_MOSTLY_Q4_1: + ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1; + break; + case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: + ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1_SOME_F16; + break; + case LLAMA_FTYPE_MOSTLY_Q5_0: + ggml_ftype = GGML_FTYPE_MOSTLY_Q5_0; + break; + case LLAMA_FTYPE_MOSTLY_Q5_1: + ggml_ftype = GGML_FTYPE_MOSTLY_Q5_1; + break; + case LLAMA_FTYPE_MOSTLY_Q8_0: + ggml_ftype = GGML_FTYPE_MOSTLY_Q8_0; + break; + case LLAMA_FTYPE_MOSTLY_Q2_K: + ggml_ftype = GGML_FTYPE_MOSTLY_Q2_K; + break; + case LLAMA_FTYPE_MOSTLY_Q3_K_S: + case LLAMA_FTYPE_MOSTLY_Q3_K_M: + case LLAMA_FTYPE_MOSTLY_Q3_K_L: + ggml_ftype = GGML_FTYPE_MOSTLY_Q3_K; + break; + case LLAMA_FTYPE_MOSTLY_Q4_K_S: + case LLAMA_FTYPE_MOSTLY_Q4_K_M: + ggml_ftype = GGML_FTYPE_MOSTLY_Q4_K; + break; + case LLAMA_FTYPE_MOSTLY_Q5_K_S: + case LLAMA_FTYPE_MOSTLY_Q5_K_M: + ggml_ftype = GGML_FTYPE_MOSTLY_Q5_K; + break; + case LLAMA_FTYPE_MOSTLY_Q6_K: + ggml_ftype = GGML_FTYPE_MOSTLY_Q6_K; + break; + default: + throw std::runtime_error( + format("invalid output file type %d\n", hparams->ftype)); + } + + int n_vocab = hparams->n_vocab; + int n_embd = hparams->n_embd; + int n_rot = hparams->n_rot; + + int n_mult = hparams->n_mult; + int n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult; + + struct ggml_mulmat_tune_params params = { + /*.model =*/ { + /* .name =*/ model_name, + /* .ftype =*/ ggml_ftype, + /* .n_vocab =*/ n_vocab, + /* .n_embd =*/ n_embd, + /* .n_ff =*/ n_ff, + /* .n_rot =*/ n_rot, + }, + /* .m_num =*/ 8, + /* .n_pass =*/ 1, + /* .n_threads =*/ n_threads, + /* .prrogress =*/ true, + /* .output_console =*/ false, + /* .fname =*/ fname, + }; + + bool empty_fname = !fname || strcmp(fname, "") == 0; + + ctx->tune = new(struct ggml_mulmat_tune); + if (!ctx->tune) { + throw std::runtime_error(format("failed to allocate memory for tune\n")); + } + + if (tune) { + bool ok = ggml_mulmat_tune_bench(ctx->tune, ¶ms); + if (!ok) { + ggml_mulmat_tune_free(ctx->tune); + return false; + } + if (!empty_fname) { + ggml_mulmat_tune_free(ctx->tune); + return true; + } + } else { + if (empty_fname) { + return false; + } + } + + if (!empty_fname) { + FILE *fp = fopen(fname, "r"); + if (!fp) { + fprintf(stderr, "[mulmat tune] failed to open file %s.\n", + fname); + } else { + bool ok = ggml_mulmat_tune_read_data(ctx->tune, fp); + fclose(fp); + + if (!ok) { + fprintf(stderr, + "[mulmat tune] failed to read data from %s\n", + fname); + return false; + } + + fprintf(stderr, "[mulmat tune] loaded data from %s\n", fname); + + ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype, 
params.n_threads); + if (!ok) { + return false; + } + } + } + + return true; +} +#endif + void llama_free(struct llama_context * ctx) { +#ifdef GGML_USE_MULMAT_TUNE + if (ctx->tune) { + delete(ctx->tune); + } +#endif delete ctx; } diff --git a/llama.h b/llama.h index 1241ba6c0ec44..c3f6a21548a52 100644 --- a/llama.h +++ b/llama.h @@ -300,6 +300,9 @@ extern "C" { // Print system information LLAMA_API const char * llama_print_system_info(void); + // Experimental utility functionality for mulmat tunning. + LLAMA_API bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname); + #ifdef __cplusplus } #endif diff --git a/tests/.gitignore b/tests/.gitignore new file mode 100644 index 0000000000000..f4b8ee1b32633 --- /dev/null +++ b/tests/.gitignore @@ -0,0 +1,2 @@ +/test-ggml-threading +/test-ggml-tune diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4171c126c7b7d..977b8ef6db032 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -12,3 +12,5 @@ llama_add_test(test-sampling.cpp) llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin) # llama_add_test(test-grad0.c) # SLOW # llama_add_test(test-opt.c) # SLOW +llama_add_test(test-ggml-threading.c) +llama_add_test(test-ggml-tune.c) diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c new file mode 100644 index 0000000000000..0b47623e2d0c1 --- /dev/null +++ b/tests/test-ggml-threading.c @@ -0,0 +1,345 @@ +#include "ggml-threading.h" +#include "ggml.h" + +#include +#include +#include +#include + +// Purposes: +// 1. general overview of the threading behaviors. +// 2. race (dead lock) detection. + +// # build +// cd build +// +// # build release: +// cmake .. && cmake --build . --config Release +// +// # build with sanitize: +// cmake .. -DLLAMA_SANITIZE_THREAD=ON && cmake --build . --config Release +// +// # run: +// ./bin/test-ggml-threading + +// How to turn off the warning on Apple: malloc: nano zone abandoned due to +// inability to reserve vm space? +// ==> export MallocNanoZone=0, no need to rebuild. +// See `nano_init()` from +// https://opensource.apple.com/source/libmalloc/libmalloc-140.40.1/src/nano_malloc.c.auto.html + +// How to view the threading debug: +// ==> uncomment `#define GGML_THREADING_DEBUG 1` from file ggml-threading.c + +#define UNUSED(x) (void)(x) + +#define MAX_N_THREADS 16 + +static const int n_repeat = 10; + +// It's frustrating to use atomic with c11 on Windows, let's replace atomic +// counter with array. 
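For contrast, a minimal sketch of the single shared-counter variant this test avoids (hypothetical, not part of the patch): with C11 atomics the counter must be incremented atomically, which is awkward on Windows toolchains as noted above, whereas the per-thread array used below needs no synchronization because each worker only ever writes its own `work_done_arr[params->ith]` slot.

```
// Hypothetical single-counter variant using C11 atomics (not used here):
#include <stdatomic.h>

static atomic_int work_done_total;

static void count_work_done(void) {
    // relaxed ordering is enough for a pure statistics counter
    atomic_fetch_add_explicit(&work_done_total, 1, memory_order_relaxed);
}
```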
+static int work_done_arr[MAX_N_THREADS]; + +static enum ggml_compute_error +mock_task_runner(struct ggml_compute_params *params, struct ggml_tensor *node) { + int64_t loops = node->task_profile.dev_flags[1] * 1000 * 1000; + if (node->task_profile.stages[params->type].parallel) { + loops /= params->nth; + } + + volatile int64_t j = 0; + for (int i = 0; i < loops; i++) { + j++; + } + + UNUSED(j); + + work_done_arr[params->ith]++; + return GGML_COMPUTE_OK; +} + +int test_driver(int id, struct ggml_tensor *node, int n_threads) { + printf("\n[test-ggml-threading] #%d, n_threads: %d\n", id, n_threads); + + for (int i = 0; i < n_threads; i++) { + work_done_arr[i] = 0; + } + + bool wait_on_done = (node->task_profile.dev_flags[0] > 0u); + + enum ggml_threading_features features = GGML_THREADING_FEATURE_PERF; + if (wait_on_done) { + features |= GGML_THREADING_FEATURE_WAIT_ON_DONE; + } + + int t0 = (int)ggml_time_us(); + + struct ggml_threading_context *ctx = + ggml_threading_start(n_threads, ggml_threading_graph_compute_thread, + mock_task_runner, features, /*stages_time*/ NULL); + + int t1 = (int)ggml_time_us(); + + for (int i = 0; i < n_repeat; i++) { + enum ggml_compute_error err = ggml_threading_compute_tensor( + ctx, node, /*wdata*/ NULL, /*wsize*/ 0); + if (err != GGML_COMPUTE_OK) { + ggml_threading_stop(ctx); + fprintf(stderr, + "ggml_threading_compute_tensor failed with error: %d.\n", + err); + return 1; + } + } + + int t2 = (int)ggml_time_us(); + + ggml_threading_stop(ctx); + + int t3 = (int)ggml_time_us(); + + int expect = 0; + for (int i = 0; i < 3; i++) { + struct ggml_task_stage *ts = &node->task_profile.stages[i]; + if (ts->backend != GGML_TASK_BACKEND_NONE) { + if (ts->parallel) { + expect += n_threads; + } else { + expect++; + } + } + } + expect *= n_repeat; + + int actual = 0; + for (int i = 0; i < n_threads; i++) { + actual += work_done_arr[i]; + } + + uint8_t loops = node->task_profile.dev_flags[1]; + + printf("\tloops: %2d million(s), ---wait_on_done---: %d\n\tstage-0: " + "(parallel: %d, " + "wait: %d)\n" + "\tstage-1: (parallel: %d, wait: %d)\n", + loops, wait_on_done, node->task_profile.stages[0].parallel, + node->task_profile.stages[0].wait, + node->task_profile.stages[1].parallel, + node->task_profile.stages[1].wait); + + if (actual == expect) { + printf("\tthreading: init %6.3f ms, compute %6.3f ms, cleanup %6.3f " + "ms, total %6.3f ms\n", + 1.0 * (t1 - t0) / 1000, 1.0 * (t2 - t1) / 1000, + 1.0 * (t3 - t2) / 1000, 1.0 * (t3 - t0) / 1000); + return 0; + } + + fprintf(stderr, "\t== failed. expect %d done, actual %d done\n\n", expect, + actual); + + return 2; +} + +static enum ggml_compute_error +mock_task_runner_fallback(struct ggml_compute_params *params, + struct ggml_tensor *node) { + UNUSED(params); + if (node->backend == GGML_BACKEND_GPU) { + // ... finally failed to compute in GPU. + + node->backend = GGML_BACKEND_CPU; + return GGML_COMPUTE_FALLBACK; + } else { + return GGML_COMPUTE_OK; + } +} + +// By design, fallback should happen when attempt computing tensor in GPU, +// thus it is not parallelled. 
+int test_fallback(struct ggml_tensor *node) { + struct ggml_threading_context *ctx = ggml_threading_start( + 1, ggml_threading_graph_compute_thread, mock_task_runner_fallback, + /*features*/ GGML_THREADING_FEATURE_NONE, /*stages_time*/ NULL); + + enum ggml_compute_error err = + ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0); + if (err == GGML_COMPUTE_FALLBACK) { + err = ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, + /*wsize*/ 0); + } + + ggml_threading_stop(ctx); + if (err != GGML_COMPUTE_OK) { + fprintf(stderr, + "ggml_threading_compute_tensor failed with error: %d.\n", err); + return 1; + } + + return 0; +} + +int main(void) { + ggml_time_init(); + + struct ggml_tensor node; + memset(&node, 0, sizeof(struct ggml_tensor)); + + struct ggml_task_stage *stages = node.task_profile.stages; + + stages[0].backend = GGML_TASK_BACKEND_CPU; + stages[1].backend = GGML_TASK_BACKEND_CPU; + stages[2].backend = GGML_TASK_BACKEND_NONE; + + int n_passed = 0; + int n_tests = 0; + + int parallel[3] = {0, 1, 2}; + + // In github build actions (windows-latest-cmake and ubuntu-latest-cmake): + // When n_threads >= 4, the thread init time and compute time suddenly goes + // down to 100x ~ 1000x slow -- comparing to n_threads == 2. + // + // But the tests (n_threads 1, 2, 4, 6) looks sound on my devices: + // - MacBook air 2013, ubuntu 22.04 + // - MacBook pro 2018, macOS 13.4 + // + // So I assume the github build host has limited multi-cpu quota. + // Will skip computing when threading init time is too slow. + // + // NOTE: it's observed that when workload is 0 and n_threads >= number of + // physical cores: + // - the wait/wakeup time varies much: can be up to tens or hundreds of the + // average time, thus greatly punishes those small workloads. + // - wait_on_done is general faster than wait_now, can be 10x faster. + + int threads_arr[] = {1, 2, 4, 8}; + int threads_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]); + + // millions of loops. + uint8_t workload_arr[] = {0u, 1u, 10u}; + int workload_arr_len = sizeof(workload_arr) / sizeof(workload_arr[0]); + + // node.task_profile.dev_flags: byte 0 for wait_on_done, byte 1 for loops. + + for (int x = 0; x < workload_arr_len; x++) { + node.task_profile.dev_flags[1] = workload_arr[x]; + + for (int i = 0; i < threads_arr_len; i++) { + int n_threads = threads_arr[i]; + if (n_threads > MAX_N_THREADS) { + abort(); + } + + printf("\n[test-ggml-threading] ==== n_nodes: %d, n_threads: %d, " + "loops: %2d million(s) ====\n", + n_repeat, n_threads, workload_arr[x]); + + if (n_threads > 1) { // skip this n_threads when too slow. 
+ int t0 = (int)ggml_time_us(); + + struct ggml_threading_context *ctx = ggml_threading_start( + n_threads, ggml_threading_graph_compute_thread, + mock_task_runner, 0, /*stages_time*/ NULL); + + int t1 = (int)ggml_time_us(); + + ggml_threading_stop(ctx); + + int elapsed_us = t1 - t0; + if (elapsed_us > 500 * n_threads) { + fprintf(stderr, + "[test-ggml-threading] warning: it took took %.3f " + "ms to start %d worker thread(s).\n", + 1.0 * elapsed_us / 1000, n_threads - 1); + fprintf(stderr, "[test-ggml-threading] warning: looks like " + "the environment is too slow to run this " + "number of threads, skip.\n"); + continue; + } + } + + // multi-threads: parallel + wait_now/wait_on_done + + if (n_threads == 1) { + stages[0].parallel = false; + stages[1].parallel = false; + stages[0].wait = false; + stages[1].wait = false; + + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; + } + continue; + } + + for (int j = 0; j < 3; j++) { + stages[0].wait = false; + stages[1].wait = false; + node.task_profile.dev_flags[0] = 0u; + + if (parallel[j] == 0) { + stages[0].parallel = false; + stages[1].parallel = false; + + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; + } + } else if (parallel[j] == 1) { + stages[0].parallel = true; + stages[1].parallel = false; + + for (int k = 0; k < 2; k++) { + stages[1].wait = (k == 1); + + if (!stages[1].wait) { + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; + } + continue; + } + + // wait + + for (int m = 0; m < 2; m++) { + if (m == 1) { + node.task_profile.dev_flags[0] = 1u; + } + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; + } + node.task_profile.dev_flags[0] = 0u; + } + } + } else { + stages[0].parallel = true; + stages[1].parallel = true; + + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; + } + } + } + } + } + + { + ++n_tests; + + node.backend = GGML_BACKEND_GPU; + if (test_fallback(&node) == 0) { + ++n_passed; + printf("\n[test-ggml-threading] test fallback: ok\n\n"); + } + } + + printf("[test-ggml-threading] %d/%d passed.\n", n_passed, n_tests); + + return (n_passed == n_tests) ? 
0 : 1; +} diff --git a/tests/test-ggml-tune.c b/tests/test-ggml-tune.c new file mode 100644 index 0000000000000..ed612fff45562 --- /dev/null +++ b/tests/test-ggml-tune.c @@ -0,0 +1,200 @@ +#include "ggml-tune.h" +#include "ggml.h" + +#include + +static int bench(void); +static int estimate_time_non_zero_NK(void); + +static void init_params(struct ggml_mulmat_tune_params *params, int m_num) { + *params = (struct ggml_mulmat_tune_params){ + .model = + (struct ggml_mulmat_tune_model){ + .name = "3B", // fake + .ftype = GGML_FTYPE_MOSTLY_Q4_0, + .n_vocab = 4096, + .n_embd = 1024, + .n_ff = 2048, + .n_rot = 128, + }, + .m_num = m_num, + .n_pass = 1, + .n_threads = 1, + .progress = false, + .output_console = true, + .fname = NULL}; +} + +int main(void) { + int rv = bench(); + if (rv != 0) { + return rv; + } + + printf("\n"); + + rv = estimate_time_non_zero_NK(); + if (rv != 0) { + return rv; + } + printf("\n"); + + return 0; +} + +static int bench(void) { + printf("test: %s\n", __func__); + + { + enum ggml_task_backend backends[16]; + int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends); + if (n_backends < 2) { + printf("test: %s, skipped because no BLAS\n", __func__); + return 0; + } + } + + { + struct ggml_init_params init_params = { + /*.mem_size =*/1, + /*.mem_buffer =*/NULL, + /*.no_alloc =*/0, + }; + struct ggml_context *ctx = ggml_init(init_params); + GGML_ASSERT(ctx); + ggml_free(ctx); + } + + struct ggml_mulmat_tune tune; + + struct ggml_mulmat_tune_params params; + + init_params(¶ms, /*m_num*/ 4); + + bool ok = ggml_mulmat_tune_bench(&tune, ¶ms); + ggml_mulmat_tune_free(&tune); + + return ok ? 0 : 1; +} + +int estimate_time_non_zero_NK(void) { + printf("test: %s\n", __func__); + + struct test_data_t { + int M; + int time[3]; // 3 profiles. 
+ }; + + struct ggml_mulmat_tune tune = { + .version = 1, + .ftype = GGML_FTYPE_MOSTLY_Q4_0, + }; + + const int m_num = 2; + + struct ggml_task_profile_factory pf; + memset(&pf, 0, sizeof(struct ggml_task_profile_factory)); + + { + pf.n_qxx_f32 = 2; + pf.qxx_f32[0].stages[0].backend = GGML_TASK_BACKEND_CPU; + pf.qxx_f32[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + + pf.qxx_f32[1].stages[0].backend = GGML_TASK_BACKEND_CPU; + pf.qxx_f32[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; + } + + struct ggml_mulmat_tune_params params; + init_params(¶ms, m_num); + + ggml_mulmat_tune_init(&tune, ¶ms, &pf); + + struct ggml_mulmat_tune_shape *shape = NULL; + for (int i = 0; i < tune.n_shapes; i++) { + if (tune.shapes[i].N > 0 && tune.shapes[i].K > 0) { + shape = &tune.shapes[i]; + break; + } + } + GGML_ASSERT(shape); + GGML_ASSERT(shape->n_profiles == 2); + GGML_ASSERT(ggml_is_quantized(shape->src0_type)); + + printf("shape: N: %d, K: %d, n_profiles: %d\n", shape->N, shape->K, + shape->n_profiles); + + { + shape->items[0] = + (struct ggml_mulmat_tune_m){.M = 2, .stages_time = {2, 4, 0}}; + shape->items[1] = + (struct ggml_mulmat_tune_m){.M = 4, .stages_time = {4, 8, 0}}; + + shape->items[2] = + (struct ggml_mulmat_tune_m){.M = 2, .stages_time = {4, 4, 0}}; + shape->items[3] = + (struct ggml_mulmat_tune_m){.M = 4, .stages_time = {4, 4, 0}}; + } + + const struct test_data_t test_data[] = { + { + .M = 1, // out of range + .time = {3, 8}, + }, + { + .M = 2, + .time = {6, 8}, + }, + { + .M = 3, + .time = {9, 8}, + }, + { + .M = 4, + .time = {12, 8}, + }, + { + .M = 5, // out of range + .time = {15, 8}, + }, + }; + + int n_tests = (int)(sizeof(test_data) / sizeof(struct test_data_t)); + + struct ggml_mulmat_tune_time profile_time[GGML_MAX_TASK_PROFILES]; + size_t profile_time_sz = + sizeof(struct ggml_mulmat_tune_time) * GGML_MAX_TASK_PROFILES; + + int n_passed = 0; + for (int i = 0; i < n_tests; i++) { + memset(profile_time, 0, profile_time_sz); + const struct test_data_t *e = &test_data[i]; + + const struct ggml_mulmat_tune_shape *matched_shape = + ggml_mulmat_tune_get_shape(&tune, shape->N, shape->K, + shape->src0_type, shape->src1_type); + GGML_ASSERT(matched_shape); + GGML_ASSERT(matched_shape == shape); + + ggml_mulmat_tune_estimate_time(matched_shape, e->M, profile_time); + + for (int j = 0; j < shape->n_profiles; j++) { + int actual = profile_time[j].total_time; + int expect = e->time[j]; + if (expect != actual) { + fprintf(stderr, + "test fail. i: %d, j: %d, M: %d, expect: " + "%d, actual: %d\n", + i, j, e->M, expect, actual); + } else { + ++n_passed; + } + } + } + + n_tests *= shape->n_profiles; + printf("%2d of %2d pass\n", n_passed, n_tests); + + ggml_mulmat_tune_free(&tune); + + return n_passed == n_tests ? 
0 : 1; +} From 1b041d773730efbfa98743ddc4446bbf14cafc8d Mon Sep 17 00:00:00 2001 From: mqy Date: Wed, 14 Jun 2023 21:17:14 +0800 Subject: [PATCH 02/24] threading test: improve readability at both codes and output --- tests/test-ggml-threading.c | 192 ++++++++++++++++++++---------------- 1 file changed, 108 insertions(+), 84 deletions(-) diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c index 0b47623e2d0c1..ed9d8aa2bcdd8 100644 --- a/tests/test-ggml-threading.c +++ b/tests/test-ggml-threading.c @@ -60,7 +60,11 @@ mock_task_runner(struct ggml_compute_params *params, struct ggml_tensor *node) { } int test_driver(int id, struct ggml_tensor *node, int n_threads) { - printf("\n[test-ggml-threading] #%d, n_threads: %d\n", id, n_threads); + uint8_t loops = node->task_profile.dev_flags[1]; + printf( + "\n[test-ggml-threading] #%02d, workload: %2d million(s), n_threads: " + "%2d\n", + id, loops, n_threads); for (int i = 0; i < n_threads; i++) { work_done_arr[i] = 0; @@ -86,9 +90,8 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) { ctx, node, /*wdata*/ NULL, /*wsize*/ 0); if (err != GGML_COMPUTE_OK) { ggml_threading_stop(ctx); - fprintf(stderr, - "ggml_threading_compute_tensor failed with error: %d.\n", - err); + printf("ggml_threading_compute_tensor failed with error: %d.\n", + err); return 1; } } @@ -99,9 +102,11 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) { int t3 = (int)ggml_time_us(); + const struct ggml_task_stage *stages = node->task_profile.stages; + int expect = 0; for (int i = 0; i < 3; i++) { - struct ggml_task_stage *ts = &node->task_profile.stages[i]; + const struct ggml_task_stage *ts = &stages[i]; if (ts->backend != GGML_TASK_BACKEND_NONE) { if (ts->parallel) { expect += n_threads; @@ -117,16 +122,10 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) { actual += work_done_arr[i]; } - uint8_t loops = node->task_profile.dev_flags[1]; - - printf("\tloops: %2d million(s), ---wait_on_done---: %d\n\tstage-0: " - "(parallel: %d, " - "wait: %d)\n" - "\tstage-1: (parallel: %d, wait: %d)\n", - loops, wait_on_done, node->task_profile.stages[0].parallel, - node->task_profile.stages[0].wait, - node->task_profile.stages[1].parallel, - node->task_profile.stages[1].wait); + printf("\tstage-0: parallel: %d, wait: %d\n\tstage-1: parallel: %d, wait: " + "%d, wait_on_done: %d %s\n", + stages[0].parallel, stages[0].wait, stages[1].parallel, + stages[1].wait, wait_on_done, stages[1].wait ? "<--------" : ""); if (actual == expect) { printf("\tthreading: init %6.3f ms, compute %6.3f ms, cleanup %6.3f " @@ -136,8 +135,7 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) { return 0; } - fprintf(stderr, "\t== failed. expect %d done, actual %d done\n\n", expect, - actual); + printf("\t== failed. expect %d done, actual %d done\n\n", expect, actual); return 2; } @@ -172,8 +170,7 @@ int test_fallback(struct ggml_tensor *node) { ggml_threading_stop(ctx); if (err != GGML_COMPUTE_OK) { - fprintf(stderr, - "ggml_threading_compute_tensor failed with error: %d.\n", err); + printf("ggml_threading_compute_tensor failed with error: %d.\n", err); return 1; } @@ -195,8 +192,6 @@ int main(void) { int n_passed = 0; int n_tests = 0; - int parallel[3] = {0, 1, 2}; - // In github build actions (windows-latest-cmake and ubuntu-latest-cmake): // When n_threads >= 4, the thread init time and compute time suddenly goes // down to 100x ~ 1000x slow -- comparing to n_threads == 2. 
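(An aside on `estimate_time_non_zero_NK` from the tune test added in the previous patch: the expected `time` values can be checked by hand, assuming `ggml_mulmat_tune_estimate_time` interpolates and extrapolates each stage time linearly in M between the two benchmarked points, which is what the expected values imply. A sketch of that arithmetic:)

```
// Hand-check of the expected totals in tests/test-ggml-tune.c.
// Profile 0 (pure CPU) was benchmarked at M=2 (init 2, compute 4) and
// M=4 (init 4, compute 8), so linearly: init(M) = M, compute(M) = 2*M.
static int expected_cpu_total(int M) {
    return M + 2 * M; // -> 3, 6, 9, 12, 15 for M = 1..5, matching test_data
}
// Profile 1 (CPU + CPU_BLAS) has constant stage times 4 + 4 at both
// benchmarked M values, so its estimate stays 8 for every M.
```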
@@ -214,13 +209,47 @@ int main(void) { // average time, thus greatly punishes those small workloads. // - wait_on_done is general faster than wait_now, can be 10x faster. - int threads_arr[] = {1, 2, 4, 8}; + int threads_arr[] = {1, 2, 4, 6, 8, 16}; int threads_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]); // millions of loops. uint8_t workload_arr[] = {0u, 1u, 10u}; int workload_arr_len = sizeof(workload_arr) / sizeof(workload_arr[0]); + // skip slow/big n_threads. + for (int i = 0; i < threads_arr_len; i++) { + int n_threads = threads_arr[i]; + + if (n_threads == 1) { + continue; + } else if (n_threads > MAX_N_THREADS) { + printf("[test-ggml-threading] warning: the n_threads (%d) is too " + "big, allow at most %d, skip.\n", + n_threads, MAX_N_THREADS); + threads_arr[i] = 0; + continue; + } + + // skip this n_threads when too slow. + int t0 = (int)ggml_time_us(); + + struct ggml_threading_context *ctx = + ggml_threading_start(n_threads, ggml_threading_graph_compute_thread, + mock_task_runner, 0, /*stages_time*/ NULL); + + int t1 = (int)ggml_time_us(); + + ggml_threading_stop(ctx); + + int elapsed_us = t1 - t0; + if (elapsed_us > 500 * n_threads) { + printf("[test-ggml-threading] warning: it took took %.3f " + "ms to start %d worker thread(s). Loo slow, skip.\n", + 1.0 * elapsed_us / 1000, n_threads - 1); + threads_arr[i] = 0; + } + } + // node.task_profile.dev_flags: byte 0 for wait_on_done, byte 1 for loops. for (int x = 0; x < workload_arr_len; x++) { @@ -228,101 +257,96 @@ int main(void) { for (int i = 0; i < threads_arr_len; i++) { int n_threads = threads_arr[i]; - if (n_threads > MAX_N_THREADS) { - abort(); + if (n_threads <= 0) { + continue; } - printf("\n[test-ggml-threading] ==== n_nodes: %d, n_threads: %d, " - "loops: %2d million(s) ====\n", - n_repeat, n_threads, workload_arr[x]); - - if (n_threads > 1) { // skip this n_threads when too slow. 
- int t0 = (int)ggml_time_us(); + printf("\n[test-ggml-threading] ==== workload: %2d million(s), " + "n_threads: %2d ====\n", + workload_arr[x], n_threads); - struct ggml_threading_context *ctx = ggml_threading_start( - n_threads, ggml_threading_graph_compute_thread, - mock_task_runner, 0, /*stages_time*/ NULL); + // multi-threads: parallel + wait_now/wait_on_done - int t1 = (int)ggml_time_us(); + if (n_threads == 1) { + stages[0].parallel = false; + stages[1].parallel = false; + stages[0].wait = false; + stages[1].wait = false; - ggml_threading_stop(ctx); + node.task_profile.dev_flags[0] = 0u; - int elapsed_us = t1 - t0; - if (elapsed_us > 500 * n_threads) { - fprintf(stderr, - "[test-ggml-threading] warning: it took took %.3f " - "ms to start %d worker thread(s).\n", - 1.0 * elapsed_us / 1000, n_threads - 1); - fprintf(stderr, "[test-ggml-threading] warning: looks like " - "the environment is too slow to run this " - "number of threads, skip.\n"); - continue; + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; } + continue; } - // multi-threads: parallel + wait_now/wait_on_done - - if (n_threads == 1) { + { // no parallel, no wait stages[0].parallel = false; stages[1].parallel = false; stages[0].wait = false; stages[1].wait = false; + node.task_profile.dev_flags[0] = 0u; + n_tests++; if (test_driver(n_tests, &node, n_threads) == 0) { n_passed++; } - continue; } - for (int j = 0; j < 3; j++) { + { // both parallel, no wait + stages[0].parallel = true; + stages[1].parallel = true; stages[0].wait = false; stages[1].wait = false; + node.task_profile.dev_flags[0] = 0u; - if (parallel[j] == 0) { - stages[0].parallel = false; - stages[1].parallel = false; + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; + } + } + + { // stage 0 parallel, stage 1 may wait + stages[0].parallel = true; + stages[1].parallel = false; + stages[0].wait = false; + + { // stage 1 no wait + stages[1].wait = false; + node.task_profile.dev_flags[0] = 0u; n_tests++; if (test_driver(n_tests, &node, n_threads) == 0) { n_passed++; } - } else if (parallel[j] == 1) { - stages[0].parallel = true; - stages[1].parallel = false; - - for (int k = 0; k < 2; k++) { - stages[1].wait = (k == 1); - - if (!stages[1].wait) { - n_tests++; - if (test_driver(n_tests, &node, n_threads) == 0) { - n_passed++; - } - continue; - } + } + + { // stage 1 wait + stages[1].wait = true; + if (stages[1].parallel) { + abort(); + } - // wait - - for (int m = 0; m < 2; m++) { - if (m == 1) { - node.task_profile.dev_flags[0] = 1u; - } - n_tests++; - if (test_driver(n_tests, &node, n_threads) == 0) { - n_passed++; - } - node.task_profile.dev_flags[0] = 0u; + { // disable wait_on_done + node.task_profile.dev_flags[0] = 0u; // wait now. 
+ + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; } } - } else { - stages[0].parallel = true; - stages[1].parallel = true; - n_tests++; - if (test_driver(n_tests, &node, n_threads) == 0) { - n_passed++; + { // enable wait_on_done + node.task_profile.dev_flags[0] = 1u; // wait on done + + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; + } } } } From 48016f685c9377a5bfe9810c3f736bb2fa527125 Mon Sep 17 00:00:00 2001 From: mqy Date: Thu, 15 Jun 2023 06:43:08 +0800 Subject: [PATCH 03/24] bulk refactored task profile to support complete fallback; enable tune by default for ease of dev --- CMakeLists.txt | 8 +- Makefile | 8 +- examples/common.cpp | 6 +- examples/main/main.cpp | 2 +- examples/mulmat-tune/README.md | 21 +- examples/mulmat-tune/mulmat-tune.cpp | 2 - examples/perplexity/perplexity.cpp | 2 +- ggml-threading.c | 18 +- ggml-tune.c | 59 ++- ggml-tune.h | 8 +- ggml.c | 615 +++++++++++++++------------ ggml.h | 28 +- llama.cpp | 10 +- tests/test-ggml-threading.c | 11 + tests/test-ggml-tune.c | 30 +- 15 files changed, 457 insertions(+), 371 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 832c1e986a6eb..716673da28c5f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,7 +78,7 @@ option(LLAMA_K_QUANTS "llama: use k-quants" option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_SERVER "llama: build server example" OFF) -option(LLAMA_MULMAT_TUNE "llama: mulmat tune" OFF) +option(LLAMA_TUNE "llama: mulmat tune" ON) # # Build info header @@ -278,9 +278,9 @@ if (LLAMA_METAL) ) endif() -if (LLAMA_MULMAT_TUNE) - add_compile_definitions(GGML_USE_MULMAT_TUNE) - add_compile_definitions(GGML_MULMAT_TUNE_NDEBUG) +if (LLAMA_TUNE) + add_compile_definitions(GGML_USE_TUNE) + add_compile_definitions(GGML_TUNE_NDEBUG) endif() if (LLAMA_K_QUANTS) diff --git a/Makefile b/Makefile index a8d1bdc0991ae..531f62fb01347 100644 --- a/Makefile +++ b/Makefile @@ -231,14 +231,14 @@ ifneq ($(filter armv8%,$(UNAME_M)),) CFLAGS += -mfp16-format=ieee -mno-unaligned-access endif -ifdef LLAMA_NO_K_QUANTS +ifndef LLAMA_NO_K_QUANTS k_quants.o: k_quants.c k_quants.h $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_NO_K_QUANTS -ifdef LLAMA_MULMAT_TUNE - CFLAGS += -DGGML_USE_MULMAT_TUNE -DGGML_MULMAT_TUNE_NDEBUG - CXXFLAGS += -DGGML_USE_MULMAT_TUNE +ifndef LLAMA_NO_TUNE +CFLAGS += -DGGML_USE_TUNE -DGGML_TUNE_NDEBUG +CXXFLAGS += -DGGML_USE_TUNE endif # diff --git a/examples/common.cpp b/examples/common.cpp index 882e90c9c3649..fd6df49477949 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -345,7 +345,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.mem_test = true; } else if (arg == "--export") { params.export_cgraph = true; -#ifdef GGML_USE_MULMAT_TUNE +#ifdef GGML_USE_TUNE } else if (arg == "--tune") { params.tune = true; } else if (arg == "--tune-file") { @@ -354,7 +354,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.tune_file = argv[i]; -#endif // GGML_USE_MULMAT_TUNE +#endif // GGML_USE_TUNE } else if (arg == "--verbose-prompt") { params.verbose_prompt = true; } else if (arg == "-r" || arg == "--reverse-prompt") { @@ -508,7 +508,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { #endif fprintf(stderr, " --mtest compute maximum memory usage\n"); fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n"); -#ifdef 
GGML_USE_MULMAT_TUNE +#ifdef GGML_USE_TUNE fprintf(stderr, " --tune mulmat tune enable. If tune-file is set then exit after bench\n"); fprintf(stderr, " --tune-file FILE mulmat tune data file. If tune is true, then write bench result to this file, else load the file and run\n"); #endif diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 542e463bfe84e..fa243ce95cbb0 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -117,7 +117,7 @@ int main(int argc, char ** argv) { return 1; } -#ifdef GGML_USE_MULMAT_TUNE +#ifdef GGML_USE_TUNE if (params.tune || !params.tune_file.empty()) { bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str()); if (!ok || (params.tune && !params.tune_file.empty())) { diff --git a/examples/mulmat-tune/README.md b/examples/mulmat-tune/README.md index cff8a3d6467ea..df023757a85b1 100644 --- a/examples/mulmat-tune/README.md +++ b/examples/mulmat-tune/README.md @@ -23,13 +23,13 @@ run bench ahead of time (saving tens of seconds), but there are two shortcomings Makefile: ``` -make clean && LLAMA_MULMAT_TUNE=1 make +make clean && make ``` CMake (with BLAS): ``` cmake --build . --target clean -cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON +cmake .. -DLLAMA_BLAS=ON cmake --build . --config Release ``` @@ -52,13 +52,13 @@ Run examples: Makefile: ``` -make clean && LLAMA_MULMAT_TUNE=1 make +make clean && make ``` CMake (with BLAS) ``` cmake --build . --target clean -cmake .. -DLLAMA_BLAS=ON -DLLAMA_MULMAT_TUNE=ON +cmake .. -DLLAMA_BLAS=ON cmake --build . --config Release ``` @@ -103,22 +103,29 @@ setup properly. General steps: 1. run `./mulmat-tune -h` to see how to build for misc vendors. - you can build with `GGML_MULMAT_TUNE_NDEBUG=` to enable the the debug, e.g: + To enable the debug, comment out `-DGGML_TUNE_NDEBUG` from makefile then run: + ``` - make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1 LLAMA_NO_ACCELERATE=1 LLAMA_CLBLAST=1 make + make clean; make ``` + On `macOS`, `ACCELERATE` is enabled by default. When `ACCELERATE` is built along with `CUDA` or `CL`, you may not see `CUDA` or `CL` from debug because `CPU` - or `CPU_BLAS` is more faster (as of the estimation from mulmat tune). + or `CPU_BLAS` is more faster (as of the estimation from mulmat tune), try run + with `-t 1`? 2. create a small prompt file: + ``` head -n 5 ./models/wikitext-2-raw/wiki.valid.raw > ./models/wiki.valid-5.raw ``` + 3. run any of the following example commands. + ``` ./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 1 -b 32 ./perplexity -m models/7B/ggml-model-q4_0.bin -f ./models/wiki.valid-5.raw -c 128 --mlock -t 4 -b 64 ``` + * `--mlock` is recommended for `macOS`, you may not want to use it. * don't change `-c 128`: too large `context size` causes 0 perplexity trunk. * `-t` is the number of threads, recommend `1`, `2`, `4` or `6`. 
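The command-line flow above is also available programmatically: `main` and `perplexity` simply call the new `llama_mulmat_tune()` API from this patch. A minimal sketch of that call from application code; the file name is an arbitrary example, and per the `--tune-file` help text, `tune=true` writes the bench result to the file while `tune=false` loads it:

```
#include <stdbool.h>
#include "llama.h"

// Bench mul-mat task profiles once and persist the result; a later run can
// pass tune=false with the same file name to just load the saved data.
static bool run_mulmat_tune(struct llama_context *ctx, int n_threads) {
#ifdef GGML_USE_TUNE
    return llama_mulmat_tune(ctx, n_threads, /*tune=*/true, "mulmat-tune.txt");
#else
    (void) ctx; (void) n_threads;
    return true;
#endif
}
```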
diff --git a/examples/mulmat-tune/mulmat-tune.cpp b/examples/mulmat-tune/mulmat-tune.cpp index 62f1da27764b9..ab3334d763870 100644 --- a/examples/mulmat-tune/mulmat-tune.cpp +++ b/examples/mulmat-tune/mulmat-tune.cpp @@ -262,8 +262,6 @@ int main(int argc, char **argv) { struct ggml_mulmat_tune_params params; memset(¶ms, 0, sizeof(struct ggml_mulmat_tune_params)); - ggml_mulmat_init_task_profiles(); - ggml_mulmat_tune_model_init(¶ms.model, model_name, ftype); params.m_num = m_num; params.n_pass = n_pass; diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 1f14c18def3a3..2cdd9db060d25 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -158,7 +158,7 @@ int main(int argc, char ** argv) { return 1; } -#ifdef GGML_USE_MULMAT_TUNE +#ifdef GGML_USE_TUNE if (params.tune || !params.tune_file.empty()){ bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str()); if (!ok || (params.tune && !params.tune_file.empty())) { diff --git a/ggml-threading.c b/ggml-threading.c index cf17793f6be61..6dd6d2817eff0 100644 --- a/ggml-threading.c +++ b/ggml-threading.c @@ -394,7 +394,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) { enum ggml_compute_error err = shared->task_runner(&state->params, state->node); - GGML_ASSERT(err == GGML_COMPUTE_OK || err == GGML_COMPUTE_FALLBACK); + GGML_ASSERT(err == GGML_COMPUTE_OK); ggml_spin_lock(&shared->spin); @@ -433,7 +433,11 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, // This is the params for main thread. struct ggml_compute_params params; - enum ggml_compute_error err; + enum ggml_compute_error err = GGML_COMPUTE_OK; + +START: + + memset(¶ms, 0, sizeof(struct ggml_compute_params)); for (int type = GGML_TASK_INIT; type <= GGML_TASK_FINALIZE; type++) { if (node->task_profile.stages[type].backend == GGML_TASK_BACKEND_NONE) { @@ -504,11 +508,19 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, } if (err != GGML_COMPUTE_OK) { + if (err == GGML_COMPUTE_FALLBACK) { + struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; + int n = ggml_get_task_profiles(node, profiles); + GGML_ASSERT(n > 0); + memcpy(&node->task_profile, &profiles[0], + sizeof(struct ggml_task_profile)); + goto START; + } return err; } } - return GGML_COMPUTE_OK; + return err; } struct ggml_threading_context * diff --git a/ggml-tune.c b/ggml-tune.c index fbca953ed469e..52ca96bf302fa 100644 --- a/ggml-tune.c +++ b/ggml-tune.c @@ -55,7 +55,7 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile( struct ggml_mulmat_tune_time profiles_time[GGML_MAX_TASK_PROFILES] = {0}; - struct ggml_task_profile *prof = NULL; + const struct ggml_task_profile *prof = NULL; if (e->M == M && e->N == N && e->K == K) { prof = e->profile; @@ -97,10 +97,7 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile( e->N = N; e->K = K; - // to disable this, build with - // `make clean; LLAMA_MULMAT_TUNE=1 LLAMA_MULMAT_TUNE_NDEBUG=1 - // make` -#if !defined(GGML_MULMAT_TUNE_NDEBUG) +#ifndef GGML_TUNE_NDEBUG const char *names[3]; for (int i = 0; i < 3; i++) { names[i] = ggml_mulmat_tune_task_backend_name( @@ -163,8 +160,8 @@ void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model, bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune, struct ggml_mulmat_tune_params *params, - struct ggml_task_profile_factory *pf) { - + ggml_task_profiles_provider *profiles_provider) { + GGML_ASSERT(profiles_provider); struct 
ggml_mulmat_tune_model *model = ¶ms->model; memset(tune, 0, sizeof(struct ggml_mulmat_tune)); @@ -208,8 +205,20 @@ bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune, for (int i = 0; i < tune->n_shapes; i++) { struct ggml_mulmat_tune_shape *shape = &tune->shapes[i]; - shape->n_profiles = ggml_mulmat_get_task_profiles( - pf, shape->src0_type, shape->src1_type, &shape->profiles); + + struct ggml_tensor src0 = { + .type = shape->src0_type, + }; + struct ggml_tensor src1 = { + .type = shape->src1_type, + }; + struct ggml_tensor node = { + .op = GGML_OP_MUL_MAT, + .src0 = &src0, + .src1 = &src1, + }; + + shape->n_profiles = profiles_provider(&node, shape->profiles); if (shape->n_profiles == 0) { // allowed for testing. continue; @@ -304,9 +313,20 @@ ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune, for (int i = 0; i < tune->n_shapes; i++) { const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i]; - struct ggml_task_profile *builtin_profiles = NULL; - int n_profiles = ggml_mulmat_get_task_profiles( - NULL, shape->src0_type, shape->src1_type, &builtin_profiles); + struct ggml_tensor src0 = { + .type = shape->src0_type, + }; + struct ggml_tensor src1 = { + .type = shape->src1_type, + }; + struct ggml_tensor node = { + .op = GGML_OP_MUL_MAT, + .src0 = &src0, + .src1 = &src1, + }; + + struct ggml_task_profile builtin_profiles[GGML_MAX_TASK_PROFILES]; + int n_profiles = ggml_get_task_profiles(&node, builtin_profiles); if (n_profiles != shape->n_profiles) { snprintf(errbuf, errbuf_len - 1, "task profiles mismatch"); @@ -382,13 +402,6 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { memset(shape->items, 0, item_size); } - { - size_t sz = sizeof(struct ggml_task_profile) * shape->n_profiles; - shape->profiles = malloc(sz); - GGML_ASSERT(shape->profiles); - memset(shape->profiles, 0, sz); - } - for (int ip = 0; ip < shape->n_profiles; ip++) { struct ggml_task_profile *profile = &shape->profiles[ip]; for (int j = 0; j < 3; j++) { @@ -468,7 +481,7 @@ bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, } } - struct ggml_task_profile *profile = &shape->profiles[ip]; + const struct ggml_task_profile *profile = &shape->profiles[ip]; for (int k = 0; k < 3; k++) { if (profile->stages[k].backend != GGML_TASK_BACKEND_NONE) { rc = fprintf(fp, "%9d", item->stages_time[k]); @@ -537,7 +550,7 @@ void ggml_mulmat_tune_estimate_time( const int max_m = shape->items[m_num - 1].M; for (int ip = 0; ip < shape->n_profiles; ip++) { - struct ggml_task_profile *profile = &shape->profiles[ip]; + const struct ggml_task_profile *profile = &shape->profiles[ip]; profile_time[ip].total_time = 0; profile_time[ip].profile = profile; @@ -573,7 +586,7 @@ void ggml_mulmat_tune_estimate_time( GGML_ASSERT(p0 && p1); for (int i_stage = 0; i_stage < 3; i_stage++) { - struct ggml_task_stage *stage = &profile->stages[i_stage]; + const struct ggml_task_stage *stage = &profile->stages[i_stage]; if (stage->backend == GGML_TASK_BACKEND_NONE) { continue; } @@ -736,7 +749,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, return false; } - bool ok = ggml_mulmat_tune_init(tune, params, NULL); + bool ok = ggml_mulmat_tune_init(tune, params, ggml_get_task_profiles); if (!ok) { return false; } diff --git a/ggml-tune.h b/ggml-tune.h index 404f1f1c4a53f..04b25873c932f 100644 --- a/ggml-tune.h +++ b/ggml-tune.h @@ -46,7 +46,7 @@ struct ggml_mulmat_tune_shape { enum ggml_type src1_type; int n_profiles; - struct ggml_task_profile *profiles; + struct ggml_task_profile 
profiles[GGML_MAX_TASK_PROFILES]; int m_num; int *arr_m; @@ -69,7 +69,7 @@ struct ggml_mulmat_tune { }; struct ggml_mulmat_tune_time { - struct ggml_task_profile *profile; + const struct ggml_task_profile *profile; int stage_time[3]; int total_time; }; @@ -78,7 +78,7 @@ struct mm_cache_element { int M; int N; int K; - struct ggml_task_profile *profile; + const struct ggml_task_profile *profile; int stages_time[3]; }; @@ -108,7 +108,7 @@ void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model, bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune, struct ggml_mulmat_tune_params *params, - struct ggml_task_profile_factory *profile_factory); + ggml_task_profiles_provider *profiles_provider); void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune); diff --git a/ggml.c b/ggml.c index 5d0b83b1de198..b75f33b88d877 100644 --- a/ggml.c +++ b/ggml.c @@ -144,7 +144,7 @@ inline static void* ggml_aligned_malloc(size_t size) { #include "ggml-opencl.h" #endif -#if defined(GGML_USE_MULMAT_TUNE) +#if defined(GGML_USE_TUNE) #include "ggml-tune.h" #endif @@ -4043,8 +4043,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { ggml_cl_init(); #endif - ggml_mulmat_init_task_profiles(); - is_first_call = false; } @@ -15524,164 +15522,254 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg return result; } -// ---- mulmat task profiles ---- +// ---- task profiles ---- -static struct ggml_task_profile_factory default_task_profile_factory = {0}; +// Implement `ggml_task_profiles_provider`. +// Fill `profiles` for the `node` and return number of profiles. +// +// NOTE: the node may be incompleted from testing or tunning, so please assert +// everything used here. +inline int ggml_get_task_profiles( + struct ggml_tensor *node, + struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]) { + GGML_ASSERT(node); + GGML_ASSERT(node->op >= 0); + GGML_ASSERT(profiles); -// TODO: thread unsafe. Should be initialized once. 
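A usage note for the new provider: callers no longer reach into a global factory, they ask `ggml_get_task_profiles()` for the node's profile list and pick an entry. A minimal sketch of the common "reset to default" move, mirroring what `ggml_threading_compute_tensor()` now does on `GGML_COMPUTE_FALLBACK`:

```
#include <string.h>
#include "ggml.h"

// Pin a node to the default (plain CPU) task profile. Index 0 is always the
// CPU profile; BLAS/GPU entries follow only when those backends are built in.
static void set_default_task_profile(struct ggml_tensor *node) {
    struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES];
    int n = ggml_get_task_profiles(node, profiles);
    GGML_ASSERT(n > 0);
    memcpy(&node->task_profile, &profiles[0], sizeof(struct ggml_task_profile));
}
```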
-void ggml_mulmat_init_task_profiles(void) { - const size_t sz = sizeof(struct ggml_task_profile_factory); - memset(&default_task_profile_factory, 0, sz); + memset(profiles, 0, + sizeof(struct ggml_task_profile) * GGML_MAX_TASK_PROFILES); + + struct ggml_task_profile *p = profiles; + int n_profiles = 0; + + switch (node->op) { + case GGML_OP_CPY: + case GGML_OP_DUP: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + n_profiles = 1; + } break; + case GGML_OP_ADD: + case GGML_OP_ADD1: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_ACC: { + p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_SUB: + case GGML_OP_DIV: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_LOG: + case GGML_OP_SUM: + case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: + case GGML_OP_REPEAT: + case GGML_OP_REPEAT_BACK: + case GGML_OP_ABS: + case GGML_OP_SGN: + case GGML_OP_NEG: + case GGML_OP_STEP: + case GGML_OP_RELU: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + n_profiles = 1; + } break; + case GGML_OP_MUL: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_GELU: + case GGML_OP_SILU: + case GGML_OP_SILU_BACK: + case GGML_OP_NORM: + case GGML_OP_RMS_NORM: + case GGML_OP_RMS_NORM_BACK: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_MUL_MAT: + case GGML_OP_OUT_PROD: { + GGML_ASSERT(node->src0); + GGML_ASSERT(node->src1); + + enum ggml_type src0_t = node->src0->type; + enum ggml_type src1_t = node->src1->type; + + GGML_ASSERT(src1_t == GGML_TYPE_F32); - // f32 - { - struct ggml_task_profile *p = default_task_profile_factory.f32_f32; int i = 0; - - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].parallel = true; - i++; + if (src0_t == GGML_TYPE_F32) { + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].parallel = true; + i++; #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; - p[i].stages[1].wait = true; - i++; + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; + p[i].stages[1].wait = true; + i++; #endif #if defined(GGML_USE_CUBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; - p[i].stages[1].wait = true; - i++; + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; + p[i].stages[1].wait = true; + i++; #elif defined(GGML_USE_CLBLAST) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; - p[i].stages[1].wait = true; - i++; + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; + p[i].stages[1].wait = true; + i++; #endif - default_task_profile_factory.n_f32_f32 = i; - } - - // f16 - { - struct ggml_task_profile *p = default_task_profile_factory.f16_f32; - int i = 0; - - p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].parallel = true; - i++; + } else if (src0_t == GGML_TYPE_F16) { + p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].parallel = true; + i++; #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; - p[i].stages[1].wait = true; - i++; + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; + p[i].stages[1].wait = true; + i++; #endif #if 
defined(GGML_USE_CUBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; - p[i].stages[1].wait = true; - i++; + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; + p[i].stages[1].wait = true; + i++; #elif defined(GGML_USE_CLBLAST) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; - p[i].stages[1].wait = true; - i++; + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; + p[i].stages[1].wait = true; + i++; #endif - default_task_profile_factory.n_f16_f32 = i; - } - - // qxx - { - struct ggml_task_profile *p = default_task_profile_factory.qxx_f32; - int i = 0; - - p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].parallel = true; - i++; + } else if (ggml_is_quantized(src0_t)) { + p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[1].parallel = true; + i++; #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[0].parallel = true; - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; - p[i].stages[1].wait = true; - i++; + p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[i].stages[0].parallel = true; + p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; + p[i].stages[1].wait = true; + i++; #endif #if defined(GGML_USE_CUBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; - p[i].stages[1].wait = true; - i++; + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; + p[i].stages[1].wait = true; + i++; #elif defined(GGML_USE_CLBLAST) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; - p[i].stages[1].wait = true; - i++; + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; + p[i].stages[1].wait = true; + i++; #endif - default_task_profile_factory.n_qxx_f32 = i; - } -} - -int ggml_mulmat_get_task_profiles(struct ggml_task_profile_factory *pf, - enum ggml_type src0_t, enum ggml_type src1_t, - struct ggml_task_profile **profiles) { - GGML_ASSERT(profiles); - - if (pf == NULL) { - pf = &default_task_profile_factory; - } - - GGML_ASSERT(src1_t == GGML_TYPE_F32); - - if (src0_t == GGML_TYPE_F32) { - *profiles = pf->f32_f32; - return pf->n_f32_f32; - } - - if (src0_t == GGML_TYPE_F16) { - *profiles = pf->f16_f32; - return pf->n_f16_f32; - } - - if (ggml_is_quantized(src0_t)) { - *profiles = pf->qxx_f32; - return pf->n_qxx_f32; - } - - GGML_ASSERT(false); -} - -static const struct ggml_task_profile * -ggml_mulmat_get_default_task_profile(struct ggml_task_profile_factory *pf, - enum ggml_type src0_type, - enum ggml_type src1_type) { - GGML_ASSERT(src1_type == GGML_TYPE_F32); - if (pf == NULL) { - pf = &default_task_profile_factory; - } - - struct ggml_task_profile *p = NULL; - - if (src0_type == GGML_TYPE_F32) { - p = &pf->f32_f32[0]; - } else if (src0_type == GGML_TYPE_F16) { - p = &pf->f16_f32[0]; - } else if (ggml_is_quantized(src0_type)) { - p = &pf->qxx_f32[0]; - } else { + } + n_profiles = i; + } break; + case GGML_OP_SCALE: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_SET: { + p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + n_profiles = 1; + } break; + case GGML_OP_CONT: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_GET_ROWS: + case GGML_OP_GET_ROWS_BACK: + case GGML_OP_DIAG: + case GGML_OP_DIAG_MASK_ZERO: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + n_profiles = 1; + } 
break; + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX: + case GGML_OP_SOFT_MAX_BACK: + case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_ALIBI: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + n_profiles = 1; + } break; + case GGML_OP_CLAMP: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + n_profiles = 1; + } break; + case GGML_OP_CONV_1D_1S: + case GGML_OP_CONV_1D_2S: { + p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_FLASH_ATTN: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_FLASH_FF: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } + case GGML_OP_FLASH_ATTN_BACK: { + p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_MAP_UNARY: + case GGML_OP_MAP_BINARY: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + n_profiles = 1; + } break; + case GGML_OP_CROSS_ENTROPY_LOSS: + p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + p[0].stages[2].backend = GGML_TASK_BACKEND_CPU; + n_profiles = 1; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { + p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].parallel = true; + n_profiles = 1; + } break; + case GGML_OP_NONE: + case GGML_OP_COUNT: { + GGML_ASSERT(false); + } break; + default: GGML_ASSERT(false); } - for (int i = 0; i < 3; i++) { - GGML_ASSERT(p->stages[i].backend == GGML_TASK_BACKEND_CPU || - p->stages[i].backend == GGML_TASK_BACKEND_NONE); - } - - return p; + GGML_ASSERT(n_profiles > 0 && n_profiles <= GGML_MAX_TASK_PROFILES); + return n_profiles; } // Set task profile for GGML_OP_MUL_MAT or GGML_OP_OUT_PROD. -static void ggml_mulmat_set_tensor_task_profile(struct ggml_tensor *node, - struct ggml_mulmat_tune *tune) { +static const struct ggml_task_profile *ggml_mulmat_get_task_profile( + struct ggml_tensor *node, struct ggml_task_profile *profiles, + int n_profiles, struct ggml_mulmat_tune *tune, int stages_time_us[3]) { + GGML_ASSERT(node); GGML_ASSERT(node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_OUT_PROD); + GGML_ASSERT(profiles); + GGML_ASSERT(n_profiles >= 2); enum ggml_type src0_t = node->src0->type; enum ggml_type src1_t = node->src1->type; @@ -15697,42 +15785,26 @@ static void ggml_mulmat_set_tensor_task_profile(struct ggml_tensor *node, int N = (int)node->ne[0]; int K = (int)node->src1->ne[0]; - struct ggml_task_profile *profiles = NULL; - int n_profiles = ggml_mulmat_get_task_profiles(NULL, src0_t, src1_t, &profiles); - GGML_ASSERT(n_profiles >= 2); - GGML_ASSERT(profiles); - const struct ggml_task_profile *prof = NULL; if (cond_match) { -#if defined(GGML_USE_MULMAT_TUNE) +#if defined(GGML_USE_TUNE) if (tune != NULL) { - int stages_time_us[3]; - prof = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, src1_t, stages_time_us); + prof = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, + src1_t, stages_time_us); if (prof != NULL) { - GGML_ASSERT(prof); - memcpy(&node->task_profile, prof, sizeof(struct ggml_task_profile)); - // Do not wait if the estimated execution time is too small (e.g. 
less than 0.1 ms) - // TODO: need bench actual wait/notify time, see ggml-threading.c - for (int i = 0; i < 3; i++) { - if (node->task_profile.stages[i].wait) { - if (stages_time_us[i] < 100) { - node->task_profile.stages[i].wait = false; - } - } - } - return; + return prof; } } #else UNUSED(tune); + UNUSED(stages_time_us); #endif if (prof == NULL && M >= 32 && N >= 32 && K >= 32) { for (int j = 0; j < n_profiles; j++) { enum ggml_task_backend comp_be = profiles[j].stages[GGML_TASK_COMPUTE].backend; - switch (comp_be) { case GGML_TASK_BACKEND_GPU_CUDA: { GGML_ASSERT(ggml_cpu_has_cublas()); @@ -15753,76 +15825,131 @@ static void ggml_mulmat_set_tensor_task_profile(struct ggml_tensor *node, break; } } + + if (prof) { + break; + } } } } if (prof == NULL) { - prof = ggml_mulmat_get_default_task_profile(NULL, src0_t, src1_t); + prof = &profiles[0]; + GGML_ASSERT(prof->stages[1].backend == GGML_TASK_BACKEND_CPU); } - GGML_ASSERT(prof); - memcpy(&node->task_profile, prof, sizeof(struct ggml_task_profile)); + return prof; } void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { int n_threads = cgraph->n_threads; - if (ggml_cpu_has_blas()) { + struct ggml_threading_context *thrd_ctx = ggml_threading_start( + n_threads, ggml_threading_graph_compute_thread, ggml_compute_forward, + GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL); + + // initialize tasks + work buffer + { + // int64_t t0 = ggml_time_us(); + + size_t work_size = 0; + + struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; + + // thread scheduling for the different operations for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor *node = cgraph->nodes[i]; + struct ggml_tensor * node = cgraph->nodes[i]; + if (node->op == GGML_OP_NONE || node->op == GGML_OP_CONT) { + continue; + } - memset(&node->task_profile, 0, sizeof(struct ggml_task_profile)); - struct ggml_task_stage *stages = node->task_profile.stages; + int n_profiles = ggml_get_task_profiles(node, profiles); - // Adapt node->backend: assume GPU at COMPUTE stage. - if (node->backend > GGML_BACKEND_CPU) { - stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_NONE; - stages[GGML_TASK_FINALIZE].backend = GGML_TASK_BACKEND_NONE; + const struct ggml_task_profile *profile = NULL; - stages[GGML_TASK_COMPUTE].parallel = false; - bool wait = (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL); - stages[GGML_TASK_COMPUTE].wait = wait; + // Adapt node->backend: assume GPU at COMPUTE stage. 
+ if (node->backend == GGML_BACKEND_GPU || + node->backend == GGML_BACKEND_GPU_SPLIT) { + enum ggml_task_backend be; if (ggml_cpu_has_cublas()) { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_GPU_CUDA; + be = GGML_TASK_BACKEND_GPU_CUDA; } else if (ggml_cpu_has_clblast()) { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_GPU_CL; + be = GGML_TASK_BACKEND_GPU_CL; } else { GGML_ASSERT(false); } - } else if (node->op == GGML_OP_MUL_MAT) { - struct ggml_mulmat_tune * tune = NULL; -#if defined(GGML_USE_MULMAT_TUNE) - tune = cgraph->tune; -#endif - ggml_mulmat_set_tensor_task_profile(node, tune); - } else if (node->op == GGML_OP_OUT_PROD) { - ggml_mulmat_set_tensor_task_profile(node, NULL); + + for (int j = 0; j < n_profiles; j++) { + if (profiles[j].stages[1].backend == be) { + profile = &profiles[j]; + break; + } + } + GGML_ASSERT(profile); + } else { + GGML_ASSERT(node->backend == GGML_BACKEND_CPU); } - } - } - struct ggml_threading_context *thrd_ctx = ggml_threading_start( - n_threads, ggml_threading_graph_compute_thread, ggml_compute_forward, - GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL); + bool profile_copied = false; - // initialize tasks + work buffer - { - size_t work_size = 0; + if (node->op == GGML_OP_MUL_MAT) { +#if defined(GGML_USE_TUNE) + int stages_time_us[3]; + profile = ggml_mulmat_get_task_profile( + node, profiles, n_profiles, cgraph->tune, stages_time_us); + GGML_ASSERT(profile); + + if (cgraph->tune) { + memcpy(&node->task_profile, profile, + sizeof(struct ggml_task_profile)); + profile_copied = true; + + // Do not wait if the estimated execution time is too small + // (e.g. less than 0.1 ms) + // TODO: need bench actual wait/notify time, see + // ggml-threading.c + for (int j = 0; j< 3; j++) { + if (node->task_profile.stages[j].wait) { + if (stages_time_us[j] < 100) { + node->task_profile.stages[j].wait = false; + } + } + } + } +#else + profile = ggml_mulmat_get_task_profile(node, profiles, + n_profiles, NULL, NULL); + GGML_ASSERT(profile); +#endif + } else if (node->op == GGML_OP_OUT_PROD) { // FIXME: is is right? + profile = ggml_mulmat_get_task_profile(node, profiles, + n_profiles, NULL, NULL); + GGML_ASSERT(profile); + } else { + profile = &profiles[0]; + GGML_ASSERT(profile->stages[1].backend == + GGML_TASK_BACKEND_CPU); + } + + if (!profile_copied) { + memcpy(&node->task_profile, profile, + sizeof(struct ggml_task_profile)); + } - // thread scheduling for the different operations - for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * node = cgraph->nodes[i]; struct ggml_task_stage *stages = node->task_profile.stages; + // compute stage n_tasks. + int n_tasks = stages[1].parallel ? n_threads : 1; + + // Allocate temp buffer `wdata` for CPU. + // NOTE: GPU MAY fallback to CPU, so we have to cover all possible cases. 
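For orientation before the per-op work-size switch below, a sketch of what typically ends up in `node->task_profile` for a quantized src0 x F32 src1 mat-mul on a CPU-BLAS build (the second profile filled by the quantized branch of `ggml_get_task_profiles()` above): INIT runs in parallel on the CPU (presumably the de-quantization), COMPUTE is single-threaded BLAS with idle workers hinted to wait, FINALIZE is unused.

```
// Illustrative only: equivalent to profiles[1] returned by
// ggml_get_task_profiles() for a quantized mul-mat when
// Accelerate/OpenBLAS is compiled in.
static const struct ggml_task_profile qxx_f32_blas_profile = {
    .stages = {
        [GGML_TASK_INIT]     = { .backend = GGML_TASK_BACKEND_CPU,      .parallel = true },
        [GGML_TASK_COMPUTE]  = { .backend = GGML_TASK_BACKEND_CPU_BLAS, .wait     = true },
        [GGML_TASK_FINALIZE] = { .backend = GGML_TASK_BACKEND_NONE },
    },
};
```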
switch (node->op) { case GGML_OP_CPY: case GGML_OP_DUP: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; size_t cur = 0; if (ggml_is_quantized(node->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads; + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks; } work_size = MAX(work_size, cur); @@ -15830,27 +15957,20 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_ADD: case GGML_OP_ADD1: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; - size_t cur = 0; if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads; + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_tasks; } work_size = MAX(work_size, cur); } break; case GGML_OP_ACC: { - stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; - size_t cur = 0; if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_threads; + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_tasks; } work_size = MAX(work_size, cur); @@ -15870,16 +15990,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_NEG: case GGML_OP_STEP: case GGML_OP_RELU: - { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - } break; case GGML_OP_MUL: - { - if (stages[GGML_TASK_COMPUTE].backend == GGML_TASK_BACKEND_NONE) { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; - } - } break; case GGML_OP_GELU: case GGML_OP_SILU: case GGML_OP_SILU_BACK: @@ -15887,28 +15998,14 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM_BACK: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; } break; case GGML_OP_MUL_MAT: - case GGML_OP_OUT_PROD: + case GGML_OP_OUT_PROD: // FIXME: is is right? { size_t cur = 0; enum ggml_task_backend comp_backend = stages[GGML_TASK_COMPUTE].backend; GGML_ASSERT(comp_backend != GGML_TASK_BACKEND_NONE); - // TODO: remove this check once we are sure `ggml_mulmat_set_tensor_task_profile()` is correct. - if ((comp_backend & GGML_TASK_BACKEND_GPU) || comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { - enum ggml_type src0_t = node->src0->type; - enum ggml_type src1_t = node->src1->type; - bool cond_match = (src0_t == GGML_TYPE_F32 || src0_t == GGML_TYPE_F16 || - ggml_is_quantized(src0_t)) && - src1_t == GGML_TYPE_F32 && node->type == GGML_TYPE_F32 && - ggml_is_contiguous(node->src0) && - ggml_is_contiguous(node->src1); - GGML_ASSERT(cond_match); - } - if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { #if defined(GGML_USE_CLBLAST) cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); @@ -15930,7 +16027,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) GGML_ASSERT(false); } } else if (comp_backend == GGML_TASK_BACKEND_CPU || comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { - // We have to reseve buffer for CUDA because it may fallback to CPU. 
if (comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { GGML_ASSERT(ggml_cpu_has_cublas()); } @@ -15955,13 +16051,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_SCALE: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; } break; case GGML_OP_SET: { - stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_CONT: case GGML_OP_RESHAPE: @@ -15972,33 +16064,18 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_GET_ROWS_BACK: case GGML_OP_DIAG: case GGML_OP_DIAG_MASK_ZERO: - { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - } break; case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX_BACK: case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: - { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; - } break; case GGML_OP_ALIBI: - { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - } break; case GGML_OP_CLAMP: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_CONV_1D_1S: case GGML_OP_CONV_1D_2S: { - stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; - GGML_ASSERT(node->src0->ne[3] == 1); GGML_ASSERT(node->src1->ne[2] == 1); GGML_ASSERT(node->src1->ne[3] == 1); @@ -16026,62 +16103,53 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_FLASH_ATTN: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; - size_t cur = 0; const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*ne11*n_threads; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*n_threads; // this is overestimated by x2 + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 } if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*ne11*n_threads; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*n_threads; // this is overestimated by x2 + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 } work_size = MAX(work_size, cur); } break; case GGML_OP_FLASH_FF: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; size_t cur = 0; if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*node->src1->ne[1]*n_threads; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*n_threads; // this is overestimated by x2 + cur = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2 } if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*node->src1->ne[1]*n_threads; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*n_threads; // this is overestimated by x2 + cur = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2 } work_size = MAX(work_size, cur); } break; case 
GGML_OP_FLASH_ATTN_BACK: { - stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; - size_t cur = 0; const int64_t D = node->src0->ne[0]; const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*mxDn*n_threads; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*n_threads; // this is overestimated by x2 + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 } if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*mxDn*n_threads; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*n_threads; // this is overestimated by x2 + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 } work_size = MAX(work_size, cur); @@ -16089,31 +16157,21 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_CROSS_ENTROPY_LOSS: { - stages[GGML_TASK_INIT].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; - stages[GGML_TASK_FINALIZE].backend = GGML_TASK_BACKEND_CPU; - - size_t cur = ggml_type_size(node->type)*(n_threads + node->src0->ne[0]*n_threads); + size_t cur = ggml_type_size(node->type)*(n_threads + node->src0->ne[0]*n_tasks); work_size = MAX(work_size, cur); } break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; - stages[GGML_TASK_COMPUTE].parallel = true; - - size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_threads; + size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_tasks; work_size = MAX(work_size, cur); } break; case GGML_OP_NONE: { - stages[GGML_TASK_COMPUTE].backend = GGML_TASK_BACKEND_CPU; } break; case GGML_OP_COUNT: { @@ -16134,6 +16192,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size); cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size); } + + // ~ 50 us + //printf("=== prepare computing took %d us\n", (int)(ggml_time_us() - t0)); } const int64_t perf_start_cycles = ggml_perf_cycles(); @@ -16162,16 +16223,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) enum ggml_compute_error err = ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize); - if (err == GGML_COMPUTE_FALLBACK) { - if (node->op == GGML_OP_MUL_MAT) { - const struct ggml_task_profile *p = - ggml_mulmat_get_default_task_profile( - NULL, node->src0->type, node->src1->type); - memcpy(&node->task_profile, p, - sizeof(struct ggml_task_profile)); - } - err = ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize); - } GGML_ASSERT(err == GGML_COMPUTE_OK); // performance stats (node) diff --git a/ggml.h b/ggml.h index f51b658fd3abe..5ab78c4a011b5 100644 --- a/ggml.h +++ b/ggml.h @@ -202,7 +202,7 @@ #define GGML_MAX_OPT 4 #define GGML_MAX_NAME 32 #define GGML_DEFAULT_N_THREADS 4 -#define GGML_MAX_TASK_PROFILES 8 +#define GGML_MAX_TASK_PROFILES 
4 #define GGML_ASSERT(x) \ do { \ @@ -399,17 +399,6 @@ extern "C" { uint8_t dev_flags[4]; }; - struct ggml_task_profile_factory { - struct ggml_task_profile f32_f32[GGML_MAX_TASK_PROFILES]; - int n_f32_f32; - - struct ggml_task_profile f16_f32[GGML_MAX_TASK_PROFILES]; - int n_f16_f32; - - struct ggml_task_profile qxx_f32[GGML_MAX_TASK_PROFILES]; - int n_qxx_f32; - }; - // n-dimensional tensor struct ggml_tensor { enum ggml_type type; @@ -450,6 +439,11 @@ extern "C" { char padding[12]; }; + // Fill `profiles` for the `node` and return number of profiles. + typedef int (ggml_task_profiles_provider) ( + struct ggml_tensor *node, + struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]); + static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); // computation graph @@ -1345,15 +1339,11 @@ extern "C" { GGML_API int ggml_cpu_has_vsx (void); // - // mulmat task profiles + // task profiles // - GGML_API void ggml_mulmat_init_task_profiles(void); - GGML_API int ggml_mulmat_get_task_profiles( - struct ggml_task_profile_factory *pf, - enum ggml_type src0_t, - enum ggml_type src1_t, - struct ggml_task_profile **profiles); + // Implements `ggml_task_profiles_provider`. + GGML_API int ggml_get_task_profiles (struct ggml_tensor *node, struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]); // // Internal types and functions exposed for tests and benchmarks diff --git a/llama.cpp b/llama.cpp index fa5a94e21f0ab..acc0e59f71dbf 100644 --- a/llama.cpp +++ b/llama.cpp @@ -21,7 +21,7 @@ #include "ggml-metal.h" #endif -#ifdef GGML_USE_MULMAT_TUNE +#ifdef GGML_USE_TUNE #include "ggml-tune.h" #endif @@ -285,7 +285,7 @@ struct llama_context { int buf_last = 0; size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; -#ifdef GGML_USE_MULMAT_TUNE +#ifdef GGML_USE_TUNE struct ggml_mulmat_tune *tune = nullptr; #endif @@ -1408,7 +1408,7 @@ static bool llama_eval_internal( ggml_cgraph gf = {}; gf.n_threads = n_threads; -#ifdef GGML_USE_MULMAT_TUNE +#ifdef GGML_USE_TUNE gf.tune =lctx.tune; #endif @@ -2743,7 +2743,7 @@ struct llama_context * llama_init_from_file( return ctx; } -#ifdef GGML_USE_MULMAT_TUNE +#ifdef GGML_USE_TUNE bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) { printf("\n"); if (ctx->model.n_gpu_layers != 0) { @@ -2882,7 +2882,7 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons #endif void llama_free(struct llama_context * ctx) { -#ifdef GGML_USE_MULMAT_TUNE +#ifdef GGML_USE_TUNE if (ctx->tune) { delete(ctx->tune); } diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c index ed9d8aa2bcdd8..deb15fd84d8e7 100644 --- a/tests/test-ggml-threading.c +++ b/tests/test-ggml-threading.c @@ -356,6 +356,17 @@ int main(void) { { ++n_tests; + // required by getting task profiles. + node.op = GGML_OP_MUL_MAT; + struct ggml_tensor src0 = { + .type = GGML_TYPE_Q4_0, + }; + struct ggml_tensor src1 = { + .type = GGML_TYPE_F32, + }; + node.src0 = &src0; + node.src1 = &src1; + node.backend = GGML_BACKEND_GPU; if (test_fallback(&node) == 0) { ++n_passed; diff --git a/tests/test-ggml-tune.c b/tests/test-ggml-tune.c index ed612fff45562..e0a6950d9502b 100644 --- a/tests/test-ggml-tune.c +++ b/tests/test-ggml-tune.c @@ -3,6 +3,8 @@ #include +#define UNUSED(x) (void)(x) + static int bench(void); static int estimate_time_non_zero_NK(void); @@ -77,6 +79,20 @@ static int bench(void) { return ok ? 
0 : 1; } +// implement `ggml_task_profiles_provider` +static int +ggml_task_profiles_mock_qxx_provider(struct ggml_tensor *node, + struct ggml_task_profile *profiles) { + UNUSED(node); + profiles[0].stages[0].backend = GGML_TASK_BACKEND_CPU; + profiles[0].stages[0].backend = GGML_TASK_BACKEND_CPU; + profiles[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + profiles[1].stages[0].backend = GGML_TASK_BACKEND_CPU; + profiles[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; + + return 2; +} + int estimate_time_non_zero_NK(void) { printf("test: %s\n", __func__); @@ -92,22 +108,10 @@ int estimate_time_non_zero_NK(void) { const int m_num = 2; - struct ggml_task_profile_factory pf; - memset(&pf, 0, sizeof(struct ggml_task_profile_factory)); - - { - pf.n_qxx_f32 = 2; - pf.qxx_f32[0].stages[0].backend = GGML_TASK_BACKEND_CPU; - pf.qxx_f32[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - - pf.qxx_f32[1].stages[0].backend = GGML_TASK_BACKEND_CPU; - pf.qxx_f32[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; - } - struct ggml_mulmat_tune_params params; init_params(¶ms, m_num); - ggml_mulmat_tune_init(&tune, ¶ms, &pf); + ggml_mulmat_tune_init(&tune, ¶ms, ggml_task_profiles_mock_qxx_provider); struct ggml_mulmat_tune_shape *shape = NULL; for (int i = 0; i < tune.n_shapes; i++) { From 91062322604974d9afeab58618d88282cafedafd Mon Sep 17 00:00:00 2001 From: mqy Date: Thu, 15 Jun 2023 07:19:00 +0800 Subject: [PATCH 04/24] threading test: At github, Windows can take more than 20 seconds to start 15 threads.Let's silently ignore when we saw two adjacent slowness. --- tests/test-ggml-threading.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c index deb15fd84d8e7..90d53e4cdea2a 100644 --- a/tests/test-ggml-threading.c +++ b/tests/test-ggml-threading.c @@ -217,9 +217,19 @@ int main(void) { int workload_arr_len = sizeof(workload_arr) / sizeof(workload_arr[0]); // skip slow/big n_threads. + + int n_slow = 0; + for (int i = 0; i < threads_arr_len; i++) { int n_threads = threads_arr[i]; + // At github, Windows can take more than 20 seconds to start 15 threads. + // Let's silently ignore when we saw two adjacent slowness. + if (n_slow >= 2) { + threads_arr[i] = 0; + continue; + } + if (n_threads == 1) { continue; } else if (n_threads > MAX_N_THREADS) { @@ -243,10 +253,14 @@ int main(void) { int elapsed_us = t1 - t0; if (elapsed_us > 500 * n_threads) { - printf("[test-ggml-threading] warning: it took took %.3f " - "ms to start %d worker thread(s). Loo slow, skip.\n", + printf("[test-ggml-threading] warning: it took took %7.3f " + "ms to start %2d worker thread(s). Too slow, skip.\n", 1.0 * elapsed_us / 1000, n_threads - 1); threads_arr[i] = 0; + ++n_slow; + } else { + // clear. 
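                // (Editorial comment, not part of the original patch.) Any fast
                // start resets the counter, so the `n_slow >= 2` early-continue at
                // the top of this loop only fires after two slow starts in a row.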
+ n_slow = 0; } } From bb590f14826ccfd0f182d005a3fe6333da14017f Mon Sep 17 00:00:00 2001 From: mqy Date: Thu, 15 Jun 2023 08:28:39 +0800 Subject: [PATCH 05/24] Workrounnd to set node->backend --- ggml-opencl.cpp | 4 ++-- ggml.c | 13 +++++++++++++ tests/test-ggml-tune.c | 1 - 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index b2300a104ddb2..c9151a8e49561 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1599,8 +1599,8 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens // TODO: find the optimal values for these if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32 && - dst->type == GGML_TYPE_F32 && - ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) { + dst->type == GGML_TYPE_F32 /*&& + ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)*/) { return true; } diff --git a/ggml.c b/ggml.c index b75f33b88d877..b734f1a0c4d46 100644 --- a/ggml.c +++ b/ggml.c @@ -15938,6 +15938,18 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) struct ggml_task_stage *stages = node->task_profile.stages; + // Workrounnd to set node->backend. + for (int j = 0; j < 3; j++) { + if (node->backend == GGML_BACKEND_CPU && + (stages[j].backend & GGML_TASK_BACKEND_GPU)) { + if (ggml_cpu_has_cublas() || ggml_cpu_has_clblast()) { + node->backend = GGML_BACKEND_GPU; + } else { + GGML_ASSERT(false); + } + } + } + // compute stage n_tasks. int n_tasks = stages[1].parallel ? n_threads : 1; @@ -16008,6 +16020,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { #if defined(GGML_USE_CLBLAST) + GGML_ASSERT(ggml_cl_can_mul_mat(node->src0, node->src1, node)); cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); #else GGML_ASSERT(false); diff --git a/tests/test-ggml-tune.c b/tests/test-ggml-tune.c index e0a6950d9502b..913d25ff56ff4 100644 --- a/tests/test-ggml-tune.c +++ b/tests/test-ggml-tune.c @@ -85,7 +85,6 @@ ggml_task_profiles_mock_qxx_provider(struct ggml_tensor *node, struct ggml_task_profile *profiles) { UNUSED(node); profiles[0].stages[0].backend = GGML_TASK_BACKEND_CPU; - profiles[0].stages[0].backend = GGML_TASK_BACKEND_CPU; profiles[0].stages[1].backend = GGML_TASK_BACKEND_CPU; profiles[1].stages[0].backend = GGML_TASK_BACKEND_CPU; profiles[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; From 7c05049f8b0ba6e090e5a9d5bb11a6d4c74e4a3f Mon Sep 17 00:00:00 2001 From: mqy Date: Thu, 15 Jun 2023 14:06:11 +0800 Subject: [PATCH 06/24] tunning: check GPU offloading before loading model --- examples/common.cpp | 8 ++++++++ examples/perplexity/perplexity.cpp | 2 +- ggml-tune.c | 22 +++++++++++----------- llama.cpp | 12 +++++------- 4 files changed, 25 insertions(+), 19 deletions(-) diff --git a/examples/common.cpp b/examples/common.cpp index fd6df49477949..09ce484a12a11 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -435,6 +435,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { process_escapes(params.prompt); } +#ifdef GGML_USE_TUNE + if (params.n_gpu_layers > 0) { + if (params.tune || !params.tune_file.empty()) { + fprintf(stderr, "[tune] error: tunning and GPU offloading cannot be used at the same time, abort.\n"); + exit(1); + } + } +#endif return true; } diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 
2cdd9db060d25..4732205160836 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -159,7 +159,7 @@ int main(int argc, char ** argv) { } #ifdef GGML_USE_TUNE - if (params.tune || !params.tune_file.empty()){ + if (params.tune || !params.tune_file.empty()) { bool ok = llama_mulmat_tune(ctx, params.n_threads, params.tune, params.tune_file.c_str()); if (!ok || (params.tune && !params.tune_file.empty())) { llama_free(ctx); diff --git a/ggml-tune.c b/ggml-tune.c index 52ca96bf302fa..0a52443e4685c 100644 --- a/ggml-tune.c +++ b/ggml-tune.c @@ -104,7 +104,7 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile( prof->stages[i].backend); } printf( - "\n[mulmat tune] M: %3d, N: %5d, K: %5d, backends of the " + "\n[tune] M: %3d, N: %5d, K: %5d, backends of the " "fastest profile: %s %s %s\n", M, N, K, names[0], names[1], names[2]); #endif @@ -358,7 +358,7 @@ bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune, bool ok = ggml_mulmat_tune_validate_internal(tune, model, ftype, n_threads, errbuf, sizeof(errbuf)); if (!ok) { - fprintf(stderr, "[mulmat tune] error: %s. run bench again.\n", errbuf); + fprintf(stderr, "[tune] error: %s. run bench again.\n", errbuf); } return ok; @@ -371,7 +371,7 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { } if (tune->version != GGML_MULMAT_TUNE_VERSION) { - fprintf(stderr, "[mulmat tune] version mismatch, run bench again\n"); + fprintf(stderr, "[tune] version mismatch, run bench again\n"); return false; } @@ -396,7 +396,7 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { (shape->n_profiles * shape->m_num); shape->items = malloc(item_size); if (shape->items == NULL) { - fprintf(stderr, "[mulmat tune] failed to allocate memory\n"); + fprintf(stderr, "[tune] failed to allocate memory\n"); return false; } memset(shape->items, 0, item_size); @@ -708,7 +708,7 @@ static size_t ggml_mulmat_allocate_wdata(int N, int K, char **wdata) { if (!buf) { fprintf(stderr, - "[mulmat tune] error: failed to allocate %zu MiB memory", + "[tune] error: failed to allocate %zu MiB memory", sz / 1024 / 1024); return 0; } @@ -745,7 +745,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends); if (n_backends < 2) { fprintf(stderr, - "[mulmat tune] error: this program was not built with BLAS.\n"); + "[tune] error: this program was not built with BLAS.\n"); return false; } @@ -770,7 +770,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, } fprintf(stdout, - "[mulmat tune] model: %s, ggml ftype: %d, " + "[tune] model: %s, ggml ftype: %d, " "n_pass: %d, n_threads: %d, n_shapes: %d, backends: %s\n", params->model.name, params->model.ftype, params->n_pass, params->n_threads, tune->n_shapes, buf); @@ -871,7 +871,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, ggml_threading_stop(thrd_ctx); - fprintf(stdout, "[mulmat tune] done, elapsed time: %d seconds.\n", + fprintf(stdout, "[tune] done, elapsed time: %d seconds.\n", (int)(ggml_time_ms() - t0) / 1000); // output @@ -880,7 +880,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, FILE *fp = fopen(params->fname, "w"); if (!fp) { fprintf(stderr, - "[mulmat tune] warn: failed to open file `%s`, print to " + "[tune] warn: failed to open file `%s`, print to " "console instead\n\n", params->fname); params->output_console = 1; @@ -889,12 +889,12 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, fclose(fp); if 
(ok) { - fprintf(stdout, "[mulmat tune] data was written to `%s`\n", + fprintf(stdout, "[tune] data was written to `%s`\n", params->fname); } else { fprintf( stderr, - "[mulmat tune] warn: failed to write file `%s`, print to " + "[tune] warn: failed to write file `%s`, print to " "console instead\n\n", params->fname); params->output_console = 1; diff --git a/llama.cpp b/llama.cpp index acc0e59f71dbf..06555e1dddeb2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2745,11 +2745,9 @@ struct llama_context * llama_init_from_file( #ifdef GGML_USE_TUNE bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) { + GGML_ASSERT (ctx->model.n_gpu_layers == 0); + printf("\n"); - if (ctx->model.n_gpu_layers != 0) { - fprintf(stderr, "[mulmat tune] error: is disabled by GPU offloading\n"); - return false; - } const char *model_name = llama_model_type_name(ctx->model.type); @@ -2855,7 +2853,7 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons if (!empty_fname) { FILE *fp = fopen(fname, "r"); if (!fp) { - fprintf(stderr, "[mulmat tune] failed to open file %s.\n", + fprintf(stderr, "[tune] failed to open file %s.\n", fname); } else { bool ok = ggml_mulmat_tune_read_data(ctx->tune, fp); @@ -2863,12 +2861,12 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons if (!ok) { fprintf(stderr, - "[mulmat tune] failed to read data from %s\n", + "[tune] failed to read data from %s\n", fname); return false; } - fprintf(stderr, "[mulmat tune] loaded data from %s\n", fname); + fprintf(stderr, "[tune] loaded data from %s\n", fname); ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype, params.n_threads); if (!ok) { From 21e9379707ca62e462d4c1a07876d6b9435d5412 Mon Sep 17 00:00:00 2001 From: mqy Date: Thu, 15 Jun 2023 15:57:31 +0800 Subject: [PATCH 07/24] tunning: add f16, todo: f32 failed with CL --- ggml-tune.c | 24 +++++++-------- tests/test-ggml-tune.c | 66 +++++++++++++++++++++++++++++++----------- 2 files changed, 60 insertions(+), 30 deletions(-) diff --git a/ggml-tune.c b/ggml-tune.c index 0a52443e4685c..81b012766124c 100644 --- a/ggml-tune.c +++ b/ggml-tune.c @@ -103,10 +103,9 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile( names[i] = ggml_mulmat_tune_task_backend_name( prof->stages[i].backend); } - printf( - "\n[tune] M: %3d, N: %5d, K: %5d, backends of the " - "fastest profile: %s %s %s\n", - M, N, K, names[0], names[1], names[2]); + printf("\n[tune] M: %3d, N: %5d, K: %5d, backends of the " + "fastest profile: %s %s %s\n", + M, N, K, names[0], names[1], names[2]); #endif } } @@ -707,8 +706,7 @@ static size_t ggml_mulmat_allocate_wdata(int N, int K, char **wdata) { void *buf = malloc(sz); if (!buf) { - fprintf(stderr, - "[tune] error: failed to allocate %zu MiB memory", + fprintf(stderr, "[tune] error: failed to allocate %zu MiB memory", sz / 1024 / 1024); return 0; } @@ -835,8 +833,9 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, stages_time[j] = 0; } - /*enum ggml_compute_error err = */ - ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize); + enum ggml_compute_error err = ggml_threading_compute_tensor( + thrd_ctx, node, wdata, wsize); + GGML_ASSERT(err == GGML_COMPUTE_OK); for (int i = 0; i < 3; i++) { int v = (int)stages_time[i]; @@ -892,11 +891,10 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, fprintf(stdout, "[tune] data was written to `%s`\n", params->fname); } else { - fprintf( - stderr, - "[tune] warn: failed to write file `%s`, print 
to " - "console instead\n\n", - params->fname); + fprintf(stderr, + "[tune] warn: failed to write file `%s`, print to " + "console instead\n\n", + params->fname); params->output_console = 1; } } diff --git a/tests/test-ggml-tune.c b/tests/test-ggml-tune.c index 913d25ff56ff4..a8a2048621eca 100644 --- a/tests/test-ggml-tune.c +++ b/tests/test-ggml-tune.c @@ -8,12 +8,13 @@ static int bench(void); static int estimate_time_non_zero_NK(void); -static void init_params(struct ggml_mulmat_tune_params *params, int m_num) { +static void init_params(struct ggml_mulmat_tune_params *params, + enum ggml_ftype ftype, int m_num, int n_threads) { *params = (struct ggml_mulmat_tune_params){ .model = (struct ggml_mulmat_tune_model){ - .name = "3B", // fake - .ftype = GGML_FTYPE_MOSTLY_Q4_0, + .name = "xB", // fake model name + .ftype = ftype, .n_vocab = 4096, .n_embd = 1024, .n_ff = 2048, @@ -21,7 +22,7 @@ static void init_params(struct ggml_mulmat_tune_params *params, int m_num) { }, .m_num = m_num, .n_pass = 1, - .n_threads = 1, + .n_threads = n_threads, .progress = false, .output_console = true, .fname = NULL}; @@ -45,13 +46,11 @@ int main(void) { } static int bench(void) { - printf("test: %s\n", __func__); - { enum ggml_task_backend backends[16]; int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends); if (n_backends < 2) { - printf("test: %s, skipped because no BLAS\n", __func__); + printf("[test-ggml-tune] skipped because no BLAS\n"); return 0; } } @@ -67,16 +66,48 @@ static int bench(void) { ggml_free(ctx); } - struct ggml_mulmat_tune tune; + // F32: ggml_opencl: ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, + // NULL) error -30 at /Users/mqy/tools/AI/llama.cpp/ggml-opencl.cpp:838 + enum ggml_ftype ftypes[] = { + // GGML_FTYPE_ALL_F32, + GGML_FTYPE_MOSTLY_F16, + GGML_FTYPE_MOSTLY_Q4_0, + }; - struct ggml_mulmat_tune_params params; + int n_ftypes = sizeof(ftypes) / sizeof(ftypes[0]); - init_params(¶ms, /*m_num*/ 4); + const int m_num = 4; - bool ok = ggml_mulmat_tune_bench(&tune, ¶ms); - ggml_mulmat_tune_free(&tune); + // Don't use n_threads larger than 2 because Github build hots has limited + // resource quota. + int threads_arr[] = {1, 2}; + int thread_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]); + + int n_passed = 0; + int n_tests = 0; + + for (int i = 0; i < n_ftypes; i++) { + for (int j = 0; j < thread_arr_len; j++) { + printf("\n"); + + int n_threads = threads_arr[j]; + struct ggml_mulmat_tune tune; + + struct ggml_mulmat_tune_params params; + memset(¶ms, 0, sizeof(struct ggml_mulmat_tune_params)); + init_params(¶ms, ftypes[i], m_num, n_threads); + + ++n_tests; + bool ok = ggml_mulmat_tune_bench(&tune, ¶ms); + if (ok) { + ++n_passed; + } + ggml_mulmat_tune_free(&tune); + } + } - return ok ? 0 : 1; + printf("[test-ggml-tune] %d / %d passed\n", n_passed, n_tests); + return (n_passed == n_tests) ? 0 : 1; } // implement `ggml_task_profiles_provider` @@ -93,7 +124,7 @@ ggml_task_profiles_mock_qxx_provider(struct ggml_tensor *node, } int estimate_time_non_zero_NK(void) { - printf("test: %s\n", __func__); + printf("test-ggml-tune: %s\n", __func__); struct test_data_t { int M; @@ -106,9 +137,10 @@ int estimate_time_non_zero_NK(void) { }; const int m_num = 2; + const int n_threads = 1; // useless. 
struct ggml_mulmat_tune_params params; - init_params(¶ms, m_num); + init_params(¶ms, tune.ftype, m_num, n_threads); ggml_mulmat_tune_init(&tune, ¶ms, ggml_task_profiles_mock_qxx_provider); @@ -123,8 +155,8 @@ int estimate_time_non_zero_NK(void) { GGML_ASSERT(shape->n_profiles == 2); GGML_ASSERT(ggml_is_quantized(shape->src0_type)); - printf("shape: N: %d, K: %d, n_profiles: %d\n", shape->N, shape->K, - shape->n_profiles); + printf("[test-ggml-tune] %s, shape: N: %d, K: %d, n_profiles: %d\n", + __func__, shape->N, shape->K, shape->n_profiles); { shape->items[0] = From 5342dc075ff19862c5da1e5073759fd6c92e4943 Mon Sep 17 00:00:00 2001 From: mqy Date: Thu, 15 Jun 2023 21:34:34 +0800 Subject: [PATCH 08/24] tunning: support k_quants; disabled rope shapes (workaround); make cache thread safe; fixed shape comprison --- examples/mulmat-tune/mulmat-tune.cpp | 4 +- ggml-tune.c | 94 +++++++++++++++++++--------- ggml-tune.h | 22 ++++--- tests/test-ggml-tune.c | 7 ++- 4 files changed, 85 insertions(+), 42 deletions(-) diff --git a/examples/mulmat-tune/mulmat-tune.cpp b/examples/mulmat-tune/mulmat-tune.cpp index ab3334d763870..55dd1927588de 100644 --- a/examples/mulmat-tune/mulmat-tune.cpp +++ b/examples/mulmat-tune/mulmat-tune.cpp @@ -170,8 +170,8 @@ int main(int argc, char **argv) { ftype = (enum ggml_ftype)v; } - if (ftype > GGML_FTYPE_MOSTLY_Q5_1) { - fprintf(stderr, "k_quants type %d is not implemented\n", ftype); + if (ftype == GGML_FTYPE_ALL_F32 || ftype == GGML_FTYPE_MOSTLY_F16) { + fprintf(stderr, "none quantized type %d is not supported\n", ftype); return 1; } } diff --git a/ggml-tune.c b/ggml-tune.c index 81b012766124c..20f3950693fcb 100644 --- a/ggml-tune.c +++ b/ggml-tune.c @@ -4,10 +4,11 @@ #include "ggml-tune.h" #include "ggml.h" -// MUL_MAT fine tunning for non-GPU-offloading cases. +#ifdef GGML_USE_K_QUANTS +#include "k_quants.h" +#endif -#define GGML_MULMAT_CACHE_LEN 16 -static struct mm_cache_element default_mm_cache[GGML_MULMAT_CACHE_LEN] = {0}; +// MUL_MAT fine tunning for non-GPU-offloading cases. #define FNV_OFFSET 14695981039346656037UL #define FNV_PRIME 1099511628211UL @@ -49,9 +50,8 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile( GGML_ASSERT(tune); // TODO: default_mm_cache is thread-unsafe. - struct mm_cache_element *mm_cache = default_mm_cache; int slot = ggml_mulmat_tune_cache_hash(M, N, K) % GGML_MULMAT_CACHE_LEN; - struct mm_cache_element *e = &mm_cache[slot]; + struct ggml_mulmat_tune_cache_ele *e = &tune->cache[slot]; struct ggml_mulmat_tune_time profiles_time[GGML_MAX_TASK_PROFILES] = {0}; @@ -183,7 +183,7 @@ bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune, enum ggml_type type = ggml_ftype_to_ggml_type(model->ftype); - GGML_ASSERT(GGML_MULMAT_N_SHAPES >= 6); + GGML_ASSERT(GGML_MULMAT_N_SHAPES == 4 || GGML_MULMAT_N_SHAPES == 6); tune->n_shapes = GGML_MULMAT_N_SHAPES; // Attention layers @@ -196,11 +196,26 @@ bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune, .N = n_ff, .K = n_embd, .src0_type = type, .src1_type = src1_type}; tune->shapes[3] = (struct ggml_mulmat_tune_shape){ .N = n_vocab, .K = n_embd, .src0_type = type, .src1_type = src1_type}; - // RoPE - tune->shapes[4] = (struct ggml_mulmat_tune_shape){ - .N = n_rot, .K = 0, .src0_type = rot_src0_type, .src1_type = src1_type}; - tune->shapes[5] = (struct ggml_mulmat_tune_shape){ - .N = 0, .K = n_rot, .src0_type = rot_src0_type, .src1_type = src1_type}; + + tune->n_shapes = GGML_MULMAT_N_SHAPES; + + if (GGML_MULMAT_N_SHAPES == 6) { + // RoPE. 
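        // (Editorial note, hedged.) Elsewhere in this commit ggml-tune.h drops
        // GGML_MULMAT_N_SHAPES from 6 to 4, so this RoPE branch is compiled out
        // for now; bumping the define back to 6 re-enables it.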
+ // - very small comparing to previous, almost no need to bench. + // - an Illegal instruction exception on Github (mac-latest-cmake). + // - CL sometimes throws error on localhost. + // So temporarily disabled as a workaround. + tune->shapes[4] = + (struct ggml_mulmat_tune_shape){.N = n_rot, + .K = 0, + .src0_type = rot_src0_type, + .src1_type = src1_type}; + tune->shapes[5] = + (struct ggml_mulmat_tune_shape){.N = 0, + .K = n_rot, + .src0_type = rot_src0_type, + .src1_type = src1_type}; + } for (int i = 0; i < tune->n_shapes; i++) { struct ggml_mulmat_tune_shape *shape = &tune->shapes[i]; @@ -225,6 +240,7 @@ bool ggml_mulmat_tune_init(struct ggml_mulmat_tune *tune, shape->m_num = params->m_num; shape->arr_m = malloc(shape->m_num * sizeof(int)); + GGML_ASSERT(shape->arr_m); for (int j = 0; j < shape->m_num; j++) { shape->arr_m[j] = 1 << j; } @@ -245,11 +261,13 @@ void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune) { GGML_ASSERT(shape); // arr_m and items can be NULL only when testing. - if (shape->arr_m) { - free(shape->arr_m); - } - if (shape->items) { - free(shape->items); + if (shape->m_num > 0) { + if (shape->arr_m) { + free(shape->arr_m); + } + if (shape->items) { + free(shape->items); + } } } } @@ -325,17 +343,19 @@ ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune, }; struct ggml_task_profile builtin_profiles[GGML_MAX_TASK_PROFILES]; + memset(builtin_profiles, 0, sizeof(builtin_profiles)); + int n_profiles = ggml_get_task_profiles(&node, builtin_profiles); if (n_profiles != shape->n_profiles) { - snprintf(errbuf, errbuf_len - 1, "task profiles mismatch"); + snprintf(errbuf, errbuf_len - 1, "task profiles mismatch(n_profiles)"); return false; } // TODO: profiles order is relevant, too strict. size_t sz = sizeof(struct ggml_task_profile) * n_profiles; if (memcmp(builtin_profiles, shape->profiles, sz) != 0) { - snprintf(errbuf, errbuf_len - 1, "task profiles mismatch"); + snprintf(errbuf, errbuf_len - 1, "task profiles mismatch(profiles)"); printf("=== built-in profiles:\n"); ggml_mulmat_tune_write_profiles(stderr, builtin_profiles, @@ -364,6 +384,9 @@ bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune, } bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { + GGML_ASSERT(tune); + memset(tune, 0, sizeof(struct ggml_mulmat_tune)); + int rc = fscanf(fp, "%d", &tune->version); if (rc <= 0) { return false; @@ -661,27 +684,42 @@ static struct ggml_tensor *ggml_mulmat_new_tensor(int M, int N, int K, ggml_new_tensor_2d(*ctx, GGML_TYPE_F32, (int64_t)K, (int64_t)N); ggml_set_f32(src0_f32, 0.1f); + const float *src_data = (const float *)src0_f32->data; + int nxk = N * K; + switch (src0_type) { case GGML_TYPE_Q4_0: - ggml_quantize_q4_0((const float *)src0_f32->data, src0->data, N * K, - K, hist); + ggml_quantize_q4_0(src_data, src0->data, nxk, K, hist); break; case GGML_TYPE_Q4_1: - ggml_quantize_q4_1((const float *)src0_f32->data, src0->data, N * K, - K, hist); + ggml_quantize_q4_1(src_data, src0->data, nxk, K, hist); break; case GGML_TYPE_Q5_0: - ggml_quantize_q5_0((const float *)src0_f32->data, src0->data, N * K, - K, hist); + ggml_quantize_q5_0(src_data, src0->data, nxk, K, hist); break; case GGML_TYPE_Q5_1: - ggml_quantize_q5_1((const float *)src0_f32->data, src0->data, N * K, - K, hist); + ggml_quantize_q5_1(src_data, src0->data, nxk, K, hist); break; case GGML_TYPE_Q8_0: - ggml_quantize_q8_0((const float *)src0_f32->data, src0->data, N * K, - K, hist); + ggml_quantize_q8_0(src_data, src0->data, nxk, K, hist); + break; 
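    // (Editorial usage sketch, hedged; `f32_src` and `q_dst` are placeholder
    // names, not from the patch.) All of these quantizers share the same
    // (src, dst, n_elements, row_length, hist) signature, so benching another
    // type is one more case of the same shape, for example:
    //
    //     int64_t hist[16] = {0};
    //     ggml_quantize_q4_0(f32_src, q_dst, N * K, K, hist);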
+#ifdef GGML_USE_K_QUANTS + case GGML_TYPE_Q2_K: + ggml_quantize_q2_K(src_data, src0->data, nxk, K, hist); break; + case GGML_TYPE_Q3_K: + ggml_quantize_q3_K(src_data, src0->data, nxk, K, hist); + break; + case GGML_TYPE_Q4_K: + ggml_quantize_q4_K(src_data, src0->data, nxk, K, hist); + break; + case GGML_TYPE_Q5_K: + ggml_quantize_q5_K(src_data, src0->data, nxk, K, hist); + break; + case GGML_TYPE_Q6_K: + ggml_quantize_q6_K(src_data, src0->data, nxk, K, hist); + break; +#endif default: GGML_ASSERT(false); } diff --git a/ggml-tune.h b/ggml-tune.h index 04b25873c932f..b1246615503d3 100644 --- a/ggml-tune.h +++ b/ggml-tune.h @@ -11,7 +11,8 @@ extern "C" { #endif #define GGML_MULMAT_TUNE_VERSION 8 -#define GGML_MULMAT_N_SHAPES 6 +#define GGML_MULMAT_N_SHAPES 4 +#define GGML_MULMAT_CACHE_LEN 16 #define GGML_MULMAT_MAX_PASS 3 @@ -54,6 +55,14 @@ struct ggml_mulmat_tune_shape { struct ggml_mulmat_tune_m *items; }; + struct ggml_mulmat_tune_cache_ele { + int M; + int N; + int K; + const struct ggml_task_profile *profile; + int stages_time[3]; +}; + struct ggml_mulmat_tune { int version; @@ -66,6 +75,9 @@ struct ggml_mulmat_tune { struct ggml_mulmat_tune_shape shapes[GGML_MULMAT_N_SHAPES]; int n_threads; + + // Cache for time estimating. + struct ggml_mulmat_tune_cache_ele cache[GGML_MULMAT_CACHE_LEN]; }; struct ggml_mulmat_tune_time { @@ -74,14 +86,6 @@ struct ggml_mulmat_tune_time { int total_time; }; -struct mm_cache_element { - int M; - int N; - int K; - const struct ggml_task_profile *profile; - int stages_time[3]; -}; - // params for tune/bench. struct ggml_mulmat_tune_params { struct ggml_mulmat_tune_model model; diff --git a/tests/test-ggml-tune.c b/tests/test-ggml-tune.c index a8a2048621eca..5499fa6bf1d82 100644 --- a/tests/test-ggml-tune.c +++ b/tests/test-ggml-tune.c @@ -70,15 +70,16 @@ static int bench(void) { // NULL) error -30 at /Users/mqy/tools/AI/llama.cpp/ggml-opencl.cpp:838 enum ggml_ftype ftypes[] = { // GGML_FTYPE_ALL_F32, - GGML_FTYPE_MOSTLY_F16, + // GGML_FTYPE_MOSTLY_F16, GGML_FTYPE_MOSTLY_Q4_0, + GGML_FTYPE_MOSTLY_Q4_K, }; int n_ftypes = sizeof(ftypes) / sizeof(ftypes[0]); const int m_num = 4; - // Don't use n_threads larger than 2 because Github build hots has limited + // Don't use n_threads larger than 2 because Github build hosts has limited // resource quota. int threads_arr[] = {1, 2}; int thread_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]); @@ -124,7 +125,7 @@ ggml_task_profiles_mock_qxx_provider(struct ggml_tensor *node, } int estimate_time_non_zero_NK(void) { - printf("test-ggml-tune: %s\n", __func__); + printf("[test-ggml-tune] %s\n", __func__); struct test_data_t { int M; From 6b83a3e16fa3126d2c5e6667d2f396bde84a68b4 Mon Sep 17 00:00:00 2001 From: mqy Date: Fri, 16 Jun 2023 20:32:12 +0800 Subject: [PATCH 09/24] try make CL run w/o tunning, but -ngl stucks no output. 
had to add task runer and profile id, many changes, see the f codes --- examples/mulmat-tune/mulmat-tune.cpp | 27 +- ggml-opencl.cpp | 2 +- ggml-threading.c | 37 ++- ggml-threading.h | 22 +- ggml-tune.c | 70 ++++- ggml-tune.h | 13 +- ggml.c | 441 ++++++++++++++------------- ggml.h | 29 ++ tests/test-ggml-threading.c | 61 +++- tests/test-ggml-tune.c | 10 +- 10 files changed, 433 insertions(+), 279 deletions(-) diff --git a/examples/mulmat-tune/mulmat-tune.cpp b/examples/mulmat-tune/mulmat-tune.cpp index 55dd1927588de..da1d0a1c1fe7e 100644 --- a/examples/mulmat-tune/mulmat-tune.cpp +++ b/examples/mulmat-tune/mulmat-tune.cpp @@ -11,6 +11,10 @@ #define UNUSED(x) (void)(x) +// F16 has an pending Illegal Instruction error on macos-latest-cmake. +// So the workaround is to disable non-quantized ftypes. +// #define SUPPORT_NONE_Q_TYPE 1 + static void print_build_tips(void) { const char *a = "LLAMA_NO_ACCELERATE"; fprintf(stderr, "Tips on how to build with various backend vendors:\n\n"); @@ -62,11 +66,12 @@ static void usage(char *prog) { "--model MODEL 3B | 7B | 13B | 30B | 65B", " default 7B", "--ftype FTYPE ggml ftype:", +#ifdef SUPPORT_NONE_Q_TYPE " 0: all F32", " 1: mostly F16", +#endif " 2: mostly Q4_0", " 3: mostly Q4_1", - " 4: mostly Q4_1, some F16", " 7: mostly Q8_0", " 8: mostly Q5_0", " 9: mostly Q5_1", @@ -84,7 +89,7 @@ static void usage(char *prog) { " requires: between [1, 3]", "--n_threads NTH bench with this number of threads", " requires: between [1, 16]", - " default 1", + " default 4", "--file FILE data file to write", " default stdout", "-y always answer \"yes\" to all prompts", @@ -170,8 +175,22 @@ int main(int argc, char **argv) { ftype = (enum ggml_ftype)v; } +#ifndef SUPPORT_NONE_Q_TYPE if (ftype == GGML_FTYPE_ALL_F32 || ftype == GGML_FTYPE_MOSTLY_F16) { - fprintf(stderr, "none quantized type %d is not supported\n", ftype); + fprintf(stderr, "error: none quantized type %d is not supported\n", + ftype); + return 1; + } +#endif + + bool cond_1 = ftype >= GGML_FTYPE_MOSTLY_Q4_0 && + ftype <= GGML_FTYPE_MOSTLY_Q4_1; + bool cond_2 = + ftype >= GGML_FTYPE_MOSTLY_Q8_0 && ftype <= GGML_FTYPE_MOSTLY_Q6_K; + + if (!(cond_1 || cond_2)) { + fprintf(stderr, "error: type %d is not a known ggml ftype.\n", + ftype); return 1; } } @@ -223,7 +242,7 @@ int main(int argc, char **argv) { } } - int n_threads = 1; + int n_threads = 4; { if (arg_n_threads != NULL) { int v = atoi(arg_n_threads); diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index c9151a8e49561..2a1a04fcaccf1 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1628,7 +1628,7 @@ bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_ } void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) { - // GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst)); + GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst)); if (src0->type == GGML_TYPE_F32) { ggml_cl_mul_mat_f32(src0, src1, dst); diff --git a/ggml-threading.c b/ggml-threading.c index 6dd6d2817eff0..7ef763c0f81e1 100644 --- a/ggml-threading.c +++ b/ggml-threading.c @@ -170,7 +170,8 @@ struct ggml_compute_state_shared { atomic_bool wait_on_done; atomic_bool stop; - ggml_threading_task_runner *task_runner; + // Default task runner, can be overriden by node.task_profile.runner. 
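    // (Editorial sketch, hedged, mirroring the dispatch used later in this
    // file by the worker loop and by ggml_threading_compute_tensor().)
    //
    //     ggml_task_runner *runner = node->task_profile.runner
    //                                    ? node->task_profile.runner // per-profile override, e.g. CUDA/CL
    //                                    : shared->task_runner;      // default, normally ggml_compute_forward
    //     enum ggml_compute_error err = runner(&params, node);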
+ ggml_task_runner *task_runner; struct ggml_threading_context *ctx; }; @@ -391,8 +392,10 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) { } if (shared->n_tasks > 0 && state->has_work) { - enum ggml_compute_error err = - shared->task_runner(&state->params, state->node); + ggml_task_runner *runner = state->node->task_profile.runner + ? state->node->task_profile.runner + : shared->task_runner; + enum ggml_compute_error err = runner(&state->params, state->node); GGML_ASSERT(err == GGML_COMPUTE_OK); @@ -427,8 +430,13 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, size_t wsize) { GGML_ASSERT(ctx); GGML_ASSERT(node); - GGML_ASSERT(ctx->shared.task_runner); + + ggml_task_runner *runner = ctx->shared.task_runner; + if (node->task_profile.runner) { + runner = node->task_profile.runner; + } + struct ggml_compute_state_shared *state_shared = &ctx->shared; // This is the params for main thread. @@ -491,7 +499,7 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, params.wsize = wsize; params.wdata = wdata; - err = state_shared->task_runner(¶ms, node); + err = runner(¶ms, node); } // wait for tasks done. @@ -509,11 +517,21 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, if (err != GGML_COMPUTE_OK) { if (err == GGML_COMPUTE_FALLBACK) { + PRINT_DEBUG("[main] fallback from profile, id=%d\n", + node->task_profile.id); + GGML_ASSERT(node->task_profile.stages[1].backend > + GGML_TASK_BACKEND_CPU); + struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; int n = ggml_get_task_profiles(node, profiles); GGML_ASSERT(n > 0); + GGML_ASSERT(profiles[0].stages[1].backend == + GGML_TASK_BACKEND_CPU); + memcpy(&node->task_profile, &profiles[0], sizeof(struct ggml_task_profile)); + runner = ctx->shared.task_runner; + goto START; } return err; @@ -525,12 +543,13 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, struct ggml_threading_context * ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread_runner, - ggml_threading_task_runner *task_stage_runner, + ggml_task_runner *task_runner, enum ggml_threading_features features, int64_t stages_time[3]) { GGML_ASSERT(n_threads > 0); - GGML_ASSERT(thread_runner); - GGML_ASSERT(task_stage_runner); + if (thread_runner == NULL) { + thread_runner = ggml_threading_graph_compute_thread; + } size_t ctx_sz = sizeof(struct ggml_threading_context); struct ggml_threading_context *ctx = malloc(ctx_sz); @@ -545,7 +564,7 @@ ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread_runner, .wait_now = false, .wait_on_done = false, .stop = false, - .task_runner = task_stage_runner, + .task_runner = task_runner, .ctx = ctx, }; diff --git a/ggml-threading.h b/ggml-threading.h index f3214efc7cb7d..189fc2ed56a69 100644 --- a/ggml-threading.h +++ b/ggml-threading.h @@ -21,27 +21,21 @@ enum ggml_threading_features { GGML_THREADING_FEATURE_PERF = 1 << 1, }; -// Compute errors. -enum ggml_compute_error { - GGML_COMPUTE_OK = 0, - GGML_COMPUTE_FALLBACK = 1, -}; - -// The task runner to be called by main thread and workers. -typedef enum ggml_compute_error(ggml_threading_task_runner)( - struct ggml_compute_params *params, struct ggml_tensor *node); - // The thread runner to feed into OS threads. typedef ggml_thread_ret_t(ggml_threading_thread_runner)(void *data); // Init and start underlying workers if n_threads > 1. // -// features: optional for configure threading additional features. -// see `ggml_threading_feature`, default 0. 
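// (Editorial usage sketch, hedged; this mirrors how ggml-tune.c drives the
// API and is not itself part of this header.)
//
//     struct ggml_threading_context *ctx = ggml_threading_start(
//         n_threads, /*thread runner*/ NULL, ggml_compute_forward_wrapper,
//         GGML_THREADING_FEATURE_WAIT_ON_DONE, /*stages_time*/ NULL);
//     enum ggml_compute_error err =
//         ggml_threading_compute_tensor(ctx, node, wdata, wsize);
//     ggml_threading_stop(ctx);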
+// thread: optional OS thread runner, default value: +// `ggml_threading_graph_compute_thread`. +// +// features: optional for configure +// threading additional features. see `ggml_threading_feature`, default 0. +// // stages_time: optional for collecting per-stage wall clock time. struct ggml_threading_context * ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread, - ggml_threading_task_runner *task_stage_runner, + ggml_task_runner *task_runner, enum ggml_threading_features features, int64_t stages_time[3]); @@ -60,7 +54,7 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, // This is an experimental functionality for mulmat tune, as a thin wrapper. enum ggml_compute_error -ggml_compute_forward_wrapper(struct ggml_compute_params *params, +ggml_compute_forward_wrapper(const struct ggml_compute_params *params, struct ggml_tensor *tensor); #ifdef __cplusplus diff --git a/ggml-tune.c b/ggml-tune.c index 20f3950693fcb..aeb63e957da2d 100644 --- a/ggml-tune.c +++ b/ggml-tune.c @@ -44,9 +44,12 @@ ggml_mulmat_tune_task_backend_name(enum ggml_task_backend backend) { } } -const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile( - struct ggml_mulmat_tune *tune, int M, int N, int K, enum ggml_type src0_t, - enum ggml_type src1_t, int stages_time[3]) { +// NOTE: we can not use the profile from tune because the profiles do not +// contain fields such as runner, get_size. +int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M, + int N, int K, enum ggml_type src0_t, + enum ggml_type src1_t, + int stages_time[3]) { GGML_ASSERT(tune); // TODO: default_mm_cache is thread-unsafe. @@ -103,15 +106,15 @@ const struct ggml_task_profile *ggml_mulmat_tune_select_task_profile( names[i] = ggml_mulmat_tune_task_backend_name( prof->stages[i].backend); } - printf("\n[tune] M: %3d, N: %5d, K: %5d, backends of the " - "fastest profile: %s %s %s\n", - M, N, K, names[0], names[1], names[2]); + printf("\n[tune] M: %3d, N: %5d, K: %5d, profile id: %d, " + "backends: %s %s %s\n", + M, N, K, prof->id, names[0], names[1], names[2]); #endif } } } - return prof; + return prof->id; } void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model, @@ -264,10 +267,13 @@ void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune) { if (shape->m_num > 0) { if (shape->arr_m) { free(shape->arr_m); + shape->arr_m = NULL; } if (shape->items) { free(shape->items); + shape->items = NULL; } + shape->m_num = 0; } } } @@ -277,6 +283,11 @@ static bool ggml_mulmat_tune_write_profiles( int rc; for (int i = 0; i < n_profiles; i++) { const struct ggml_task_profile *profile = &profiles[i]; + rc = fprintf(fp, "%d ", profile->id); + if (rc <= 0) { + return false; + } + for (int j = 0; j < 3; j++) { const struct ggml_task_stage *ts = &profile->stages[j]; rc = fprintf(fp, "%2d %d %d", ts->backend, ts->parallel ? 
1 : 0, @@ -304,7 +315,6 @@ static bool ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune, const char *model, int ftype, int n_threads, char *errbuf, int errbuf_len) { - if (tune->version != GGML_MULMAT_TUNE_VERSION) { snprintf(errbuf, errbuf_len - 1, "version mismatch, built-in: %d, " @@ -348,14 +358,28 @@ ggml_mulmat_tune_validate_internal(const struct ggml_mulmat_tune *tune, int n_profiles = ggml_get_task_profiles(&node, builtin_profiles); if (n_profiles != shape->n_profiles) { - snprintf(errbuf, errbuf_len - 1, "task profiles mismatch(n_profiles)"); + snprintf(errbuf, errbuf_len - 1, + "task profiles mismatch (n_profiles)"); return false; } // TODO: profiles order is relevant, too strict. - size_t sz = sizeof(struct ggml_task_profile) * n_profiles; - if (memcmp(builtin_profiles, shape->profiles, sz) != 0) { - snprintf(errbuf, errbuf_len - 1, "task profiles mismatch(profiles)"); + // Only validate stages! + size_t sz = sizeof(struct ggml_task_stage) * 3; + bool matched = true; + for (int j = 0; j < n_profiles; j++) { + if (builtin_profiles[j].id != shape->profiles[j].id) { + return false; + } + if (memcmp(builtin_profiles[j].stages, shape->profiles[j].stages, + sz) != 0) { + matched = false; + break; + } + } + if (!matched) { + snprintf(errbuf, errbuf_len - 1, + "task profiles mismatch (profiles)"); printf("=== built-in profiles:\n"); ggml_mulmat_tune_write_profiles(stderr, builtin_profiles, @@ -426,6 +450,12 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { for (int ip = 0; ip < shape->n_profiles; ip++) { struct ggml_task_profile *profile = &shape->profiles[ip]; + + rc = fscanf(fp, "%d ", &profile->id); + if (rc <= 0) { + return false; + } + for (int j = 0; j < 3; j++) { struct ggml_task_stage *ts = &profile->stages[j]; int backend; @@ -777,6 +807,8 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, GGML_ASSERT(params); GGML_ASSERT(params->model.name); + memset(tune, 0, sizeof(struct ggml_mulmat_tune)); + enum ggml_task_backend backends[16]; int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends); if (n_backends < 2) { @@ -785,6 +817,15 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, return false; } + if (params->model.ftype >= GGML_FTYPE_MOSTLY_Q2_K && + params->model.ftype <= GGML_FTYPE_MOSTLY_Q6_K) { +#if defined(GGML_USE_CLBLAST) + printf("[tune] error: cl implementation does not support k_quants at " + "the time of writing this code, skip.\n"); + return false; +#endif + } + bool ok = ggml_mulmat_tune_init(tune, params, ggml_get_task_profiles); if (!ok) { return false; @@ -816,9 +857,8 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, int64_t t0 = ggml_time_ms(); struct ggml_threading_context *thrd_ctx = ggml_threading_start( - tune->n_threads, ggml_threading_graph_compute_thread, - ggml_compute_forward_wrapper, GGML_THREADING_FEATURE_WAIT_ON_DONE, - stages_time); + tune->n_threads, NULL, ggml_compute_forward_wrapper, + GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time); for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape]; diff --git a/ggml-tune.h b/ggml-tune.h index b1246615503d3..7955a50a977da 100644 --- a/ggml-tune.h +++ b/ggml-tune.h @@ -10,7 +10,7 @@ extern "C" { #endif -#define GGML_MULMAT_TUNE_VERSION 8 +#define GGML_MULMAT_TUNE_VERSION 9 #define GGML_MULMAT_N_SHAPES 4 #define GGML_MULMAT_CACHE_LEN 16 @@ -55,7 +55,7 @@ struct ggml_mulmat_tune_shape { struct ggml_mulmat_tune_m *items; }; - struct 
ggml_mulmat_tune_cache_ele { +struct ggml_mulmat_tune_cache_ele { int M; int N; int K; @@ -98,10 +98,11 @@ struct ggml_mulmat_tune_params { }; // NOTE: stages_time is filled if not null. -const struct ggml_task_profile * -ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M, - int N, int K, enum ggml_type src0_t, - enum ggml_type src1_t, int stages_time[3]); +// Return profile id. +int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M, + int N, int K, enum ggml_type src0_t, + enum ggml_type src1_t, + int stages_time[3]); bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune, const char *model_name, int ftype, diff --git a/ggml.c b/ggml.c index b734f1a0c4d46..43ec93a64e2b2 100644 --- a/ggml.c +++ b/ggml.c @@ -8500,19 +8500,14 @@ static void ggml_compute_forward_mul_f32( const int ith = params->ith; const int nth = params->nth; - enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; - if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { #ifdef GGML_USE_CLBLAST - if (src1->backend == GGML_BACKEND_GPU) { - if (ith == 0) { - ggml_cl_mul(src0, src1, dst); - } - return; + if (src1->backend == GGML_BACKEND_GPU) { + if (ith == 0) { + ggml_cl_mul(src0, src1, dst); } -#else - GGML_ASSERT(false); + return; + } #endif - }; const int64_t nr = ggml_nrows(src0); @@ -9938,7 +9933,7 @@ static void ggml_compute_forward_rms_norm_back( } } - +// CPU only static void ggml_compute_forward_mul_mat_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -10010,18 +10005,6 @@ static void ggml_compute_forward_mul_mat_f32( // compute by src0 rows enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; - - if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { -#if defined(GGML_USE_CLBLAST) - GGML_ASSERT(params->nth == 1); - GGML_ASSERT(params->type == GGML_TASK_COMPUTE); - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - return; -#else - GGML_ASSERT(false); -#endif - } - GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { @@ -10104,6 +10087,7 @@ static void ggml_compute_forward_mul_mat_f32( //} } +// CPU only. 
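// (Editorial sketch, hedged.) With the GPU branches moved out to the profile
// runners, the mul_mat kernels below only distinguish the two CPU backends
// recorded in the selected profile, roughly:
//
//     enum ggml_task_backend comp =
//         dst->task_profile.stages[GGML_TASK_COMPUTE].backend;
//     GGML_ASSERT(comp & GGML_TASK_BACKEND_CPU);
//     if (comp == GGML_TASK_BACKEND_CPU_BLAS) {
//         // BLAS gemm path
//     } else {
//         // plain CPU path, parallelized over src0 rows
//     }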
static void ggml_compute_forward_mul_mat_f16_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -10168,19 +10152,9 @@ static void ggml_compute_forward_mul_mat_f16_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows + enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend; enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; - if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { -#if defined(GGML_USE_CLBLAST) - GGML_ASSERT(params->type == GGML_TASK_COMPUTE); - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - return; -#else - GGML_ASSERT(false); -#endif - } - - enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend; GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { @@ -10304,6 +10278,7 @@ static void ggml_compute_forward_mul_mat_f16_f32( //} } +// CPU only static void ggml_compute_forward_mul_mat_q_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -10373,20 +10348,8 @@ static void ggml_compute_forward_mul_mat_q_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; - - if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { -#if defined(GGML_USE_CLBLAST) - GGML_ASSERT(params->nth == 1); - GGML_ASSERT(params->type == GGML_TASK_COMPUTE); - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - return; -#else - GGML_ASSERT(false); -#endif - } - enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend; + enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { @@ -14294,30 +14257,9 @@ static void ggml_compute_forward_cross_entropy_loss_back( ///////////////////////////////// -static enum ggml_compute_error ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { +static enum ggml_compute_error ggml_compute_forward(const struct ggml_compute_params * params, struct ggml_tensor * tensor) { GGML_ASSERT(params); - enum ggml_task_backend comp_backend = tensor->task_profile.stages[GGML_TASK_COMPUTE].backend; - - if (comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { -#if defined(GGML_USE_CUBLAS) - bool skip_cpu = ggml_cuda_compute_forward(params, tensor); - if (skip_cpu) { - return GGML_COMPUTE_OK; - } - GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU); - GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU); - return GGML_COMPUTE_FALLBACK; -#else - GGML_ASSERT(false); -#endif - } - - // if (tensor->task_profile.stages[params->type].backend > GGML_TASK_BACKEND_CPU) { - // printf("mulmat: test fallback\n"); - // return GGML_COMPUTE_FALLBACK; - // } - switch (tensor->op) { case GGML_OP_DUP: { @@ -14568,13 +14510,6 @@ static enum ggml_compute_error ggml_compute_forward(struct ggml_compute_params * return GGML_COMPUTE_OK; } -enum ggml_compute_error ggml_compute_forward_wrapper(struct ggml_compute_params *params, - struct ggml_tensor *tensor) { - // We call ggml_compute_forward because the CUDA mul_mat entry point - // was moved out of `ggml_compute_forward_mul_mat`. 
- return ggml_compute_forward(params, tensor); -} - //////////////////////////////////////////////////////////////////////////////// static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) { @@ -15524,12 +15459,67 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg // ---- task profiles ---- +// TODO: replace with ggml_compute_forward_cuda +// DO NOT check matrix size further. +#if defined(GGML_USE_CUBLAS) +static enum ggml_compute_error ggml_compute_forward_cuda( + const struct ggml_compute_params * params, + struct ggml_tensor * tensor) { + GGML_ASSERT (ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)); + if (ggml_cuda_compute_forward(params, tensor)) { + return GGML_COMPUTE_OK; + } + GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU); + return GGML_COMPUTE_FALLBACK; +} +#endif + +// TODO: replace with ggml_cl_mul_mat. +// DO NOT check matrix size further. +#if defined(GGML_USE_CLBLAST) +static enum ggml_compute_error ggml_compute_forward_cl( + const struct ggml_compute_params * params, + struct ggml_tensor * tensor) { + switch (tensor->op) { + case GGML_OP_MUL_MAT: + GGML_ASSERT(ggml_cl_can_mul_mat(tensor->src0, tensor->src1, tensor)); + ggml_cl_mul_mat(tensor->src0, tensor->src1, tensor, params->wdata, params->wsize); + return GGML_COMPUTE_OK; + default: + break; + } + + GGML_ASSERT(false); +} + +static int ggml_compute_forward_get_wsize_cl (struct ggml_tensor *tensor) { + switch (tensor->op) { + case GGML_OP_MUL_MAT: + return ggml_cl_mul_mat_get_wsize(tensor->src0, tensor->src1, tensor); + default: + break; + } + return -1; +} +#endif + +// The wrapper for external mulmat tune tool. +enum ggml_compute_error ggml_compute_forward_wrapper(const struct ggml_compute_params *params, + struct ggml_tensor *tensor) { + // We call ggml_compute_forward because the CUDA mul_mat entry point + // was moved out of `ggml_compute_forward_mul_mat`. + return ggml_compute_forward(params, tensor); +} + // Implement `ggml_task_profiles_provider`. // Fill `profiles` for the `node` and return number of profiles. // // NOTE: the node may be incompleted from testing or tunning, so please assert // everything used here. -inline int ggml_get_task_profiles( +// +// TODO: configure cuda for none mul_mat nodes. +int ggml_get_task_profiles( struct ggml_tensor *node, struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]) { GGML_ASSERT(node); @@ -15595,6 +15585,8 @@ inline int ggml_get_task_profiles( } break; case GGML_OP_MUL_MAT: case GGML_OP_OUT_PROD: { + // CPU only profiles. + // CUDA/CL: see end of function. 
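+        // The plain CPU profile is always filled first so that profiles[0]
+        // can serve as the universal fallback; a CPU BLAS profile is appended
+        // when BLAS support is compiled in.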
GGML_ASSERT(node->src0); GGML_ASSERT(node->src1); @@ -15614,16 +15606,6 @@ inline int ggml_get_task_profiles( p[i].stages[1].wait = true; i++; #endif - -#if defined(GGML_USE_CUBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; - p[i].stages[1].wait = true; - i++; -#elif defined(GGML_USE_CLBLAST) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; - p[i].stages[1].wait = true; - i++; -#endif } else if (src0_t == GGML_TYPE_F16) { p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; @@ -15635,16 +15617,6 @@ inline int ggml_get_task_profiles( p[i].stages[1].wait = true; i++; #endif - -#if defined(GGML_USE_CUBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; - p[i].stages[1].wait = true; - i++; -#elif defined(GGML_USE_CLBLAST) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; - p[i].stages[1].wait = true; - i++; -#endif } else if (ggml_is_quantized(src0_t)) { p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; @@ -15658,16 +15630,6 @@ inline int ggml_get_task_profiles( p[i].stages[1].wait = true; i++; #endif - -#if defined(GGML_USE_CUBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; - p[i].stages[1].wait = true; - i++; -#elif defined(GGML_USE_CLBLAST) - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; - p[i].stages[1].wait = true; - i++; -#endif } n_profiles = i; } break; @@ -15757,7 +15719,43 @@ inline int ggml_get_task_profiles( GGML_ASSERT(false); } +#if defined(GGML_USE_CUBLAS) + switch (node->op) { + case GGML_OP_ADD: + case GGML_OP_MUL: + case GGML_OP_SILU: + case GGML_OP_RMS_NORM: + case GGML_OP_MUL_MAT: + case GGML_OP_RESHAPE: + case GGML_OP_ROPE: { + int i = n_profiles; + p[i].runner = ggml_compute_forward_cuda; + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; + p[i].stages[1].wait = true; + ++n_profiles; + } break; + default: { + } break; + } +#elif defined(GGML_USE_CLBLAST) + switch (node->op) { + case GGML_OP_MUL_MAT: { + int i = n_profiles; + p[i].runner = ggml_compute_forward_cl; + p[i].get_wsize = ggml_compute_forward_get_wsize_cl; + p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; + p[i].stages[1].wait = true; + ++n_profiles; + } break; + default: { + } break; + } +#endif + GGML_ASSERT(n_profiles > 0 && n_profiles <= GGML_MAX_TASK_PROFILES); + for (int i = 0; i < n_profiles; i++) { + profiles[i].id = i + 1; + } return n_profiles; } @@ -15769,7 +15767,7 @@ static const struct ggml_task_profile *ggml_mulmat_get_task_profile( GGML_ASSERT(node); GGML_ASSERT(node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_OUT_PROD); GGML_ASSERT(profiles); - GGML_ASSERT(n_profiles >= 2); + GGML_ASSERT(n_profiles > 0); enum ggml_type src0_t = node->src0->type; enum ggml_type src1_t = node->src1->type; @@ -15777,9 +15775,9 @@ static const struct ggml_task_profile *ggml_mulmat_get_task_profile( // Type and memory layout requirements for computing mul_mat with BLAS. 
bool cond_match = (src0_t == GGML_TYPE_F32 || src0_t == GGML_TYPE_F16 || ggml_is_quantized(src0_t)) && - src1_t == GGML_TYPE_F32 && node->type == GGML_TYPE_F32 && - ggml_is_contiguous(node->src0) && - ggml_is_contiguous(node->src1); + src1_t == GGML_TYPE_F32 && node->type == GGML_TYPE_F32 && + ggml_is_contiguous(node->src0) && + ggml_is_contiguous(node->src1); int M = (int)node->ne[1]; int N = (int)node->ne[0]; @@ -15790,10 +15788,14 @@ static const struct ggml_task_profile *ggml_mulmat_get_task_profile( if (cond_match) { #if defined(GGML_USE_TUNE) if (tune != NULL) { - prof = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, + GGML_ASSERT(n_profiles >= 2); + int id = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, src1_t, stages_time_us); - if (prof != NULL) { - return prof; + for (int i = 0; i < n_profiles; i++) { + if (profiles[i].id == id) { + prof = &profiles[i]; + return prof; + } } } #else @@ -15841,11 +15843,101 @@ static const struct ggml_task_profile *ggml_mulmat_get_task_profile( return prof; } +void ggml_graph_compute_set_tensor_task_proile(struct ggml_tensor *node, + struct ggml_cgraph *cgraph) { + // Pre-specified. + for (int i = 0; i < 3; i++) { + if (node->task_profile.stages[i].backend > 0) { + return; + } + } + + struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; + int n_profiles = ggml_get_task_profiles(node, profiles); + + const struct ggml_task_profile *profile = NULL; + + // GPU offloading. A special case of pre-specified task_profile. + if (node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT) { + if (node->op != GGML_OP_MUL_MAT && node->op != GGML_OP_OUT_PROD) { + enum ggml_task_backend be; + if (ggml_cpu_has_cublas()) { + be = GGML_TASK_BACKEND_GPU_CUDA; + } else if (ggml_cpu_has_clblast()) { + be = GGML_TASK_BACKEND_GPU_CL; + } else { + GGML_ASSERT(false); + } + + for (int j = 0; j < n_profiles; j++) { + if (profiles[j].stages[1].backend == be) { + profile = &profiles[j]; + break; + } + } + GGML_ASSERT(profile); + GGML_ASSERT(!cgraph->tune); + + memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); + return; + } + } + + // mul_mat: GGML_OP_MUL_MAT and GGML_OP_OUT_PROD. + if (node->op == GGML_OP_MUL_MAT) { +#if defined(GGML_USE_TUNE) + GGML_ASSERT(node->backend == GGML_BACKEND_CPU); + + int stages_time_us[3]; + profile = ggml_mulmat_get_task_profile(node, profiles, n_profiles, + cgraph->tune, stages_time_us); + GGML_ASSERT(profile); + + memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); + + if (cgraph->tune) { + memcpy(&node->task_profile, profile, + sizeof(struct ggml_task_profile)); + + // Do not wait if the estimated execution time is too small + // (e.g. less than 0.1 ms) + // TODO: need bench actual wait/notify time, see + // ggml-threading.c + for (int j = 0; j < 3; j++) { + if (node->task_profile.stages[j].wait) { + if (stages_time_us[j] < 100) { + node->task_profile.stages[j].wait = false; + } + } + } + } + return; +#else + profile = ggml_mulmat_get_task_profile(node, profiles, n_profiles, NULL, + NULL); + GGML_ASSERT(profile); + memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); + return; +#endif + } else if (node->op == GGML_OP_OUT_PROD) { // FIXME: is this correct? + profile = ggml_mulmat_get_task_profile(node, profiles, n_profiles, NULL, + NULL); + GGML_ASSERT(profile); + memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); + return; + } + + // default. 
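+    // profiles[0] is always the plain CPU profile returned by
+    // ggml_get_task_profiles(); the assert below guards that invariant.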
+ profile = &profiles[0]; + GGML_ASSERT(profile->stages[1].backend == GGML_TASK_BACKEND_CPU); + memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); +} + void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { int n_threads = cgraph->n_threads; struct ggml_threading_context *thrd_ctx = ggml_threading_start( - n_threads, ggml_threading_graph_compute_thread, ggml_compute_forward, + n_threads, NULL, ggml_compute_forward, GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL); // initialize tasks + work buffer @@ -15854,107 +15946,34 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) size_t work_size = 0; - struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; - // thread scheduling for the different operations for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; - if (node->op == GGML_OP_NONE || node->op == GGML_OP_CONT) { - continue; - } - - int n_profiles = ggml_get_task_profiles(node, profiles); - const struct ggml_task_profile *profile = NULL; + GGML_ASSERT (node->op != GGML_OP_NONE); - // Adapt node->backend: assume GPU at COMPUTE stage. - if (node->backend == GGML_BACKEND_GPU || - node->backend == GGML_BACKEND_GPU_SPLIT) { - enum ggml_task_backend be; - if (ggml_cpu_has_cublas()) { - be = GGML_TASK_BACKEND_GPU_CUDA; - } else if (ggml_cpu_has_clblast()) { - be = GGML_TASK_BACKEND_GPU_CL; - } else { - GGML_ASSERT(false); - } + struct ggml_task_stage *stages = node->task_profile.stages; - for (int j = 0; j < n_profiles; j++) { - if (profiles[j].stages[1].backend == be) { - profile = &profiles[j]; - break; - } - } - GGML_ASSERT(profile); - } else { - GGML_ASSERT(node->backend == GGML_BACKEND_CPU); - } + ggml_graph_compute_set_tensor_task_proile(node, cgraph); - bool profile_copied = false; + // + // Allocate temp buffer `wdata` for CPU. + // NOTE: GPU MAY fallback to CPU, so we have to cover all possible cases. + // - if (node->op == GGML_OP_MUL_MAT) { -#if defined(GGML_USE_TUNE) - int stages_time_us[3]; - profile = ggml_mulmat_get_task_profile( - node, profiles, n_profiles, cgraph->tune, stages_time_us); - GGML_ASSERT(profile); - - if (cgraph->tune) { - memcpy(&node->task_profile, profile, - sizeof(struct ggml_task_profile)); - profile_copied = true; - - // Do not wait if the estimated execution time is too small - // (e.g. less than 0.1 ms) - // TODO: need bench actual wait/notify time, see - // ggml-threading.c - for (int j = 0; j< 3; j++) { - if (node->task_profile.stages[j].wait) { - if (stages_time_us[j] < 100) { - node->task_profile.stages[j].wait = false; - } - } - } + if (node->task_profile.get_wsize) { + int sz = node->task_profile.get_wsize(node); + if (sz >= 0) { + work_size = MAX(work_size, (size_t)sz); + continue; } -#else - profile = ggml_mulmat_get_task_profile(node, profiles, - n_profiles, NULL, NULL); - GGML_ASSERT(profile); -#endif - } else if (node->op == GGML_OP_OUT_PROD) { // FIXME: is is right? - profile = ggml_mulmat_get_task_profile(node, profiles, - n_profiles, NULL, NULL); - GGML_ASSERT(profile); - } else { - profile = &profiles[0]; - GGML_ASSERT(profile->stages[1].backend == - GGML_TASK_BACKEND_CPU); - } - - if (!profile_copied) { - memcpy(&node->task_profile, profile, - sizeof(struct ggml_task_profile)); } - struct ggml_task_stage *stages = node->task_profile.stages; - - // Workrounnd to set node->backend. 
- for (int j = 0; j < 3; j++) { - if (node->backend == GGML_BACKEND_CPU && - (stages[j].backend & GGML_TASK_BACKEND_GPU)) { - if (ggml_cpu_has_cublas() || ggml_cpu_has_clblast()) { - node->backend = GGML_BACKEND_GPU; - } else { - GGML_ASSERT(false); - } - } - } + //printf("op: %d, comp backend: %d\n", node->op, node->task_profile.stages[1].backend); // compute stage n_tasks. int n_tasks = stages[1].parallel ? n_threads : 1; - // Allocate temp buffer `wdata` for CPU. - // NOTE: GPU MAY fallback to CPU, so we have to cover all possible cases. switch (node->op) { case GGML_OP_CPY: case GGML_OP_DUP: @@ -16012,20 +16031,12 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { } break; case GGML_OP_MUL_MAT: - case GGML_OP_OUT_PROD: // FIXME: is is right? + case GGML_OP_OUT_PROD: // FIXME: is this correct? { size_t cur = 0; enum ggml_task_backend comp_backend = stages[GGML_TASK_COMPUTE].backend; GGML_ASSERT(comp_backend != GGML_TASK_BACKEND_NONE); - - if (comp_backend == GGML_TASK_BACKEND_GPU_CL) { -#if defined(GGML_USE_CLBLAST) - GGML_ASSERT(ggml_cl_can_mul_mat(node->src0, node->src1, node)); - cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); -#else - GGML_ASSERT(false); -#endif - } else if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { + if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { GGML_ASSERT(ggml_cpu_has_cpublas()); GGML_ASSERT(node->src1->type == GGML_TYPE_F32); @@ -16039,11 +16050,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } else { GGML_ASSERT(false); } - } else if (comp_backend == GGML_TASK_BACKEND_CPU || comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { - if (comp_backend == GGML_TASK_BACKEND_GPU_CUDA) { - GGML_ASSERT(ggml_cpu_has_cublas()); - } - + } else { // CPU or GPU fallback GGML_ASSERT(node->src1->type == GGML_TYPE_F32); if (node->src0->type == GGML_TYPE_F32) { @@ -16056,8 +16063,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } else { GGML_ASSERT(false); } - } else { - GGML_ASSERT(false); } work_size = MAX(work_size, cur); diff --git a/ggml.h b/ggml.h index 5ab78c4a011b5..d4d5d3521c74f 100644 --- a/ggml.h +++ b/ggml.h @@ -390,11 +390,40 @@ extern "C" { bool wait; }; + struct ggml_tensor; + struct ggml_compute_params; + + // Compute errors. + enum ggml_compute_error { + GGML_COMPUTE_OK = 0, + GGML_COMPUTE_FALLBACK = 1, + }; + + // The task runner to be called by main thread and workers. + typedef enum ggml_compute_error(ggml_task_runner)( + const struct ggml_compute_params *params, + struct ggml_tensor *node); + + // Get wsize for node computing. + // When return -1: should be explained as `fallback to CPU`, caller MUST + // determine how much memory to reserve for this node. + typedef int (ggml_task_get_wsize)(struct ggml_tensor *tensor); + // config for computing a tensor. struct ggml_task_profile { + // profile id, start from 1. + int id; + // index 0: INIT, 1: COMPUTE, 2: FINALIZE struct ggml_task_stage stages[3]; + // Optional task runner, overrides threading's task runner. + ggml_task_runner *runner; + + // Optional function to return required wsize for wdata. + ggml_task_get_wsize *get_wsize; + + // Optional flag for development. // MUST be used only in testing codes. 
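+    // (test-ggml-threading.c, for example, stores a per-node busy-loop count
+    // in dev_flags[1])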
uint8_t dev_flags[4]; }; diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c index 90d53e4cdea2a..2079fe144b194 100644 --- a/tests/test-ggml-threading.c +++ b/tests/test-ggml-threading.c @@ -42,7 +42,8 @@ static const int n_repeat = 10; static int work_done_arr[MAX_N_THREADS]; static enum ggml_compute_error -mock_task_runner(struct ggml_compute_params *params, struct ggml_tensor *node) { +mock_task_runner(const struct ggml_compute_params *params, + struct ggml_tensor *node) { int64_t loops = node->task_profile.dev_flags[1] * 1000 * 1000; if (node->task_profile.stages[params->type].parallel) { loops /= params->nth; @@ -79,9 +80,8 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) { int t0 = (int)ggml_time_us(); - struct ggml_threading_context *ctx = - ggml_threading_start(n_threads, ggml_threading_graph_compute_thread, - mock_task_runner, features, /*stages_time*/ NULL); + struct ggml_threading_context *ctx = ggml_threading_start( + n_threads, NULL, mock_task_runner, features, /*stages_time*/ NULL); int t1 = (int)ggml_time_us(); @@ -141,7 +141,7 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) { } static enum ggml_compute_error -mock_task_runner_fallback(struct ggml_compute_params *params, +mock_task_runner_fallback(const struct ggml_compute_params *params, struct ggml_tensor *node) { UNUSED(params); if (node->backend == GGML_BACKEND_GPU) { @@ -158,7 +158,7 @@ mock_task_runner_fallback(struct ggml_compute_params *params, // thus it is not parallelled. int test_fallback(struct ggml_tensor *node) { struct ggml_threading_context *ctx = ggml_threading_start( - 1, ggml_threading_graph_compute_thread, mock_task_runner_fallback, + 1, NULL, mock_task_runner_fallback, /*features*/ GGML_THREADING_FEATURE_NONE, /*stages_time*/ NULL); enum ggml_compute_error err = @@ -177,6 +177,38 @@ int test_fallback(struct ggml_tensor *node) { return 0; } +static enum ggml_compute_error +customized_node_runner(const struct ggml_compute_params *params, + struct ggml_tensor *node) { + UNUSED(params); + // Reset runner thus caller will know it was called. + node->task_profile.runner = NULL; + return GGML_COMPUTE_OK; +} + +// Test when node->task_profile.runner is not NULL. +int test_customized_node_runner(struct ggml_tensor *node) { + struct ggml_threading_context *ctx = ggml_threading_start( + 1, NULL, mock_task_runner, + /*features*/ GGML_THREADING_FEATURE_NONE, /*stages_time*/ NULL); + + node->task_profile.runner = customized_node_runner; + enum ggml_compute_error err = + ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0); + + ggml_threading_stop(ctx); + if (err != GGML_COMPUTE_OK) { + // should not happen. + abort(); + } + + if (node->task_profile.runner != NULL) { + return 2; + } + + return 0; +} + int main(void) { ggml_time_init(); @@ -367,7 +399,10 @@ int main(void) { } } + // fallback { + printf("[test-ggml-threading] test fallback ...\n"); + ++n_tests; // required by getting task profiles. 
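+        // (ggml_get_task_profiles() asserts on op, src0 and src1, so the mock
+        // node needs them to be set)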
@@ -382,9 +417,21 @@ int main(void) { node.src1 = &src1; node.backend = GGML_BACKEND_GPU; + stages[1].backend = GGML_TASK_BACKEND_GPU; if (test_fallback(&node) == 0) { ++n_passed; - printf("\n[test-ggml-threading] test fallback: ok\n\n"); + printf("[test-ggml-threading] test fallback: ok\n\n"); + } + } + + // customized node runner + { + printf("[test-ggml-threading] test customized node runner ...\n"); + ++n_tests; + + if (test_customized_node_runner(&node) == 0) { + ++n_passed; + printf("[test-ggml-threading] test customized node runner: ok\n\n"); } } diff --git a/tests/test-ggml-tune.c b/tests/test-ggml-tune.c index 5499fa6bf1d82..4339881e52c2d 100644 --- a/tests/test-ggml-tune.c +++ b/tests/test-ggml-tune.c @@ -72,7 +72,9 @@ static int bench(void) { // GGML_FTYPE_ALL_F32, // GGML_FTYPE_MOSTLY_F16, GGML_FTYPE_MOSTLY_Q4_0, +#if defined(GGML_USE_K_QUANTS) GGML_FTYPE_MOSTLY_Q4_K, +#endif }; int n_ftypes = sizeof(ftypes) / sizeof(ftypes[0]); @@ -132,16 +134,14 @@ int estimate_time_non_zero_NK(void) { int time[3]; // 3 profiles. }; - struct ggml_mulmat_tune tune = { - .version = 1, - .ftype = GGML_FTYPE_MOSTLY_Q4_0, - }; + struct ggml_mulmat_tune tune; + enum ggml_ftype ftype = GGML_FTYPE_MOSTLY_Q4_0; const int m_num = 2; const int n_threads = 1; // useless. struct ggml_mulmat_tune_params params; - init_params(¶ms, tune.ftype, m_num, n_threads); + init_params(¶ms, ftype, m_num, n_threads); ggml_mulmat_tune_init(&tune, ¶ms, ggml_task_profiles_mock_qxx_provider); From 06b00827a04a304cee1b2f5ca540a5a55223b9bb Mon Sep 17 00:00:00 2001 From: mqy Date: Sun, 18 Jun 2023 12:29:16 +0800 Subject: [PATCH 10/24] bulk refactoring task profile and related to run CL GPU offloading. * removed ggml_task_backend, infavour of ggml_task_profile.runner and newly added id and name. * extracted mul_mat blas codes into ggml_compute_forward_mul_mat_blas, thus align with CUDA/CL a bit more and make it easier to fix profile and run tune. * rewrote task profile and update/add some cuda/cl codes, finnaly made CL GPU offloading work. * misc minor fix/update to tune, the data format was changed. --- examples/mulmat-tune/README.md | 45 +- examples/mulmat-tune/mulmat-tune.cpp | 5 + ggml-cuda.cu | 25 +- ggml-cuda.h | 2 +- ggml-opencl.cpp | 17 +- ggml-opencl.h | 2 +- ggml-threading.c | 17 +- ggml-threading.h | 10 +- ggml-tune.c | 165 ++--- ggml-tune.h | 9 +- ggml.c | 962 +++++++++++++-------------- ggml.h | 30 +- llama.cpp | 73 +- tests/test-ggml-threading.c | 45 +- tests/test-ggml-tune.c | 21 +- 15 files changed, 664 insertions(+), 764 deletions(-) diff --git a/examples/mulmat-tune/README.md b/examples/mulmat-tune/README.md index df023757a85b1..4e521211d968e 100644 --- a/examples/mulmat-tune/README.md +++ b/examples/mulmat-tune/README.md @@ -214,26 +214,19 @@ The following results are generated with Accelerate compiled. **Example** ``` -5 3B 2 6 1 - -3200 3200 2 0 3 10 -16 0 0 0 16 1 0 1 0 0 0 0 -16 1 0 2 17 0 1 0 0 0 0 0 - 0 0 0 0 34 0 1 0 0 0 0 0 - 1 1 793 0 9103 2102 0 0 6014 0 - 2 2 1591 0 8034 2305 0 0 30982 0 - 4 4 2236 0 6476 2484 0 0 31388 0 - 8 7 4161 0 6623 2389 0 0 29204 0 - 16 15 8339 0 6434 2752 0 0 34303 0 - 32 32 16919 0 6915 3651 0 0 42511 0 - 64 200 34270 0 6574 4528 0 0 68212 0 - 128 188 69400 0 6325 6839 0 0 74437 0 - 256 303 134597 0 6168 11544 0 0 110180 0 - 512 687 279685 0 6337 29712 0 0 159728 0 - -3200 8640 2 0 2 10 - - ... +[tune] done, elapsed time: 0 seconds. 
+10 xB 12 4 2 + +1024 1024 12 0 2 4 +100 110 000 1 CPU +110 101 000 2 BLAS + 1 11 309 0 1234 90 0 + 2 23 654 0 1359 215 0 + 4 44 1283 0 1362 421 0 + 8 85 2341 0 1357 347 0 + +1024 2048 12 0 2 4 +... ``` @@ -249,17 +242,17 @@ shape+ # head version: 1 model: "3B" | "7B" | "13B" | "30B" | "65B" -ggml_ftype: 0 - 4, 7 - 14 +ggml_ftype: 0 - 3, 7 - 14 n_shapes: number of shapes n_threads: number of threads -shape := N K m_num n_profiles -task_conf_profile+ +shape := N K src0_ggml_type src1_ggml_type n_profiles m_num +task_profile+ bench_item+ -task_conf_profile: stage_conf(init) stage_conf(compute) stage_conf(finalize) -stage_conf: backend parallel wait -backend: 0 (NONE) | 16 (CPU) | 17 (CPU_BLAS) | 32 (GPU) | 33 (GPU_CUDA) | 34 (GPU_CL) +task_profile: stage_conf(init) stage_conf(compute) stage_conf(finalize) id name +stage_conf(bitmap): valid parallel wait +valid: 0 (false) | 1 (true) parallel: 0 (false) | 1 (true) wait: 0 (false) | 1 (true) diff --git a/examples/mulmat-tune/mulmat-tune.cpp b/examples/mulmat-tune/mulmat-tune.cpp index da1d0a1c1fe7e..ba1cc0f8a1fac 100644 --- a/examples/mulmat-tune/mulmat-tune.cpp +++ b/examples/mulmat-tune/mulmat-tune.cpp @@ -111,6 +111,11 @@ static void usage(char *prog) { } int main(int argc, char **argv) { + if (!ggml_cpu_has_blas()) { + fprintf(stderr, "error: this program is not built with BLAS.\n"); + return 1; + } + if (argc == 2) { if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0) { usage(argv[0]); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index cf52109bce96e..5a4c7725a92de 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2207,17 +2207,12 @@ void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rms_norm, true, true); } -bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - +// NOTE: don't check matrix size, otherwise mul_mat tune will fail to run. 
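+// (the tune bench runs every profile across small M values, which the old
+// "ne0 >= 32 && ne1 >= 32 && ne10 >= 32" guard would have rejected)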
+static bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { // TODO: find the optimal values for these if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32 && - dst->type == GGML_TYPE_F32 && - (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { + dst->type == GGML_TYPE_F32) { return true; } @@ -2539,11 +2534,17 @@ void ggml_cuda_free_scratch() { g_scratch_buffer = nullptr; } -bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){ - ggml_cuda_func_t func; - const bool any_on_device = tensor->backend == GGML_BACKEND_GPU +bool ggml_cuda_is_gpu_offloading(struct ggml_tensor * tensor) { + GGML_ASSERT(tensor); + GGML_ASSERT(tensor->src0); + return tensor->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU); +} + +bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){ + ggml_cuda_func_t func; + const bool any_on_device = is_gpu_offloading(tensor); switch (tensor->op) { case GGML_OP_ADD: @@ -2571,7 +2572,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ func = ggml_cuda_rms_norm; break; case GGML_OP_MUL_MAT: - if (!any_on_device/* && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)*/) { + if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)) { return false; } func = ggml_cuda_mul_mat; diff --git a/ggml-cuda.h b/ggml-cuda.h index d32b4484267ab..75ea94392ce6b 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -16,7 +16,7 @@ void ggml_init_cublas(void); void ggml_cuda_set_tensor_split(const float * tensor_split); void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +bool ggml_cuda_is_gpu_offloading(const struct ggml_tensor * src0); size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 2a1a04fcaccf1..28098793df296 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1589,18 +1589,17 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * } } +bool ggml_cl_is_gpu_offloading(struct ggml_tensor * tensor) { + GGML_ASSERT(tensor); + return (tensor->src0 && tensor->src0->backend == GGML_BACKEND_GPU) || + (tensor->src1 && tensor->src1->backend == GGML_BACKEND_GPU); +} -bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - - // TODO: find the optimal values for these +// NOTE: don't check matrix size, otherwise mul_mat tune will fail to run. 
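+// (same reasoning as the equivalent change in ggml-cuda.cu)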
+static bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32 && - dst->type == GGML_TYPE_F32 /*&& - ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)*/) { + dst->type == GGML_TYPE_F32) { return true; } diff --git a/ggml-opencl.h b/ggml-opencl.h index a92b445c9d766..1de12f55a5c95 100644 --- a/ggml-opencl.h +++ b/ggml-opencl.h @@ -9,7 +9,7 @@ extern "C" { void ggml_cl_init(void); void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +bool ggml_cl_is_gpu_offloading(struct ggml_tensor * tensor); size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); diff --git a/ggml-threading.c b/ggml-threading.c index 7ef763c0f81e1..dada9f3fe8466 100644 --- a/ggml-threading.c +++ b/ggml-threading.c @@ -376,7 +376,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) { struct ggml_compute_state_shared *shared = state->shared; GGML_ASSERT(shared); - GGML_ASSERT(shared->task_runner); + //GGML_ASSERT(shared->task_runner); shared->n_ready++; @@ -397,7 +397,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) { : shared->task_runner; enum ggml_compute_error err = runner(&state->params, state->node); - GGML_ASSERT(err == GGML_COMPUTE_OK); + GGML_ASSERT(err == GGML_COMPUTE_OK || err == GGML_COMPUTE_FALLBACK); ggml_spin_lock(&shared->spin); @@ -430,7 +430,7 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, size_t wsize) { GGML_ASSERT(ctx); GGML_ASSERT(node); - GGML_ASSERT(ctx->shared.task_runner); + // GGML_ASSERT(ctx->shared.task_runner); ggml_task_runner *runner = ctx->shared.task_runner; if (node->task_profile.runner) { @@ -448,7 +448,7 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, memset(¶ms, 0, sizeof(struct ggml_compute_params)); for (int type = GGML_TASK_INIT; type <= GGML_TASK_FINALIZE; type++) { - if (node->task_profile.stages[type].backend == GGML_TASK_BACKEND_NONE) { + if (!node->task_profile.stages[type].valid) { continue; } @@ -519,18 +519,17 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, if (err == GGML_COMPUTE_FALLBACK) { PRINT_DEBUG("[main] fallback from profile, id=%d\n", node->task_profile.id); - GGML_ASSERT(node->task_profile.stages[1].backend > - GGML_TASK_BACKEND_CPU); + GGML_ASSERT(node->task_profile.id > 1); struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; int n = ggml_get_task_profiles(node, profiles); GGML_ASSERT(n > 0); - GGML_ASSERT(profiles[0].stages[1].backend == - GGML_TASK_BACKEND_CPU); + GGML_ASSERT(profiles[0].id == 1); memcpy(&node->task_profile, &profiles[0], - sizeof(struct ggml_task_profile)); + sizeof(struct ggml_task_profile)); runner = ctx->shared.task_runner; + GGML_ASSERT(runner); goto START; } diff --git a/ggml-threading.h b/ggml-threading.h index 189fc2ed56a69..81192450c6728 100644 --- a/ggml-threading.h +++ b/ggml-threading.h @@ -29,7 +29,9 @@ typedef ggml_thread_ret_t(ggml_threading_thread_runner)(void *data); // thread: optional OS thread runner, default value: // 
`ggml_threading_graph_compute_thread`. // -// features: optional for configure +// task_runner: default task runner, nullable wheen tensor.runner is not NULL. +// Overridden by tensor.runner. +// features: configure threading behaviour, optional. // threading additional features. see `ggml_threading_feature`, default 0. // // stages_time: optional for collecting per-stage wall clock time. @@ -51,12 +53,6 @@ enum ggml_compute_error ggml_threading_compute_tensor(struct ggml_threading_context *ctx, struct ggml_tensor *node, void *wdata, size_t wsize); - -// This is an experimental functionality for mulmat tune, as a thin wrapper. -enum ggml_compute_error -ggml_compute_forward_wrapper(const struct ggml_compute_params *params, - struct ggml_tensor *tensor); - #ifdef __cplusplus } #endif diff --git a/ggml-tune.c b/ggml-tune.c index aeb63e957da2d..444269ae4f55a 100644 --- a/ggml-tune.c +++ b/ggml-tune.c @@ -24,26 +24,7 @@ static uint64_t ggml_mulmat_tune_cache_hash(int M, int N, int K) { return hash; } -static const char * -ggml_mulmat_tune_task_backend_name(enum ggml_task_backend backend) { - switch (backend) { - case GGML_TASK_BACKEND_NONE: - return ""; - case GGML_TASK_BACKEND_CPU: - return "CPU"; - case GGML_TASK_BACKEND_CPU_BLAS: - return "BLAS"; - case GGML_TASK_BACKEND_GPU: - return "GPU"; - case GGML_TASK_BACKEND_GPU_CUDA: - return "CUDA"; - case GGML_TASK_BACKEND_GPU_CL: - return "CL"; - default: - GGML_ASSERT(false); - } -} - +// Return profile id, -1 when failed (such as unable to match shape). // NOTE: we can not use the profile from tune because the profiles do not // contain fields such as runner, get_size. int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M, @@ -101,20 +82,15 @@ int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M, e->K = K; #ifndef GGML_TUNE_NDEBUG - const char *names[3]; - for (int i = 0; i < 3; i++) { - names[i] = ggml_mulmat_tune_task_backend_name( - prof->stages[i].backend); - } printf("\n[tune] M: %3d, N: %5d, K: %5d, profile id: %d, " "backends: %s %s %s\n", - M, N, K, prof->id, names[0], names[1], names[2]); + M, N, K, prof->id, prof->name); #endif } } } - return prof->id; + return prof ? prof->id : -1; } void ggml_mulmat_tune_model_init(struct ggml_mulmat_tune_model *model, @@ -283,25 +259,24 @@ static bool ggml_mulmat_tune_write_profiles( int rc; for (int i = 0; i < n_profiles; i++) { const struct ggml_task_profile *profile = &profiles[i]; - rc = fprintf(fp, "%d ", profile->id); - if (rc <= 0) { - return false; - } - for (int j = 0; j < 3; j++) { const struct ggml_task_stage *ts = &profile->stages[j]; - rc = fprintf(fp, "%2d %d %d", ts->backend, ts->parallel ? 1 : 0, - ts->wait ? 1 : 0); + rc = fprintf(fp, "%1d%1d%1d", ts->valid ? 1 : 0, + ts->parallel ? 1 : 0, ts->wait ? 
1 : 0); if (rc <= 0) { return false; } if (j < 2) { - rc = fprintf(fp, " "); + rc = fprintf(fp, " "); if (rc <= 0) { return false; } } } + rc = fprintf(fp, " %d %s", profile->id, profile->name); + if (rc <= 0) { + return false; + } rc = fprintf(fp, "\n"); if (rc <= 0) { return false; @@ -407,24 +382,24 @@ bool ggml_mulmat_tune_validate(const struct ggml_mulmat_tune *tune, return ok; } -bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { +int ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { GGML_ASSERT(tune); memset(tune, 0, sizeof(struct ggml_mulmat_tune)); int rc = fscanf(fp, "%d", &tune->version); if (rc <= 0) { - return false; + return 1; } if (tune->version != GGML_MULMAT_TUNE_VERSION) { fprintf(stderr, "[tune] version mismatch, run bench again\n"); - return false; + return 2; } rc = fscanf(fp, "%s %d %d %d", tune->model, (int *)&tune->ftype, &tune->n_shapes, &tune->n_threads); if (rc <= 0) { - return false; + return 3; } for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { @@ -434,7 +409,7 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { (int *)&shape->src0_type, (int *)&shape->src1_type, &shape->n_profiles, &shape->m_num); if (rc <= 0) { - return false; + return 4; } { @@ -451,24 +426,24 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { for (int ip = 0; ip < shape->n_profiles; ip++) { struct ggml_task_profile *profile = &shape->profiles[ip]; - rc = fscanf(fp, "%d ", &profile->id); - if (rc <= 0) { - return false; - } - for (int j = 0; j < 3; j++) { struct ggml_task_stage *ts = &profile->stages[j]; - int backend; + int valid; int parallel; int wait; - rc = fscanf(fp, "%d %d %d", &backend, ¶llel, &wait); + rc = fscanf(fp, " %1d%1d%1d", &valid, ¶llel, &wait); if (rc <= 0) { - return false; + return 5; } - ts->backend = (enum ggml_task_backend)backend; + ts->valid = valid ? true : false; ts->parallel = parallel ? true : false; ts->wait = wait ? 
true : false; } + + rc = fscanf(fp, "%d %s", &profile->id, profile->name); + if (rc <= 0) { + return 6; + } } for (int i_m = 0; i_m < shape->m_num; i_m++) { @@ -477,7 +452,7 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { if (ip == 0) { rc = fscanf(fp, "%d", &M); if (rc <= 0) { - return false; + return 7; } } struct ggml_mulmat_tune_m *item = @@ -486,13 +461,13 @@ bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) { rc = fscanf(fp, "%d %d %d", &item->stages_time[0], &item->stages_time[1], &item->stages_time[2]); if (rc <= 0) { - return false; + return 8; } } } } - return true; + return 0; } bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, @@ -535,7 +510,7 @@ bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, const struct ggml_task_profile *profile = &shape->profiles[ip]; for (int k = 0; k < 3; k++) { - if (profile->stages[k].backend != GGML_TASK_BACKEND_NONE) { + if (profile->stages[k].valid) { rc = fprintf(fp, "%9d", item->stages_time[k]); if (rc <= 0) { return false; @@ -562,8 +537,6 @@ const struct ggml_mulmat_tune_shape * ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, const int N, const int K, enum ggml_type src0_type, enum ggml_type src1_type) { - GGML_ASSERT(N > 0 && K > 0); - for (int i = 0; i < tune->n_shapes; i++) { const struct ggml_mulmat_tune_shape *s = &tune->shapes[i]; if (s->src0_type != src0_type || s->src1_type != src1_type) { @@ -574,13 +547,17 @@ ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, const int N, if (s->N == N && s->K == K) { return s; } - } else if (s->N > 0 && s->K == 0) { - if (s->N == N) { - return s; - } - } else if (s->N == 0 && s->K > 0) { - if (s->K == K) { - return s; + } + + if (GGML_MULMAT_N_SHAPES == 6) { + if (s->N > 0 && s->K == 0) { + if (s->N == N) { + return s; + } + } else if (s->N == 0 && s->K > 0) { + if (s->K == K) { + return s; + } } } } @@ -639,7 +616,7 @@ void ggml_mulmat_tune_estimate_time( for (int i_stage = 0; i_stage < 3; i_stage++) { const struct ggml_task_stage *stage = &profile->stages[i_stage]; - if (stage->backend == GGML_TASK_BACKEND_NONE) { + if (!stage->valid) { continue; } @@ -784,23 +761,6 @@ static size_t ggml_mulmat_allocate_wdata(int N, int K, char **wdata) { return sz; } -int ggml_mulmat_tune_get_builtin_task_backends( - enum ggml_task_backend *backends) { - int i = 0; - backends[i++] = GGML_TASK_BACKEND_CPU; - - if (ggml_cpu_has_cpublas()) { - backends[i++] = GGML_TASK_BACKEND_CPU_BLAS; - } - - if (ggml_cpu_has_cublas()) { - backends[i++] = GGML_TASK_BACKEND_GPU_CUDA; - } else if (ggml_cpu_has_clblast()) { - backends[i++] = GGML_TASK_BACKEND_GPU_CL; - } - return i; -} - bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, struct ggml_mulmat_tune_params *params) { GGML_ASSERT(tune); @@ -809,23 +769,6 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, memset(tune, 0, sizeof(struct ggml_mulmat_tune)); - enum ggml_task_backend backends[16]; - int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends); - if (n_backends < 2) { - fprintf(stderr, - "[tune] error: this program was not built with BLAS.\n"); - return false; - } - - if (params->model.ftype >= GGML_FTYPE_MOSTLY_Q2_K && - params->model.ftype <= GGML_FTYPE_MOSTLY_Q6_K) { -#if defined(GGML_USE_CLBLAST) - printf("[tune] error: cl implementation does not support k_quants at " - "the time of writing this code, skip.\n"); - return false; -#endif - } - bool ok = ggml_mulmat_tune_init(tune, params, ggml_get_task_profiles); 
if (!ok) { return false; @@ -835,12 +778,13 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, char buf[128] = {0}; int offset = 0; - for (int i = 0; i < n_backends; i++) { + for (int i = 0; i < tune->shapes[0].n_profiles; i++) { if (i > 0) { buf[offset++] = ','; buf[offset++] = ' '; } - const char *name = ggml_mulmat_tune_task_backend_name(backends[i]); + const char *name = tune->shapes[0].profiles[i].name; + GGML_ASSERT(name != NULL && strcmp(name, "") != 0); size_t len = strlen(name); memcpy(&buf[offset], name, len); offset += (int)len; @@ -848,17 +792,17 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, fprintf(stdout, "[tune] model: %s, ggml ftype: %d, " - "n_pass: %d, n_threads: %d, n_shapes: %d, backends: %s\n", + "n_pass: %d, n_shapes: %d, n_threads: %d, profiles: %s\n", params->model.name, params->model.ftype, params->n_pass, - params->n_threads, tune->n_shapes, buf); + tune->n_shapes, params->n_threads, buf); } int64_t stages_time[3]; int64_t t0 = ggml_time_ms(); - struct ggml_threading_context *thrd_ctx = ggml_threading_start( - tune->n_threads, NULL, ggml_compute_forward_wrapper, - GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time); + struct ggml_threading_context *thrd_ctx = + ggml_threading_start(tune->n_threads, NULL, NULL, + GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time); for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape]; @@ -896,6 +840,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, for (int ip = 0; ip < shape->n_profiles; ip++) { const struct ggml_task_profile *profile = &shape->profiles[ip]; + // GGML_ASSERT(profile->runner); memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); @@ -911,9 +856,15 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, stages_time[j] = 0; } - enum ggml_compute_error err = ggml_threading_compute_tensor( - thrd_ctx, node, wdata, wsize); - GGML_ASSERT(err == GGML_COMPUTE_OK); + ggml_threading_compute_tensor(thrd_ctx, node, wdata, wsize); + + if (memcmp(profile, &node->task_profile, + sizeof(struct ggml_task_profile)) != 0) { + printf("[tune] error: task profile changed, tensor op: " + "%d, original id: %d, current id: %d\n", + node->op, profile->id, node->task_profile.id); + exit(1); + } for (int i = 0; i < 3; i++) { int v = (int)stages_time[i]; diff --git a/ggml-tune.h b/ggml-tune.h index 7955a50a977da..addcd34dbbd62 100644 --- a/ggml-tune.h +++ b/ggml-tune.h @@ -10,7 +10,7 @@ extern "C" { #endif -#define GGML_MULMAT_TUNE_VERSION 9 +#define GGML_MULMAT_TUNE_VERSION 10 #define GGML_MULMAT_N_SHAPES 4 #define GGML_MULMAT_CACHE_LEN 16 @@ -119,7 +119,7 @@ void ggml_mulmat_tune_free(struct ggml_mulmat_tune *tune); bool ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, FILE *fp); -bool ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp); +int ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp); const struct ggml_mulmat_tune_shape * ggml_mulmat_tune_get_shape(const struct ggml_mulmat_tune *tune, int N, int K, @@ -129,11 +129,6 @@ void ggml_mulmat_tune_estimate_time(const struct ggml_mulmat_tune_shape *shape, int M, struct ggml_mulmat_tune_time *profile_time); -const char *ggml_task_backend_name(enum ggml_task_backend backend); - -int ggml_mulmat_tune_get_builtin_task_backends( - enum ggml_task_backend *backends); - bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, struct ggml_mulmat_tune_params *params); diff --git a/ggml.c b/ggml.c index 
43ec93a64e2b2..62750b20bd127 100644 --- a/ggml.c +++ b/ggml.c @@ -8500,15 +8500,6 @@ static void ggml_compute_forward_mul_f32( const int ith = params->ith; const int nth = params->nth; -#ifdef GGML_USE_CLBLAST - if (src1->backend == GGML_BACKEND_GPU) { - if (ith == 0) { - ggml_cl_mul(src0, src1, dst); - } - return; - } -#endif - const int64_t nr = ggml_nrows(src0); const int64_t ne00 = src0->ne[0]; @@ -9933,6 +9924,168 @@ static void ggml_compute_forward_rms_norm_back( } } +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +static void ggml_compute_forward_mul_mat_blas( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + struct ggml_tensor * src0 = dst->src0; + struct ggml_tensor * src1 = dst->src1; + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + + const int nb00 = src0->nb[0]; + const int nb01 = src0->nb[1]; + const int nb02 = src0->nb[2]; + const int nb03 = src0->nb[3]; + + const int nb10 = src1->nb[0]; + // const int nb11 = src1->nb[1]; + const int nb12 = src1->nb[2]; + const int nb13 = src1->nb[3]; + + const int nb0 = dst->nb[0]; + const int nb1 = dst->nb[1]; + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + GGML_ASSERT(ne02 == ne12); + GGML_ASSERT(ne03 == ne13); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); + + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne02); + GGML_ASSERT(ne3 == ne03); + + const int ith = params->ith; + const int nth = params->nth; + + if (src0->type == GGML_TYPE_F32) { + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(params->nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); + const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + ne11, ne01, ne10, + 1.0f, y, ne10, + x, ne00, + 0.0f, d, ne01); + } + } + return; + } else if (src0->type == GGML_TYPE_F16) { + // TODO: we don't support permuted src0 + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(params->nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + float * const wdata = params->wdata; + { + size_t id = 0; + for (int64_t i01 = 0; i01 < ne01; ++i01) { + for (int64_t i00 = 0; i00 < ne00; ++i00) { + wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00)); + } + } + + assert(id*sizeof(float) <= params->wsize); + } + + const float * x = wdata; + const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + + // zT = y * xT + 
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + ne11, ne01, ne10, + 1.0f, y, ne10, + x, ne00, + 0.0f, d, ne01); + } + } + return; + } else if (ggml_is_quantized(src0->type)) { + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[src0->type]); + GGML_ASSERT(src0->data); + GGML_ASSERT(params->wdata); + + float * const wdata = params->wdata; + dequantize_row_q_t const dequantize_row_q = quantize_fns[src0->type].dequantize_row_q; + + if (params->type == GGML_TASK_INIT) { + // rows per thread + const int dr = (ne01 + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + int ir1 = MIN(ir0 + dr, ne01); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + char * data0_offset = (char *) src0->data + i03*nb03 + i02*nb02; + float * wdata_offset = wdata + i03*ne03 + i02*ne02; + for (int64_t i = ir0; i < ir1; ++i) { + dequantize_row_q(data0_offset + i*nb01, wdata_offset + i*ne00, ne00); + } + } + } + return; + } + + GGML_ASSERT(nth == 1); + GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const float * x = wdata; + const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + // zT = y * xT + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + ne11, ne01, ne10, + 1.0f, y, ne10, + x, ne00, + 0.0f, d, ne01); + } + } + return; + } else { + GGML_ASSERT(false); + } +} +#endif + // CPU only static void ggml_compute_forward_mul_mat_f32( const struct ggml_compute_params * params, @@ -9947,9 +10100,6 @@ static void ggml_compute_forward_mul_mat_f32( const int64_t ne02 = src0->ne[2]; const int64_t ne03 = src0->ne[3]; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - const int64_t ne10 = src1->ne[0]; -#endif const int64_t ne11 = src1->ne[1]; #ifndef NDEBUG const int64_t ne12 = src1->ne[2]; @@ -10004,37 +10154,7 @@ static void ggml_compute_forward_mul_mat_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; - GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); - - if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - GGML_ASSERT(params->nth == 1); - GGML_ASSERT(params->type == GGML_TASK_COMPUTE); - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); - } - } - //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); - - return; -#else - GGML_ASSERT(false); -#endif - } - GGML_ASSERT(params->type == GGML_TASK_COMPUTE); - GGML_ASSERT(comp_backend == GGML_TASK_BACKEND_CPU); // parallelize by src0 rows using ggml_vec_dot_f32 @@ -10152,57 +10272,6 @@ static void ggml_compute_forward_mul_mat_f16_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend; - enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; - - 
GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); - - if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - GGML_ASSERT(nb10 == sizeof(float)); - GGML_ASSERT(params->nth == 1); - GGML_ASSERT(params->type == GGML_TASK_COMPUTE); - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - float * const wdata = params->wdata; - { - size_t id = 0; - for (int64_t i01 = 0; i01 < ne01; ++i01) { - for (int64_t i00 = 0; i00 < ne00; ++i00) { - wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00)); - } - } - - assert(id*sizeof(float) <= params->wsize); - } - - const float * x = wdata; - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - // zT = y * xT - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); - } - } - - /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/ - - return; -#else - GGML_ASSERT(false); -#endif - } - - GGML_ASSERT(params->type == GGML_TASK_INIT || params->type == GGML_TASK_COMPUTE); - GGML_ASSERT(init_backend == GGML_TASK_BACKEND_CPU); - GGML_ASSERT(comp_backend == GGML_TASK_BACKEND_CPU); - if (params->type == GGML_TASK_INIT) { ggml_fp16_t * const wdata = params->wdata; @@ -10348,68 +10417,6 @@ static void ggml_compute_forward_mul_mat_q_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - enum ggml_task_backend init_backend = dst->task_profile.stages[GGML_TASK_INIT].backend; - enum ggml_task_backend comp_backend = dst->task_profile.stages[GGML_TASK_COMPUTE].backend; - GGML_ASSERT(comp_backend & GGML_TASK_BACKEND_CPU); - - if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - GGML_ASSERT (init_backend == GGML_TASK_BACKEND_CPU); - GGML_ASSERT(params->type == GGML_TASK_INIT || params->type == GGML_TASK_COMPUTE); - GGML_ASSERT(src0->data); - GGML_ASSERT(params->wdata); - - float * const wdata = params->wdata; - dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; - - if (params->type == GGML_TASK_INIT) { - // rows per thread - const int dr = (ne01 + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - int ir1 = MIN(ir0 + dr, ne01); - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - char * data0_offset = (char *) src0->data + i03*nb03 + i02*nb02; - float * wdata_offset = wdata + i03*ne03 + i02*ne02; - for (int64_t i = ir0; i < ir1; ++i) { - dequantize_row_q(data0_offset + i*nb01, wdata_offset + i*ne00, ne00); - } - } - } - return; - } - - GGML_ASSERT(nth == 1); - GGML_ASSERT(params->type == GGML_TASK_COMPUTE); - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - // zT = y * xT - const float * x = wdata; - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, - ne11, ne01, ne10, - 1.0f, y, ne10, - x, ne00, - 0.0f, d, ne01); - } - } - - return; -#else - GGML_ASSERT(false); -#endif - } - - GGML_ASSERT(params->type == GGML_TASK_INIT || params->type == GGML_TASK_COMPUTE); - GGML_ASSERT(init_backend == GGML_TASK_BACKEND_CPU); - GGML_ASSERT(comp_backend == 
GGML_TASK_BACKEND_CPU); - if (params->type == GGML_TASK_INIT) { GGML_ASSERT(params->nth == 1); @@ -14257,6 +14264,7 @@ static void ggml_compute_forward_cross_entropy_loss_back( ///////////////////////////////// +// CPU only: no BLAS. static enum ggml_compute_error ggml_compute_forward(const struct ggml_compute_params * params, struct ggml_tensor * tensor) { GGML_ASSERT(params); @@ -15459,96 +15467,163 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg // ---- task profiles ---- -// TODO: replace with ggml_compute_forward_cuda +// Check the type and memeory layout for mul_mat on blas(CPU BLAS) +static bool ggml_mul_mat_check_type_mem(struct ggml_tensor *tensor) { + enum ggml_type src0_t = tensor->src0->type; + enum ggml_type src1_t = tensor->src1->type; + + // This is the minimal requirement to run mulmat with BLAS. + // Don't check matrix size because that would break tuning. + return (src0_t == GGML_TYPE_F32 || src0_t == GGML_TYPE_F16 || + ggml_is_quantized(src0_t)) && + src1_t == GGML_TYPE_F32 && tensor->type == GGML_TYPE_F32 && + ggml_is_contiguous(tensor->src0) && ggml_is_contiguous(tensor->src1); +} + // DO NOT check matrix size further. #if defined(GGML_USE_CUBLAS) -static enum ggml_compute_error ggml_compute_forward_cuda( - const struct ggml_compute_params * params, - struct ggml_tensor * tensor) { - GGML_ASSERT (ggml_cuda_can_mul_mat(tensor->src0, tensor->src1, tensor)); +// Implements ggml_task_runner. +static enum ggml_compute_error +ggml_compute_forward_cuda(const struct ggml_compute_params *params, + struct ggml_tensor *tensor) { + if (tensor->op == GGML_OP_MUL_MAT) { + GGML_ASSERT(ggml_mul_mat_check_type_mem(tensor)); + } + if (ggml_cuda_compute_forward(params, tensor)) { return GGML_COMPUTE_OK; } + GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU); - GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src1 == NULL || + tensor->src1->backend == GGML_BACKEND_CPU); + return GGML_COMPUTE_FALLBACK; } -#endif +#endif // GGML_USE_CUBLAS -// TODO: replace with ggml_cl_mul_mat. -// DO NOT check matrix size further. #if defined(GGML_USE_CLBLAST) -static enum ggml_compute_error ggml_compute_forward_cl( - const struct ggml_compute_params * params, - struct ggml_tensor * tensor) { +// Implements ggml_task_runner. +static enum ggml_compute_error +ggml_compute_forward_cl(const struct ggml_compute_params *params, + struct ggml_tensor *tensor) { switch (tensor->op) { - case GGML_OP_MUL_MAT: - GGML_ASSERT(ggml_cl_can_mul_mat(tensor->src0, tensor->src1, tensor)); - ggml_cl_mul_mat(tensor->src0, tensor->src1, tensor, params->wdata, params->wsize); - return GGML_COMPUTE_OK; - default: - break; + case GGML_OP_MUL: { + if (tensor->src1 && ggml_cl_is_gpu_offloading(tensor)) { + if (params->ith == 0) { + ggml_cl_mul(tensor->src0, tensor->src1, tensor); + return GGML_COMPUTE_OK; + } + } + } break; + case GGML_OP_MUL_MAT: { + GGML_ASSERT(ggml_mul_mat_check_type_mem(tensor)); + ggml_cl_mul_mat(tensor->src0, tensor->src1, tensor, params->wdata, + params->wsize); + return GGML_COMPUTE_OK; + } break; + default: { + } break; } - GGML_ASSERT(false); + return GGML_COMPUTE_FALLBACK; } -static int ggml_compute_forward_get_wsize_cl (struct ggml_tensor *tensor) { - switch (tensor->op) { - case GGML_OP_MUL_MAT: - return ggml_cl_mul_mat_get_wsize(tensor->src0, tensor->src1, tensor); - default: - break; - } - return -1; +// Implements ggml_task_wsize_getter. 
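+// A negative return value means "no GPU scratch estimate here": the graph
+// compute loop then falls back to its CPU-based wsize calculation for this node.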
+static int ggml_compute_forward_cl_get_wsize(struct ggml_tensor *tensor) { + switch (tensor->op) { + case GGML_OP_MUL_MAT: + return ggml_cl_mul_mat_get_wsize(tensor->src0, tensor->src1, tensor); + default: + break; + } + return -1; } -#endif +#endif // GGML_USE_CLBLAST + +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) +// Implements ggml_task_runner. +static enum ggml_compute_error +ggml_compute_forward_blas(const struct ggml_compute_params *params, + struct ggml_tensor *tensor) { + switch (tensor->op) { + case GGML_OP_MUL_MAT: { + GGML_ASSERT(ggml_mul_mat_check_type_mem(tensor)); + ggml_compute_forward_mul_mat_blas(params, tensor); + return GGML_COMPUTE_OK; + } break; + default: { + } break; + } -// The wrapper for external mulmat tune tool. -enum ggml_compute_error ggml_compute_forward_wrapper(const struct ggml_compute_params *params, - struct ggml_tensor *tensor) { - // We call ggml_compute_forward because the CUDA mul_mat entry point - // was moved out of `ggml_compute_forward_mul_mat`. - return ggml_compute_forward(params, tensor); + return GGML_COMPUTE_FALLBACK; } +// Implements ggml_task_wsize_getter. +static int ggml_compute_forward_blas_get_wsize(struct ggml_tensor *tensor) { + switch (tensor->op) { + case GGML_OP_MUL_MAT: { + GGML_ASSERT(tensor->src1->type == GGML_TYPE_F32); + enum ggml_type src0_t = tensor->src0->type; + + if (src0_t == GGML_TYPE_F16) { + return GGML_TYPE_SIZE[GGML_TYPE_F32] * + (tensor->src0->ne[0] * tensor->src0->ne[1]); + } else if (src0_t == GGML_TYPE_F32) { + return 0; + } else if (ggml_is_quantized(src0_t)) { + return GGML_TYPE_SIZE[GGML_TYPE_F32] * + (tensor->src0->ne[0] * tensor->src0->ne[1]); + } else { + GGML_ASSERT(false); + } + } break; + default: + break; + } + return -1; +} +#endif // GGML_USE_ACCELERATE | GGML_USE_OPENBLAS + // Implement `ggml_task_profiles_provider`. -// Fill `profiles` for the `node` and return number of profiles. +// Fill `profiles` for the `tensor` and return number of profiles. // -// NOTE: the node may be incompleted from testing or tunning, so please assert +// NOTE: the tensor may be incompleted from testing or tunning, so please assert // everything used here. // -// TODO: configure cuda for none mul_mat nodes. +// First profile is always CPU, followed by BLAS, CUDA/CL. 
int ggml_get_task_profiles( - struct ggml_tensor *node, + struct ggml_tensor *tensor, struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]) { - GGML_ASSERT(node); - GGML_ASSERT(node->op >= 0); + + GGML_ASSERT(tensor); + GGML_ASSERT(tensor->op >= 0); GGML_ASSERT(profiles); memset(profiles, 0, sizeof(struct ggml_task_profile) * GGML_MAX_TASK_PROFILES); struct ggml_task_profile *p = profiles; - int n_profiles = 0; - switch (node->op) { + int n_profiles = 1; + strcpy(p[0].name, "CPU"); + p[0].runner = ggml_compute_forward; + // p[0].wsize_getter = ...; + + switch (tensor->op) { case GGML_OP_CPY: case GGML_OP_DUP: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + p[0].stages[1].valid = true; } break; case GGML_OP_ADD: case GGML_OP_ADD1: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_ACC: { - p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[0].valid = true; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_SUB: case GGML_OP_DIV: @@ -15565,13 +15640,11 @@ int ggml_get_task_profiles( case GGML_OP_NEG: case GGML_OP_STEP: case GGML_OP_RELU: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + p[0].stages[1].valid = true; } break; case GGML_OP_MUL: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_GELU: case GGML_OP_SILU: @@ -15579,69 +15652,32 @@ int ggml_get_task_profiles( case GGML_OP_NORM: case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM_BACK: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_MUL_MAT: - case GGML_OP_OUT_PROD: { - // CPU only profiles. - // CUDA/CL: see end of function. - GGML_ASSERT(node->src0); - GGML_ASSERT(node->src1); - - enum ggml_type src0_t = node->src0->type; - enum ggml_type src1_t = node->src1->type; - - GGML_ASSERT(src1_t == GGML_TYPE_F32); - - int i = 0; + case GGML_OP_OUT_PROD: { // FIXME: is this correct? 
+ enum ggml_type src0_t = tensor->src0->type; if (src0_t == GGML_TYPE_F32) { - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].parallel = true; - i++; - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; - p[i].stages[1].wait = true; - i++; -#endif + p[0].stages[1].valid = true; + p[0].stages[1].parallel = true; } else if (src0_t == GGML_TYPE_F16) { - p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].parallel = true; - i++; - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; - p[i].stages[1].wait = true; - i++; -#endif + p[0].stages[0].valid = true; + p[0].stages[1].valid = true; + p[0].stages[1].parallel = true; } else if (ggml_is_quantized(src0_t)) { - p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[1].parallel = true; - i++; - -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - p[i].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[i].stages[0].parallel = true; - p[i].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; - p[i].stages[1].wait = true; - i++; -#endif + p[0].stages[0].valid = true; + p[0].stages[1].valid = true; + p[0].stages[1].parallel = true; } - n_profiles = i; } break; case GGML_OP_SCALE: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_SET: { - p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + p[0].stages[0].valid = true; + p[0].stages[1].valid = true; } break; case GGML_OP_CONT: case GGML_OP_RESHAPE: @@ -15652,64 +15688,53 @@ int ggml_get_task_profiles( case GGML_OP_GET_ROWS_BACK: case GGML_OP_DIAG: case GGML_OP_DIAG_MASK_ZERO: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + p[0].stages[1].valid = true; } break; case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX_BACK: case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_ALIBI: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + p[0].stages[1].valid = true; } break; case GGML_OP_CLAMP: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + p[0].stages[1].valid = true; } break; case GGML_OP_CONV_1D_1S: case GGML_OP_CONV_1D_2S: { - p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[0].valid = true; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_FLASH_ATTN: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_FLASH_FF: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } case GGML_OP_FLASH_ATTN_BACK: { - p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[0].valid = true; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + 
p[0].stages[1].valid = true; } break; case GGML_OP_CROSS_ENTROPY_LOSS: - p[0].stages[0].backend = GGML_TASK_BACKEND_CPU; - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[0].valid = true; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - p[0].stages[2].backend = GGML_TASK_BACKEND_CPU; - n_profiles = 1; + p[0].stages[2].valid = true; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { - p[0].stages[1].backend = GGML_TASK_BACKEND_CPU; + p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - n_profiles = 1; } break; case GGML_OP_NONE: case GGML_OP_COUNT: { @@ -15719,227 +15744,196 @@ int ggml_get_task_profiles( GGML_ASSERT(false); } -#if defined(GGML_USE_CUBLAS) - switch (node->op) { - case GGML_OP_ADD: - case GGML_OP_MUL: - case GGML_OP_SILU: - case GGML_OP_RMS_NORM: - case GGML_OP_MUL_MAT: - case GGML_OP_RESHAPE: - case GGML_OP_ROPE: { - int i = n_profiles; - p[i].runner = ggml_compute_forward_cuda; - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CUDA; +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (tensor->op == GGML_OP_MUL_MAT) { + enum ggml_type src0_t = tensor->src0->type; + int i = n_profiles; + + strcpy(p[i].name, "BLAS"); + p[i].runner = ggml_compute_forward_blas; + p[i].wsize_getter = ggml_compute_forward_blas_get_wsize; + + if (src0_t == GGML_TYPE_F32) { + p[i].stages[1].valid = true; p[i].stages[1].wait = true; - ++n_profiles; - } break; - default: { - } break; + } else if (src0_t == GGML_TYPE_F16) { + p[i].stages[1].valid = true; + p[i].stages[1].wait = true; + } else if (ggml_is_quantized(src0_t)) { + p[i].stages[0].valid = true; + p[i].stages[0].parallel = true; + p[i].stages[1].valid = true; + p[i].stages[1].wait = true; + } + ++n_profiles; + } +#endif + +#if defined(GGML_USE_CUBLAS) + if (true) { // FIXME: filter supported op to avoid unnecceary fallback. + int i = n_profiles; + strcpy(p[i].name, "CUDA"); + p[i].runner = ggml_compute_forward_cuda; + p[i].stages[1].valid = true; + p[i].stages[1].wait = true; + ++n_profiles; } #elif defined(GGML_USE_CLBLAST) - switch (node->op) { - case GGML_OP_MUL_MAT: { - int i = n_profiles; - p[i].runner = ggml_compute_forward_cl; - p[i].get_wsize = ggml_compute_forward_get_wsize_cl; - p[i].stages[1].backend = GGML_TASK_BACKEND_GPU_CL; - p[i].stages[1].wait = true; - ++n_profiles; - } break; - default: { - } break; + if (tensor->op == GGML_OP_MUL || tensor->op == GGML_OP_MUL_MAT) { + int i = n_profiles; + strcpy(p[i].name, "CL"); + p[i].runner = ggml_compute_forward_cl; + p[i].wsize_getter = ggml_compute_forward_cl_get_wsize; + p[i].stages[1].valid = true; + p[i].stages[1].wait = true; + ++n_profiles; } #endif GGML_ASSERT(n_profiles > 0 && n_profiles <= GGML_MAX_TASK_PROFILES); - for (int i = 0; i < n_profiles; i++) { - profiles[i].id = i + 1; + + for (int j = 0; j < n_profiles; j++) { + profiles[j].id = j + 1; } + return n_profiles; } -// Set task profile for GGML_OP_MUL_MAT or GGML_OP_OUT_PROD. -static const struct ggml_task_profile *ggml_mulmat_get_task_profile( - struct ggml_tensor *node, struct ggml_task_profile *profiles, - int n_profiles, struct ggml_mulmat_tune *tune, int stages_time_us[3]) { +// Try to fix task profile for given tensor, because the task profile might not +// be the most performant. 
+static void ggml_optimize_tensor_task_profile( + struct ggml_tensor *tensor, struct ggml_task_profile *profiles, + int n_profiles, struct ggml_mulmat_tune *tune) { + + if (tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_OUT_PROD) { + return; + } + + GGML_ASSERT(tensor); + GGML_ASSERT(tensor->op == GGML_OP_MUL_MAT || + tensor->op == GGML_OP_OUT_PROD); + GGML_ASSERT(tensor->task_profile.id == n_profiles); - GGML_ASSERT(node); - GGML_ASSERT(node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_OUT_PROD); GGML_ASSERT(profiles); - GGML_ASSERT(n_profiles > 0); + GGML_ASSERT(n_profiles > 1); - enum ggml_type src0_t = node->src0->type; - enum ggml_type src1_t = node->src1->type; + int M = (int)tensor->ne[1]; + int N = (int)tensor->ne[0]; + int K = (int)tensor->src1->ne[0]; - // Type and memory layout requirements for computing mul_mat with BLAS. - bool cond_match = (src0_t == GGML_TYPE_F32 || src0_t == GGML_TYPE_F16 || - ggml_is_quantized(src0_t)) && - src1_t == GGML_TYPE_F32 && node->type == GGML_TYPE_F32 && - ggml_is_contiguous(node->src0) && - ggml_is_contiguous(node->src1); +#if defined(GGML_USE_TUNE) + if (tune != NULL && ggml_mul_mat_check_type_mem(tensor)) { + GGML_ASSERT(tensor->backend == 0 && tensor->src0->backend == 0 && + tensor->src1->backend == 0); - int M = (int)node->ne[1]; - int N = (int)node->ne[0]; - int K = (int)node->src1->ne[0]; + GGML_ASSERT(n_profiles >= 2); - const struct ggml_task_profile *prof = NULL; + enum ggml_type src0_t = tensor->src0->type; + enum ggml_type src1_t = tensor->src1->type; + + int stages_time_us[3]; + + int id = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, + src1_t, stages_time_us); + if (id > 0) { + struct ggml_task_profile *prof = NULL; - if (cond_match) { -#if defined(GGML_USE_TUNE) - if (tune != NULL) { - GGML_ASSERT(n_profiles >= 2); - int id = ggml_mulmat_tune_select_task_profile(tune, M, N, K, src0_t, - src1_t, stages_time_us); for (int i = 0; i < n_profiles; i++) { if (profiles[i].id == id) { prof = &profiles[i]; - return prof; + break; } } - } -#else - UNUSED(tune); - UNUSED(stages_time_us); -#endif - if (prof == NULL && M >= 32 && N >= 32 && K >= 32) { - for (int j = 0; j < n_profiles; j++) { - enum ggml_task_backend comp_be = - profiles[j].stages[GGML_TASK_COMPUTE].backend; - switch (comp_be) { - case GGML_TASK_BACKEND_GPU_CUDA: { - GGML_ASSERT(ggml_cpu_has_cublas()); - prof = &profiles[j]; - break; - } - case GGML_TASK_BACKEND_GPU_CL: { - GGML_ASSERT(ggml_cpu_has_clblast()); - prof = &profiles[j]; - break; - } - case GGML_TASK_BACKEND_CPU_BLAS: { - GGML_ASSERT(ggml_cpu_has_cpublas()); - prof = &profiles[j]; - break; - } - default: { - break; - } - } + if (prof) { + memcpy(&tensor->task_profile, prof, + sizeof(struct ggml_task_profile)); - if (prof) { - break; + // Do not wait if the estimated execution time is too small + // (e.g. less than 0.1 ms) + // TODO: need bench actual wait/notify time, see + // ggml-threading.c + for (int j = 0; j < 3; j++) { + if (tensor->task_profile.stages[j].wait) { + if (stages_time_us[j] < 100) { + tensor->task_profile.stages[j].wait = false; + } + } } + return; } } } +#else + UNUSED(tune); +#endif - if (prof == NULL) { - prof = &profiles[0]; - GGML_ASSERT(prof->stages[1].backend == GGML_TASK_BACKEND_CPU); - } + // Guess the optimal matrix size. 
+ bool size_match = (M >= 32 && N >= 32 && K >= 32); + UNUSED(size_match); - return prof; -} + for (int i = n_profiles - 1; i >= 0; --i) { + const char *name = profiles[i].name; -void ggml_graph_compute_set_tensor_task_proile(struct ggml_tensor *node, - struct ggml_cgraph *cgraph) { - // Pre-specified. - for (int i = 0; i < 3; i++) { - if (node->task_profile.stages[i].backend > 0) { - return; + if (strcmp(name, "CUDA") == 0) { +#if defined(GGML_USE_CUBLAS) + if ((size_match || ggml_cuda_is_gpu_offloading(tensor)) && + ggml_mul_mat_check_type_mem(tensor)) { + memcpy(&tensor->task_profile, &profiles[i], + sizeof(struct ggml_task_profile)); + return; + } +#endif } - } - struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; - int n_profiles = ggml_get_task_profiles(node, profiles); - - const struct ggml_task_profile *profile = NULL; - - // GPU offloading. A special case of pre-specified task_profile. - if (node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT) { - if (node->op != GGML_OP_MUL_MAT && node->op != GGML_OP_OUT_PROD) { - enum ggml_task_backend be; - if (ggml_cpu_has_cublas()) { - be = GGML_TASK_BACKEND_GPU_CUDA; - } else if (ggml_cpu_has_clblast()) { - be = GGML_TASK_BACKEND_GPU_CL; - } else { - GGML_ASSERT(false); + if (strcmp(name, "CL") == 0) { +#if defined(GGML_USE_CLBLAST) + if ((size_match || ggml_cl_is_gpu_offloading(tensor)) && + ggml_mul_mat_check_type_mem(tensor)) { + memcpy(&tensor->task_profile, &profiles[i], + sizeof(struct ggml_task_profile)); + return; } +#endif + } - for (int j = 0; j < n_profiles; j++) { - if (profiles[j].stages[1].backend == be) { - profile = &profiles[j]; - break; - } + if (strcmp(name, "BLAS") == 0) { +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + if (size_match && ggml_mul_mat_check_type_mem(tensor)) { + memcpy(&tensor->task_profile, &profiles[0], + sizeof(struct ggml_task_profile)); + return; } - GGML_ASSERT(profile); - GGML_ASSERT(!cgraph->tune); - - memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); - return; +#endif } } - // mul_mat: GGML_OP_MUL_MAT and GGML_OP_OUT_PROD. - if (node->op == GGML_OP_MUL_MAT) { -#if defined(GGML_USE_TUNE) - GGML_ASSERT(node->backend == GGML_BACKEND_CPU); - - int stages_time_us[3]; - profile = ggml_mulmat_get_task_profile(node, profiles, n_profiles, - cgraph->tune, stages_time_us); - GGML_ASSERT(profile); + memcpy(&tensor->task_profile, &profiles[0], + sizeof(struct ggml_task_profile)); +} - memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); +static void ggml_set_tensor_task_profile(struct ggml_tensor *tensor, + struct ggml_mulmat_tune *tune) { + struct ggml_task_profile profiles[GGML_MAX_TASK_PROFILES]; + int n_profiles = ggml_get_task_profiles(tensor, profiles); + GGML_ASSERT(n_profiles > 0); - if (cgraph->tune) { - memcpy(&node->task_profile, profile, - sizeof(struct ggml_task_profile)); + // By default use profile with the largest id. + // Profile id starts from 1. + memcpy(&tensor->task_profile, &profiles[n_profiles - 1], + sizeof(struct ggml_task_profile)); - // Do not wait if the estimated execution time is too small - // (e.g. 
less than 0.1 ms) - // TODO: need bench actual wait/notify time, see - // ggml-threading.c - for (int j = 0; j < 3; j++) { - if (node->task_profile.stages[j].wait) { - if (stages_time_us[j] < 100) { - node->task_profile.stages[j].wait = false; - } - } - } - } - return; -#else - profile = ggml_mulmat_get_task_profile(node, profiles, n_profiles, NULL, - NULL); - GGML_ASSERT(profile); - memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); - return; -#endif - } else if (node->op == GGML_OP_OUT_PROD) { // FIXME: is this correct? - profile = ggml_mulmat_get_task_profile(node, profiles, n_profiles, NULL, - NULL); - GGML_ASSERT(profile); - memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); - return; + if (n_profiles > 1) { + GGML_ASSERT(tensor->task_profile.id > 1); + ggml_optimize_tensor_task_profile(tensor, profiles, n_profiles, tune); } - // default. - profile = &profiles[0]; - GGML_ASSERT(profile->stages[1].backend == GGML_TASK_BACKEND_CPU); - memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile)); + GGML_ASSERT(tensor->task_profile.id > 0); } void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { int n_threads = cgraph->n_threads; - struct ggml_threading_context *thrd_ctx = ggml_threading_start( - n_threads, NULL, ggml_compute_forward, - GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL); - // initialize tasks + work buffer { // int64_t t0 = ggml_time_us(); @@ -15952,25 +15946,26 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) GGML_ASSERT (node->op != GGML_OP_NONE); - struct ggml_task_stage *stages = node->task_profile.stages; + if (node->task_profile.id == 0) { + ggml_set_tensor_task_profile(node, cgraph->tune); + } - ggml_graph_compute_set_tensor_task_proile(node, cgraph); + struct ggml_task_stage *stages = node->task_profile.stages; // // Allocate temp buffer `wdata` for CPU. // NOTE: GPU MAY fallback to CPU, so we have to cover all possible cases. // - if (node->task_profile.get_wsize) { - int sz = node->task_profile.get_wsize(node); + if (node->task_profile.wsize_getter) { + int sz = node->task_profile.wsize_getter(node); if (sz >= 0) { work_size = MAX(work_size, (size_t)sz); + // FIXME: is it safe to continue in case fallback? continue; } } - //printf("op: %d, comp backend: %d\n", node->op, node->task_profile.stages[1].backend); - // compute stage n_tasks. int n_tasks = stages[1].parallel ? n_threads : 1; @@ -16034,35 +16029,17 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_OUT_PROD: // FIXME: is this correct? 
{ size_t cur = 0; - enum ggml_task_backend comp_backend = stages[GGML_TASK_COMPUTE].backend; - GGML_ASSERT(comp_backend != GGML_TASK_BACKEND_NONE); - if (comp_backend == GGML_TASK_BACKEND_CPU_BLAS) { - GGML_ASSERT(ggml_cpu_has_cpublas()); - GGML_ASSERT(node->src1->type == GGML_TYPE_F32); - - if (node->src0->type == GGML_TYPE_F32) { - cur = 0; - } else if (node->src0->type == GGML_TYPE_F16) { - // here we need memory just for single 2D matrix from src0 - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } else if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } else { - GGML_ASSERT(false); - } - } else { // CPU or GPU fallback - GGML_ASSERT(node->src1->type == GGML_TYPE_F32); - - if (node->src0->type == GGML_TYPE_F32) { - cur = 0; - } else if (node->src0->type == GGML_TYPE_F16) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); - } else if (ggml_is_quantized(node->src0->type)) { - const enum ggml_type type_q = quantize_fns[node->src0->type].vec_dot_type; - cur = GGML_TYPE_SIZE[type_q]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[type_q]; - } else { - GGML_ASSERT(false); - } + GGML_ASSERT(node->src1->type == GGML_TYPE_F32); + + if (node->src0->type == GGML_TYPE_F32) { + cur = 0; + } else if (node->src0->type == GGML_TYPE_F16) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); + } else if (ggml_is_quantized(node->src0->type)) { + const enum ggml_type type_q = quantize_fns[node->src0->type].vec_dot_type; + cur = GGML_TYPE_SIZE[type_q]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[type_q]; + } else { + GGML_ASSERT(false); } work_size = MAX(work_size, cur); @@ -16218,6 +16195,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) const int64_t perf_start_cycles = ggml_perf_cycles(); const int64_t perf_start_time_us = ggml_perf_time_us(); + struct ggml_threading_context *thrd_ctx = ggml_threading_start(n_threads, + NULL, ggml_compute_forward, GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL); + for (int i = 0; i < cgraph->n_nodes; i++) { GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes); diff --git a/ggml.h b/ggml.h index d4d5d3521c74f..554645ba8b6d5 100644 --- a/ggml.h +++ b/ggml.h @@ -362,29 +362,10 @@ extern "C" { static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); - // As part of task config profile solution, `ggml_task_backend` defines - // backends for each task stage. Similar to `ggml_tensor.backend`, - // `ggml_tensor.task_profile` generalizes how to configure tensor computing - // at per task-stage level. - // - // The following enum values are designed as combination of hardware and - // optional software interface. - enum ggml_task_backend { - GGML_TASK_BACKEND_NONE = 0, - - // [0x10, 0x1F]: CPU - GGML_TASK_BACKEND_CPU = 0x10, - GGML_TASK_BACKEND_CPU_BLAS = 0x11, - - // [0x20 - 0x2F]: GPU - GGML_TASK_BACKEND_GPU = 0x20, - GGML_TASK_BACKEND_GPU_CUDA = 0x21, - GGML_TASK_BACKEND_GPU_CL = 0x22, - }; - // config for computing one of the 3 task stages of a tensor. struct ggml_task_stage { - enum ggml_task_backend backend; + bool valid; + bool parallel; // hint idle workers go waiting, valid only when parallel is false. bool wait; @@ -407,13 +388,16 @@ extern "C" { // Get wsize for node computing. // When return -1: should be explained as `fallback to CPU`, caller MUST // determine how much memory to reserve for this node. 
- typedef int (ggml_task_get_wsize)(struct ggml_tensor *tensor); + typedef int (ggml_task_wsize_getter)(struct ggml_tensor *tensor); // config for computing a tensor. struct ggml_task_profile { // profile id, start from 1. int id; + // Required, not empty, no whitespaces. + char name[16]; + // index 0: INIT, 1: COMPUTE, 2: FINALIZE struct ggml_task_stage stages[3]; @@ -421,7 +405,7 @@ extern "C" { ggml_task_runner *runner; // Optional function to return required wsize for wdata. - ggml_task_get_wsize *get_wsize; + ggml_task_wsize_getter *wsize_getter; // Optional flag for development. // MUST be used only in testing codes. diff --git a/llama.cpp b/llama.cpp index 06555e1dddeb2..e6bddffd5edaa 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2744,8 +2744,9 @@ struct llama_context * llama_init_from_file( } #ifdef GGML_USE_TUNE -bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) { - GGML_ASSERT (ctx->model.n_gpu_layers == 0); +bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, + const char *fname) { + GGML_ASSERT(ctx->model.n_gpu_layers == 0); printf("\n"); @@ -2755,7 +2756,7 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons enum ggml_ftype ggml_ftype; switch (hparams->ftype) { - case LLAMA_FTYPE_ALL_F32: + case LLAMA_FTYPE_ALL_F32: ggml_ftype = GGML_FTYPE_ALL_F32; break; case LLAMA_FTYPE_MOSTLY_F16: @@ -2767,9 +2768,6 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons case LLAMA_FTYPE_MOSTLY_Q4_1: ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1; break; - case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: - ggml_ftype = GGML_FTYPE_MOSTLY_Q4_1_SOME_F16; - break; case LLAMA_FTYPE_MOSTLY_Q5_0: ggml_ftype = GGML_FTYPE_MOSTLY_Q5_0; break; @@ -2799,8 +2797,8 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons ggml_ftype = GGML_FTYPE_MOSTLY_Q6_K; break; default: - throw std::runtime_error( - format("invalid output file type %d\n", hparams->ftype)); + fprintf(stderr, "[tune] unsupported file type %d\n", hparams->ftype); + return false; } int n_vocab = hparams->n_vocab; @@ -2808,30 +2806,36 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons int n_rot = hparams->n_rot; int n_mult = hparams->n_mult; - int n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult; + int n_ff = ((2 * (4 * n_embd) / 3 + n_mult - 1) / n_mult) * n_mult; struct ggml_mulmat_tune_params params = { - /*.model =*/ { - /* .name =*/ model_name, - /* .ftype =*/ ggml_ftype, - /* .n_vocab =*/ n_vocab, - /* .n_embd =*/ n_embd, - /* .n_ff =*/ n_ff, - /* .n_rot =*/ n_rot, + /*.model =*/{ + /* .name =*/model_name, + /* .ftype =*/ggml_ftype, + /* .n_vocab =*/n_vocab, + /* .n_embd =*/n_embd, + /* .n_ff =*/n_ff, + /* .n_rot =*/n_rot, }, - /* .m_num =*/ 8, - /* .n_pass =*/ 1, - /* .n_threads =*/ n_threads, - /* .prrogress =*/ true, - /* .output_console =*/ false, - /* .fname =*/ fname, + /* .m_num =*/8, + /* .n_pass =*/1, + /* .n_threads =*/n_threads, + /* .prrogress =*/true, + /* .output_console =*/false, + /* .fname =*/fname, }; bool empty_fname = !fname || strcmp(fname, "") == 0; - ctx->tune = new(struct ggml_mulmat_tune); + ctx->tune = new (struct ggml_mulmat_tune); if (!ctx->tune) { - throw std::runtime_error(format("failed to allocate memory for tune\n")); + fprintf(stderr, "[tune] failed to allocate memory for tune\n"); + return false; + } + + if (!ggml_cpu_has_blas()) { + fprintf(stderr, "[tune] this program is not built with BLAS, abort.\n"); + return false; } if (tune) { 
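(Aside, between hunks, for reviewers: a minimal caller-side sketch of the entry point patched above. It assumes a build with `GGML_USE_TUNE` where `llama_mulmat_tune` is visible from `llama.h`; the helper name, file path and error handling below are illustrative assumptions, not part of this patch.)

```c
// Hedged usage sketch (not part of the patch): enable mulmat tuning right
// after the llama context is created. Assumes GGML_USE_TUNE is defined.
#include <stdbool.h>
#include <stdio.h>
#include "llama.h"

static bool try_enable_mulmat_tune(struct llama_context * lctx, int n_threads) {
    // tune = true  : run the bench now (and save results to fname if given).
    // tune = false : only load a previously saved tune file.
    const char * fname = "mulmat-tune.txt"; // placeholder path

    if (!llama_mulmat_tune(lctx, n_threads, /*tune =*/ true, fname)) {
        fprintf(stderr, "[tune] bench/load failed, continuing without tune data\n");
        return false;
    }
    return true;
}
```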
@@ -2844,31 +2848,30 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, cons ggml_mulmat_tune_free(ctx->tune); return true; } - } else { - if (empty_fname) { - return false; - } + } else if (empty_fname) { + return false; } if (!empty_fname) { FILE *fp = fopen(fname, "r"); if (!fp) { - fprintf(stderr, "[tune] failed to open file %s.\n", - fname); + fprintf(stderr, "[tune] failed to open file %s.\n", fname); + return false; } else { - bool ok = ggml_mulmat_tune_read_data(ctx->tune, fp); + int rc = ggml_mulmat_tune_read_data(ctx->tune, fp); fclose(fp); - if (!ok) { + if (rc != 0) { fprintf(stderr, - "[tune] failed to read data from %s\n", - fname); + "[tune] failed to read data from %s, error code: %d\n", + fname, rc); return false; } fprintf(stderr, "[tune] loaded data from %s\n", fname); - ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype, params.n_threads); + bool ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype, + params.n_threads); if (!ok) { return false; } diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c index 2079fe144b194..e904fc10dcf67 100644 --- a/tests/test-ggml-threading.c +++ b/tests/test-ggml-threading.c @@ -41,9 +41,8 @@ static const int n_repeat = 10; // counter with array. static int work_done_arr[MAX_N_THREADS]; -static enum ggml_compute_error -mock_task_runner(const struct ggml_compute_params *params, - struct ggml_tensor *node) { +static enum ggml_compute_error mock_task_runner(const struct ggml_compute_params *params, + struct ggml_tensor *node) { int64_t loops = node->task_profile.dev_flags[1] * 1000 * 1000; if (node->task_profile.stages[params->type].parallel) { loops /= params->nth; @@ -80,20 +79,15 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) { int t0 = (int)ggml_time_us(); - struct ggml_threading_context *ctx = ggml_threading_start( - n_threads, NULL, mock_task_runner, features, /*stages_time*/ NULL); + node->task_profile.runner = mock_task_runner; + + struct ggml_threading_context *ctx = + ggml_threading_start(n_threads, NULL, NULL, features, /*stages_time*/ NULL); int t1 = (int)ggml_time_us(); for (int i = 0; i < n_repeat; i++) { - enum ggml_compute_error err = ggml_threading_compute_tensor( - ctx, node, /*wdata*/ NULL, /*wsize*/ 0); - if (err != GGML_COMPUTE_OK) { - ggml_threading_stop(ctx); - printf("ggml_threading_compute_tensor failed with error: %d.\n", - err); - return 1; - } + ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0); } int t2 = (int)ggml_time_us(); @@ -107,7 +101,7 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) { int expect = 0; for (int i = 0; i < 3; i++) { const struct ggml_task_stage *ts = &stages[i]; - if (ts->backend != GGML_TASK_BACKEND_NONE) { + if (ts->valid) { if (ts->parallel) { expect += n_threads; } else { @@ -144,14 +138,12 @@ static enum ggml_compute_error mock_task_runner_fallback(const struct ggml_compute_params *params, struct ggml_tensor *node) { UNUSED(params); - if (node->backend == GGML_BACKEND_GPU) { - // ... finally failed to compute in GPU. - node->backend = GGML_BACKEND_CPU; + // failed to run ... 
+ if (node->task_profile.id == 2) { return GGML_COMPUTE_FALLBACK; - } else { - return GGML_COMPUTE_OK; } + return GGML_COMPUTE_OK; } // By design, fallback should happen when attempt computing tensor in GPU, @@ -164,6 +156,9 @@ int test_fallback(struct ggml_tensor *node) { enum ggml_compute_error err = ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0); if (err == GGML_COMPUTE_FALLBACK) { + // mock setup new profile ... + node->task_profile.id = 1; + err = ggml_threading_compute_tensor(ctx, node, /*wdata*/ NULL, /*wsize*/ 0); } @@ -214,12 +209,12 @@ int main(void) { struct ggml_tensor node; memset(&node, 0, sizeof(struct ggml_tensor)); + node.task_profile.runner = mock_task_runner; struct ggml_task_stage *stages = node.task_profile.stages; - stages[0].backend = GGML_TASK_BACKEND_CPU; - stages[1].backend = GGML_TASK_BACKEND_CPU; - stages[2].backend = GGML_TASK_BACKEND_NONE; + stages[0].valid = true; + stages[1].valid = true; int n_passed = 0; int n_tests = 0; @@ -277,7 +272,7 @@ int main(void) { struct ggml_threading_context *ctx = ggml_threading_start(n_threads, ggml_threading_graph_compute_thread, - mock_task_runner, 0, /*stages_time*/ NULL); + NULL, 0, /*stages_time*/ NULL); int t1 = (int)ggml_time_us(); @@ -416,8 +411,8 @@ int main(void) { node.src0 = &src0; node.src1 = &src1; - node.backend = GGML_BACKEND_GPU; - stages[1].backend = GGML_TASK_BACKEND_GPU; + node.task_profile.id = 2; + stages[1].valid = true; if (test_fallback(&node) == 0) { ++n_passed; printf("[test-ggml-threading] test fallback: ok\n\n"); diff --git a/tests/test-ggml-tune.c b/tests/test-ggml-tune.c index 4339881e52c2d..97fd6cfbfd07b 100644 --- a/tests/test-ggml-tune.c +++ b/tests/test-ggml-tune.c @@ -46,13 +46,9 @@ int main(void) { } static int bench(void) { - { - enum ggml_task_backend backends[16]; - int n_backends = ggml_mulmat_tune_get_builtin_task_backends(backends); - if (n_backends < 2) { - printf("[test-ggml-tune] skipped because no BLAS\n"); - return 0; - } + if (!ggml_cpu_has_blas()) { + printf("[test-ggml-tune] skipped because no BLAS\n"); + return 0; } { @@ -118,10 +114,13 @@ static int ggml_task_profiles_mock_qxx_provider(struct ggml_tensor *node, struct ggml_task_profile *profiles) { UNUSED(node); - profiles[0].stages[0].backend = GGML_TASK_BACKEND_CPU; - profiles[0].stages[1].backend = GGML_TASK_BACKEND_CPU; - profiles[1].stages[0].backend = GGML_TASK_BACKEND_CPU; - profiles[1].stages[1].backend = GGML_TASK_BACKEND_CPU_BLAS; + profiles[0].id = 1; + profiles[0].stages[0].valid = true; + profiles[0].stages[1].valid = true; + + profiles[0].id = 2; + profiles[1].stages[0].valid = true; + profiles[1].stages[1].valid = true; return 2; } From 67bb3679625f8b5e0d2443bcc9cd5098d7b3c931 Mon Sep 17 00:00:00 2001 From: mqy Date: Sun, 18 Jun 2023 14:03:09 +0800 Subject: [PATCH 11/24] typos --- Makefile | 2 +- ggml-tune.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 531f62fb01347..8cd253b531d7f 100644 --- a/Makefile +++ b/Makefile @@ -237,7 +237,7 @@ k_quants.o: k_quants.c k_quants.h endif # LLAMA_NO_K_QUANTS ifndef LLAMA_NO_TUNE -CFLAGS += -DGGML_USE_TUNE -DGGML_TUNE_NDEBUG +CFLAGS += -DGGML_USE_TUNE #-DGGML_TUNE_NDEBUG CXXFLAGS += -DGGML_USE_TUNE endif diff --git a/ggml-tune.c b/ggml-tune.c index 444269ae4f55a..2e292e98e7bb6 100644 --- a/ggml-tune.c +++ b/ggml-tune.c @@ -83,7 +83,7 @@ int ggml_mulmat_tune_select_task_profile(struct ggml_mulmat_tune *tune, int M, #ifndef GGML_TUNE_NDEBUG printf("\n[tune] M: %3d, N: %5d, K: %5d, profile id: %d, " - 
"backends: %s %s %s\n", + "profile name: %s\n", M, N, K, prof->id, prof->name); #endif } From 2193ab6281c6c21c76608500ae573c1be7c0d0bb Mon Sep 17 00:00:00 2001 From: mqy Date: Sun, 18 Jun 2023 14:07:33 +0800 Subject: [PATCH 12/24] fix cuda build error --- ggml-cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 5a4c7725a92de..48c7c83dfb4ad 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2544,7 +2544,7 @@ bool ggml_cuda_is_gpu_offloading(struct ggml_tensor * tensor) { bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){ ggml_cuda_func_t func; - const bool any_on_device = is_gpu_offloading(tensor); + const bool any_on_device = ggml_cuda_is_gpu_offloading(tensor); switch (tensor->op) { case GGML_OP_ADD: From 0ec4dab8649c00519df9f04838e28ca2cafb0e57 Mon Sep 17 00:00:00 2001 From: mqy Date: Sun, 18 Jun 2023 14:59:44 +0800 Subject: [PATCH 13/24] fixed break and asssion from select; try fix cuda link error --- ggml-cuda.h | 2 +- ggml-opencl.cpp | 2 +- ggml-opencl.h | 2 +- ggml-threading.c | 8 ++++---- ggml.c | 11 ++++++++--- 5 files changed, 15 insertions(+), 10 deletions(-) diff --git a/ggml-cuda.h b/ggml-cuda.h index 75ea94392ce6b..70bd65e227bf2 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -15,8 +15,8 @@ struct ggml_tensor_extra_gpu { void ggml_init_cublas(void); void ggml_cuda_set_tensor_split(const float * tensor_split); +bool ggml_cuda_is_gpu_offloading(struct ggml_tensor * tensor); void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -bool ggml_cuda_is_gpu_offloading(const struct ggml_tensor * src0); size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 28098793df296..3ed9d1adb0c2b 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1589,7 +1589,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * } } -bool ggml_cl_is_gpu_offloading(struct ggml_tensor * tensor) { +bool ggml_cl_is_gpu_offloading(const struct ggml_tensor * tensor) { GGML_ASSERT(tensor); return (tensor->src0 && tensor->src0->backend == GGML_BACKEND_GPU) || (tensor->src1 && tensor->src1->backend == GGML_BACKEND_GPU); diff --git a/ggml-opencl.h b/ggml-opencl.h index 1de12f55a5c95..6d815bbf07294 100644 --- a/ggml-opencl.h +++ b/ggml-opencl.h @@ -8,8 +8,8 @@ extern "C" { void ggml_cl_init(void); +bool ggml_cl_is_gpu_offloading(const struct ggml_tensor * tensor); void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -bool ggml_cl_is_gpu_offloading(struct ggml_tensor * tensor); size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); diff --git a/ggml-threading.c b/ggml-threading.c index dada9f3fe8466..882639666ab05 100644 --- a/ggml-threading.c +++ b/ggml-threading.c @@ -142,7 +142,7 @@ static int sched_yield(void) { #endif struct ggml_perf_stats { - int runs; + atomic_int runs; // total cycles atomic_int cycles; @@ -211,9 +211,9 @@ static inline void ggml_spin_unlock(volatile atomic_flag *obj) { static inline 
void ggml_perf_collect(struct ggml_perf_stats *st, int64_t c0, int64_t t0) { - st->runs++; - st->cycles += (ggml_cycles() - c0); - st->time_us += (ggml_time_us() - t0); + atomic_fetch_add(&st->runs, 1); + atomic_fetch_add(&st->cycles, (int)(ggml_cycles() - c0)); + atomic_fetch_add(&st->time_us, (int)(ggml_time_us() - t0)); } // A worker thread goes cond waiting. diff --git a/ggml.c b/ggml.c index 62750b20bd127..5a9e0b33e8cea 100644 --- a/ggml.c +++ b/ggml.c @@ -15468,7 +15468,7 @@ struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cg // ---- task profiles ---- // Check the type and memeory layout for mul_mat on blas(CPU BLAS) -static bool ggml_mul_mat_check_type_mem(struct ggml_tensor *tensor) { +static bool ggml_mul_mat_check_type_mem(const struct ggml_tensor *tensor) { enum ggml_type src0_t = tensor->src0->type; enum ggml_type src1_t = tensor->src1->type; @@ -15669,6 +15669,8 @@ int ggml_get_task_profiles( p[0].stages[0].valid = true; p[0].stages[1].valid = true; p[0].stages[1].parallel = true; + } else { + GGML_ASSERT(false); } } break; case GGML_OP_SCALE: { @@ -15717,7 +15719,7 @@ int ggml_get_task_profiles( case GGML_OP_FLASH_FF: { p[0].stages[1].valid = true; p[0].stages[1].parallel = true; - } + } break; case GGML_OP_FLASH_ATTN_BACK: { p[0].stages[0].valid = true; p[0].stages[1].valid = true; @@ -15727,11 +15729,12 @@ int ggml_get_task_profiles( case GGML_OP_MAP_BINARY: { p[0].stages[1].valid = true; } break; - case GGML_OP_CROSS_ENTROPY_LOSS: + case GGML_OP_CROSS_ENTROPY_LOSS: { p[0].stages[0].valid = true; p[0].stages[1].valid = true; p[0].stages[1].parallel = true; p[0].stages[2].valid = true; + } break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { p[0].stages[1].valid = true; p[0].stages[1].parallel = true; @@ -15764,6 +15767,8 @@ int ggml_get_task_profiles( p[i].stages[0].parallel = true; p[i].stages[1].valid = true; p[i].stages[1].wait = true; + } else { + GGML_ASSERT(false); } ++n_profiles; } From 5abb8aefea8c9c9cc175448ff568f43788ae3718 Mon Sep 17 00:00:00 2001 From: mqy Date: Sun, 18 Jun 2023 18:55:44 +0800 Subject: [PATCH 14/24] fix warning --- ggml-cuda.cu | 2 +- ggml-cuda.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 48c7c83dfb4ad..e31d494b2ac52 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2542,7 +2542,7 @@ bool ggml_cuda_is_gpu_offloading(struct ggml_tensor * tensor) { || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU); } -bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){ +bool ggml_cuda_compute_forward(const struct ggml_compute_params * params, struct ggml_tensor * tensor){ ggml_cuda_func_t func; const bool any_on_device = ggml_cuda_is_gpu_offloading(tensor); diff --git a/ggml-cuda.h b/ggml-cuda.h index 70bd65e227bf2..efb5bb38f2806 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -32,7 +32,7 @@ void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor); void ggml_cuda_set_main_device(int main_device); void ggml_cuda_set_scratch_size(size_t scratch_size); void ggml_cuda_free_scratch(void); -bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); +bool ggml_cuda_compute_forward(const struct ggml_compute_params * params, struct ggml_tensor * tensor); #ifdef __cplusplus } From 5feefb32b3e332a411c2c618d38bfe5f98dbde13 Mon Sep 17 00:00:00 2001 From: mqy Date: Sun, 18 Jun 2023 18:57:33 +0800 Subject: [PATCH 15/24] threading: add suspend/resume APIs, so it's possible to run 
a thread pool at session level --- ggml-threading.c | 81 ++++++++++++------- ggml-threading.h | 12 ++- tests/test-ggml-threading.c | 151 +++++++++++++++++++++++++++++++++--- 3 files changed, 204 insertions(+), 40 deletions(-) diff --git a/ggml-threading.c b/ggml-threading.c index 882639666ab05..fb02b40469116 100644 --- a/ggml-threading.c +++ b/ggml-threading.c @@ -194,6 +194,8 @@ struct ggml_threading_context { struct ggml_perf_stats wait_perf; struct ggml_perf_stats wakeup_perf; + atomic_bool suspending; + int64_t *stages_time; }; @@ -252,6 +254,30 @@ static void ggml_threading_cond_wait(struct ggml_compute_state *state) { } } +// Suspend +void ggml_threading_suspend(struct ggml_threading_context *ctx) { + if (ctx->n_threads == 1) { + return; + } + + struct ggml_compute_state_shared *shared = &ctx->shared; + + ggml_spin_lock(&shared->spin); + ctx->shared.wait_now = true; + ggml_spin_unlock(&shared->spin); + + const int n_worker_threads = ctx->n_threads - 1; + + while (ctx->shared.n_waiting != n_worker_threads) { + ggml_spin_pause(); + } + + ggml_spin_lock(&shared->spin); + ctx->suspending = true; + ggml_spin_unlock(&shared->spin); + PRINT_DEBUG("[main] saw %d workers waiting\n", n_worker_threads); +} + // Wakeup all workers. // // Workers takes some time to wakeup, and has to lock spin after wakeup. Yield @@ -259,8 +285,14 @@ static void ggml_threading_cond_wait(struct ggml_compute_state *state) { // experimental. See tests/test-ggml-threading.c for details. // // NOTE: must be protected by shared->spin -static void -ggml_threading_wakeup_workers(struct ggml_compute_state_shared *shared) { +void ggml_threading_resume(struct ggml_threading_context *ctx) { + if (ctx->n_threads == 1) { + return; + } + + struct ggml_compute_state_shared *shared = &ctx->shared; + ggml_spin_lock(&shared->spin); + int64_t perf_cycles_0 = 0; int64_t perf_time_0 = 0; @@ -269,12 +301,11 @@ ggml_threading_wakeup_workers(struct ggml_compute_state_shared *shared) { perf_time_0 = ggml_time_us(); } - shared->wait_now = false; - int loop_counter = 0; - int notify_counter = 0; int64_t last_signal_time = 0; + shared->wait_now = false; + while (shared->n_waiting != 0) { ggml_spin_unlock(&shared->spin); @@ -294,22 +325,23 @@ ggml_threading_wakeup_workers(struct ggml_compute_state_shared *shared) { GGML_ASSERT(pthread_mutex_lock(&shared->mutex) == 0); GGML_ASSERT(pthread_cond_broadcast(&shared->cond) == 0); GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0); - ++notify_counter; last_signal_time = ggml_time_us(); ggml_spin_lock(&shared->spin); } + ctx->suspending = false; + if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) { ggml_perf_collect(&shared->ctx->wakeup_perf, perf_cycles_0, perf_time_0); } - // if (notify_counter > 1) { - // printf("%s: loop counter: %d, notify counter: %d\n", __func__, - // loop_counter, notify_counter); - // } - UNUSED(notify_counter); + ggml_spin_unlock(&shared->spin); +} + +bool ggml_threading_is_suspending(struct ggml_threading_context *ctx) { + return ctx->suspending; } // Setup workers for a task stage. 
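(Aside, between hunks, for reviewers: the suspend/resume API introduced above is meant to let a caller keep one thread pool alive for a whole session instead of starting and stopping workers per graph. A hedged sketch of that lifecycle follows, using only functions declared in `ggml-threading.h`; the task runner is a placeholder and the tensor is assumed to already carry a valid task profile.)

```c
// Hedged sketch of the intended session-level lifecycle: start the pool once,
// park the workers while the session is idle, wake them for the next graph,
// and join them at shutdown.
#include "ggml.h"
#include "ggml-threading.h"

// Placeholder runner: real code would dispatch on node->op.
static enum ggml_compute_error
my_task_runner(const struct ggml_compute_params * params, struct ggml_tensor * node) {
    (void) params; (void) node;
    return GGML_COMPUTE_OK;
}

static void session_sketch(struct ggml_tensor * node, int n_threads) {
    struct ggml_threading_context * tctx = ggml_threading_start(
        n_threads, /*thread_runner =*/ NULL, my_task_runner,
        GGML_THREADING_FEATURE_WAIT_ON_DONE, /*stages_time =*/ NULL);

    // Compute one or more tensors with the shared pool.
    ggml_threading_compute_tensor(tctx, node, /*wdata =*/ NULL, /*wsize =*/ 0);

    ggml_threading_suspend(tctx); // park workers while the session is idle
    ggml_threading_resume(tctx);  // wake them before the next graph

    ggml_threading_stop(tctx);    // join workers and free the context
}
```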
@@ -329,7 +361,9 @@ static void ggml_threading_setup_workers(struct ggml_threading_context *ctx, if (current->parallel) { if (shared->n_waiting > 0) { - ggml_threading_wakeup_workers(shared); + ggml_spin_unlock(&shared->spin); + ggml_threading_resume(ctx); + ggml_spin_lock(&shared->spin); } if ((ctx->features & GGML_THREADING_FEATURE_WAIT_ON_DONE) > 0) { @@ -351,17 +385,11 @@ static void ggml_threading_setup_workers(struct ggml_threading_context *ctx, } } else if (current->wait) { if (shared->n_waiting < n_worker_threads) { - shared->wait_now = true; - PRINT_DEBUG("[main] wait_now was set, expect %d workers wait\n", + PRINT_DEBUG("[main] wait_now will be set, expect %d workers wait\n", n_worker_threads); - ggml_spin_unlock(&shared->spin); - - while (shared->n_waiting != n_worker_threads) { - ggml_spin_pause(); - } - - ggml_spin_lock(&shared->spin); - PRINT_DEBUG("[main] saw %d workers waiting\n", n_worker_threads); + ggml_spin_unlock(&ctx->shared.spin); + ggml_threading_suspend(ctx); + ggml_spin_lock(&ctx->shared.spin); } } @@ -376,7 +404,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) { struct ggml_compute_state_shared *shared = state->shared; GGML_ASSERT(shared); - //GGML_ASSERT(shared->task_runner); + // GGML_ASSERT(shared->task_runner); shared->n_ready++; @@ -527,7 +555,7 @@ ggml_threading_compute_tensor(struct ggml_threading_context *ctx, GGML_ASSERT(profiles[0].id == 1); memcpy(&node->task_profile, &profiles[0], - sizeof(struct ggml_task_profile)); + sizeof(struct ggml_task_profile)); runner = ctx->shared.task_runner; GGML_ASSERT(runner); @@ -572,6 +600,7 @@ ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread_runner, ctx->n_threads = n_threads; ctx->features = features; ctx->stages_time = stages_time; + ctx->suspending = false; int n_workers = n_threads - 1; if (n_workers > 0) { @@ -633,9 +662,7 @@ void ggml_threading_stop(struct ggml_threading_context *ctx) { PRINT_DEBUG("[main] stopping thread pool ...\n"); ctx->shared.stop = true; - ggml_spin_lock(&ctx->shared.spin); - ggml_threading_wakeup_workers(&ctx->shared); - ggml_spin_unlock(&ctx->shared.spin); + ggml_threading_resume(ctx); for (int j = 0; j < ctx->n_threads - 1; j++) { GGML_ASSERT(pthread_join(ctx->workers[j].thrd, NULL) == 0); diff --git a/ggml-threading.h b/ggml-threading.h index 81192450c6728..012c9cd504308 100644 --- a/ggml-threading.h +++ b/ggml-threading.h @@ -25,11 +25,12 @@ enum ggml_threading_features { typedef ggml_thread_ret_t(ggml_threading_thread_runner)(void *data); // Init and start underlying workers if n_threads > 1. +// n_threads: number of threads (including caller) involving in computing tasks. // // thread: optional OS thread runner, default value: // `ggml_threading_graph_compute_thread`. // -// task_runner: default task runner, nullable wheen tensor.runner is not NULL. +// task_runner: default task runner, nullable when tensor.runner is not NULL. // Overridden by tensor.runner. // features: configure threading behaviour, optional. // threading additional features. see `ggml_threading_feature`, default 0. @@ -41,9 +42,18 @@ ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread, enum ggml_threading_features features, int64_t stages_time[3]); +// Suspend worker threads. +void ggml_threading_suspend(struct ggml_threading_context *ctx); + +// Resume worker threads. +void ggml_threading_resume(struct ggml_threading_context *ctx); + // Stop workers (if exist), free memories (including the ctx). 
void ggml_threading_stop(struct ggml_threading_context *ctx); +// Is all worker threads suspending? +bool ggml_threading_is_suspending(struct ggml_threading_context *ctx); + // The default implementation of `ggml_threading_thread_runner` ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data); diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c index e904fc10dcf67..f941f4dc3c372 100644 --- a/tests/test-ggml-threading.c +++ b/tests/test-ggml-threading.c @@ -41,8 +41,9 @@ static const int n_repeat = 10; // counter with array. static int work_done_arr[MAX_N_THREADS]; -static enum ggml_compute_error mock_task_runner(const struct ggml_compute_params *params, - struct ggml_tensor *node) { +static enum ggml_compute_error +mock_task_runner(const struct ggml_compute_params *params, + struct ggml_tensor *node) { int64_t loops = node->task_profile.dev_flags[1] * 1000 * 1000; if (node->task_profile.stages[params->type].parallel) { loops /= params->nth; @@ -59,7 +60,7 @@ static enum ggml_compute_error mock_task_runner(const struct ggml_compute_params return GGML_COMPUTE_OK; } -int test_driver(int id, struct ggml_tensor *node, int n_threads) { +static int test_driver(int id, struct ggml_tensor *node, int n_threads) { uint8_t loops = node->task_profile.dev_flags[1]; printf( "\n[test-ggml-threading] #%02d, workload: %2d million(s), n_threads: " @@ -81,8 +82,8 @@ int test_driver(int id, struct ggml_tensor *node, int n_threads) { node->task_profile.runner = mock_task_runner; - struct ggml_threading_context *ctx = - ggml_threading_start(n_threads, NULL, NULL, features, /*stages_time*/ NULL); + struct ggml_threading_context *ctx = ggml_threading_start( + n_threads, NULL, NULL, features, /*stages_time*/ NULL); int t1 = (int)ggml_time_us(); @@ -148,7 +149,7 @@ mock_task_runner_fallback(const struct ggml_compute_params *params, // By design, fallback should happen when attempt computing tensor in GPU, // thus it is not parallelled. -int test_fallback(struct ggml_tensor *node) { +static int test_fallback(struct ggml_tensor *node) { struct ggml_threading_context *ctx = ggml_threading_start( 1, NULL, mock_task_runner_fallback, /*features*/ GGML_THREADING_FEATURE_NONE, /*stages_time*/ NULL); @@ -182,7 +183,7 @@ customized_node_runner(const struct ggml_compute_params *params, } // Test when node->task_profile.runner is not NULL. -int test_customized_node_runner(struct ggml_tensor *node) { +static int test_customized_node_runner(struct ggml_tensor *node) { struct ggml_threading_context *ctx = ggml_threading_start( 1, NULL, mock_task_runner, /*features*/ GGML_THREADING_FEATURE_NONE, /*stages_time*/ NULL); @@ -204,6 +205,121 @@ int test_customized_node_runner(struct ggml_tensor *node) { return 0; } +static enum ggml_compute_error +lifecycle_runner(const struct ggml_compute_params *params, + struct ggml_tensor *node) { + UNUSED(params); + UNUSED(node); + return GGML_COMPUTE_OK; +} + +// Test thread lifecycle: start -> suspend -> resume -> stop +static int test_lifecycle(void) { + struct ggml_tensor node; + memset(&node, 0, sizeof(struct ggml_tensor)); + + struct ggml_task_stage *stages = node.task_profile.stages; + + stages[0].valid = true; + stages[1].valid = true; + stages[1].parallel = true; + + node.op = GGML_OP_MUL_MAT; + struct ggml_tensor src0 = { + .type = GGML_TYPE_Q4_0, + }; + struct ggml_tensor src1 = { + .type = GGML_TYPE_F32, + }; + node.src0 = &src0; + node.src1 = &src1; + + int t0 = (int)ggml_time_ms(); + // Suppose creating threading when entering session. 
+ + // We have to try affable threads. + struct ggml_threading_context *ctx = NULL; + int threads_arr[] = {4, 2}; + int threads_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]); + int n_threads = 1; + + for (int i = 0; i < threads_arr_len; i++) { + n_threads = threads_arr[i]; + int start_time = (int)ggml_time_ms(); + ctx = ggml_threading_start( + n_threads, NULL, lifecycle_runner, + /*features*/ GGML_THREADING_FEATURE_WAIT_ON_DONE | + GGML_THREADING_FEATURE_PERF, + /*stages_time*/ NULL); + int elapsed = (int)ggml_time_ms() - start_time; + if (elapsed > 5 * n_threads) { + printf("[test-ggml-threading] %s: it took %d ms to start %d worker " + "thread(s), skip\n", + __func__, elapsed, n_threads - 1); + ggml_threading_stop(ctx); + } else { + break; + } + } + + if (n_threads == 1) { + printf("[test-ggml-threading] %s: too slow to start at least 1 worker " + "thread(s), skip\n", + __func__); + return 0; + } + + // Suppose exiting from previous compute graph ... + printf("[test-ggml-threading] %s: %d workers, suspending ...\n", __func__, + n_threads - 1); + ggml_threading_suspend(ctx); + + // Suppose entering new compute graph ... + printf("[test-ggml-threading] test lifecycle: resuming ...\n"); + ggml_threading_resume(ctx); + + const int m = 2; + const int n = 50; + + printf("[test-ggml-threading] %s: computing %d tensors (half wait)...\n", + __func__, m * n); + + for (int i = 0; i < m; i++) { + stages[0].wait = (i == 0); + for (int j = 0; j < n; j++) { + ggml_threading_compute_tensor(ctx, &node, /*wdata*/ NULL, + /*wsize*/ 0); + } + } + + printf("[test-ggml-threading] %s: compute done, resuming...\n", __func__); + ggml_threading_resume(ctx); + + const int loops = 100; + printf("[test-ggml-threading] %s: try %d loops of suspend-resume ...\n", + __func__, loops); + + for (int i = 0; i < loops; i++) { + ggml_threading_suspend(ctx); + if (!ggml_threading_is_suspending(ctx)) { + abort(); + } + + ggml_threading_resume(ctx); + if (ggml_threading_is_suspending(ctx)) { + abort(); + } + } + + printf("[test-ggml-threading] %s: stopping ...\n", __func__); + ggml_threading_stop(ctx); + + int elapsed_ms = (int)ggml_time_ms() - t0; + printf("[test-ggml-threading] %s: elapsed %d ms\n", __func__, elapsed_ms); + + return 0; +} + int main(void) { ggml_time_init(); @@ -268,21 +384,21 @@ int main(void) { } // skip this n_threads when too slow. - int t0 = (int)ggml_time_us(); + int t0 = (int)ggml_time_ms(); struct ggml_threading_context *ctx = ggml_threading_start(n_threads, ggml_threading_graph_compute_thread, NULL, 0, /*stages_time*/ NULL); - int t1 = (int)ggml_time_us(); + int t1 = (int)ggml_time_ms(); ggml_threading_stop(ctx); - int elapsed_us = t1 - t0; - if (elapsed_us > 500 * n_threads) { + int elapsed_ms = t1 - t0; + if (elapsed_ms > 5 * n_threads) { printf("[test-ggml-threading] warning: it took took %7.3f " "ms to start %2d worker thread(s). Too slow, skip.\n", - 1.0 * elapsed_us / 1000, n_threads - 1); + 1.0 * elapsed_ms, n_threads - 1); threads_arr[i] = 0; ++n_slow; } else { @@ -430,6 +546,17 @@ int main(void) { } } + // lifecycle. + { + printf("[test-ggml-threading] test lifecycle ...\n"); + ++n_tests; + + if (test_lifecycle() == 0) { + ++n_passed; + printf("[test-ggml-threading] test lifecycle: ok\n\n"); + } + } + printf("[test-ggml-threading] %d/%d passed.\n", n_passed, n_tests); return (n_passed == n_tests) ? 
0 : 1; From 286c5b30143c7540d947811d37199afde50819a3 Mon Sep 17 00:00:00 2001 From: mqy Date: Sun, 18 Jun 2023 20:01:58 +0800 Subject: [PATCH 16/24] threadng: remove unnecessary spin lock/unlock from suspend/resume; add more tests --- ggml-threading.c | 22 ++++------------------ tests/test-ggml-threading.c | 25 +++++++++++++++---------- 2 files changed, 19 insertions(+), 28 deletions(-) diff --git a/ggml-threading.c b/ggml-threading.c index fb02b40469116..2a5cfa096dd4b 100644 --- a/ggml-threading.c +++ b/ggml-threading.c @@ -260,22 +260,17 @@ void ggml_threading_suspend(struct ggml_threading_context *ctx) { return; } - struct ggml_compute_state_shared *shared = &ctx->shared; - - ggml_spin_lock(&shared->spin); + PRINT_DEBUG("[main] wait_now will be set, expect %d workers wait\n", + n_worker_threads); ctx->shared.wait_now = true; - ggml_spin_unlock(&shared->spin); const int n_worker_threads = ctx->n_threads - 1; - while (ctx->shared.n_waiting != n_worker_threads) { ggml_spin_pause(); } - ggml_spin_lock(&shared->spin); - ctx->suspending = true; - ggml_spin_unlock(&shared->spin); PRINT_DEBUG("[main] saw %d workers waiting\n", n_worker_threads); + ctx->suspending = true; } // Wakeup all workers. @@ -291,7 +286,6 @@ void ggml_threading_resume(struct ggml_threading_context *ctx) { } struct ggml_compute_state_shared *shared = &ctx->shared; - ggml_spin_lock(&shared->spin); int64_t perf_cycles_0 = 0; int64_t perf_time_0 = 0; @@ -307,8 +301,6 @@ void ggml_threading_resume(struct ggml_threading_context *ctx) { shared->wait_now = false; while (shared->n_waiting != 0) { - ggml_spin_unlock(&shared->spin); - if (loop_counter > 0) { ggml_spin_pause(); if (loop_counter > 3) { @@ -326,8 +318,6 @@ void ggml_threading_resume(struct ggml_threading_context *ctx) { GGML_ASSERT(pthread_cond_broadcast(&shared->cond) == 0); GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0); last_signal_time = ggml_time_us(); - - ggml_spin_lock(&shared->spin); } ctx->suspending = false; @@ -335,9 +325,7 @@ void ggml_threading_resume(struct ggml_threading_context *ctx) { if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) { ggml_perf_collect(&shared->ctx->wakeup_perf, perf_cycles_0, perf_time_0); - } - - ggml_spin_unlock(&shared->spin); + }; } bool ggml_threading_is_suspending(struct ggml_threading_context *ctx) { @@ -385,8 +373,6 @@ static void ggml_threading_setup_workers(struct ggml_threading_context *ctx, } } else if (current->wait) { if (shared->n_waiting < n_worker_threads) { - PRINT_DEBUG("[main] wait_now will be set, expect %d workers wait\n", - n_worker_threads); ggml_spin_unlock(&ctx->shared.spin); ggml_threading_suspend(ctx); ggml_spin_lock(&ctx->shared.spin); diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c index f941f4dc3c372..cb2cca163f202 100644 --- a/tests/test-ggml-threading.c +++ b/tests/test-ggml-threading.c @@ -214,7 +214,7 @@ lifecycle_runner(const struct ggml_compute_params *params, } // Test thread lifecycle: start -> suspend -> resume -> stop -static int test_lifecycle(void) { +static int test_lifecycle(bool wait_on_done) { struct ggml_tensor node; memset(&node, 0, sizeof(struct ggml_tensor)); @@ -243,14 +243,15 @@ static int test_lifecycle(void) { int threads_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]); int n_threads = 1; + enum ggml_threading_features features = + wait_on_done ? 
GGML_THREADING_FEATURE_NONE + : GGML_THREADING_FEATURE_WAIT_ON_DONE; for (int i = 0; i < threads_arr_len; i++) { n_threads = threads_arr[i]; int start_time = (int)ggml_time_ms(); - ctx = ggml_threading_start( - n_threads, NULL, lifecycle_runner, - /*features*/ GGML_THREADING_FEATURE_WAIT_ON_DONE | - GGML_THREADING_FEATURE_PERF, - /*stages_time*/ NULL); + ctx = ggml_threading_start(n_threads, NULL, lifecycle_runner, + features | GGML_THREADING_FEATURE_PERF, + /*stages_time*/ NULL); int elapsed = (int)ggml_time_ms() - start_time; if (elapsed > 5 * n_threads) { printf("[test-ggml-threading] %s: it took %d ms to start %d worker " @@ -547,13 +548,17 @@ int main(void) { } // lifecycle. - { - printf("[test-ggml-threading] test lifecycle ...\n"); + for (int i = 0; i < 2; i++) { + bool wait_on_done = (i == 1); + printf("[test-ggml-threading] test lifecycle (want_on_done = %d) ...\n", + wait_on_done); ++n_tests; - if (test_lifecycle() == 0) { + if (test_lifecycle(wait_on_done) == 0) { ++n_passed; - printf("[test-ggml-threading] test lifecycle: ok\n\n"); + printf("[test-ggml-threading] test lifecycle (want_on_done = %d): " + "ok\n\n", + wait_on_done); } } From 98728632c67a4bd1ec3c791b1b92b125c904f8e4 Mon Sep 17 00:00:00 2001 From: mqy Date: Mon, 19 Jun 2023 01:04:32 +0800 Subject: [PATCH 17/24] threading test: less loops to avoid timeout --- tests/test-ggml-threading.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c index cb2cca163f202..d34be3ac6f2b5 100644 --- a/tests/test-ggml-threading.c +++ b/tests/test-ggml-threading.c @@ -280,7 +280,7 @@ static int test_lifecycle(bool wait_on_done) { ggml_threading_resume(ctx); const int m = 2; - const int n = 50; + const int n = 10; printf("[test-ggml-threading] %s: computing %d tensors (half wait)...\n", __func__, m * n); @@ -296,7 +296,7 @@ static int test_lifecycle(bool wait_on_done) { printf("[test-ggml-threading] %s: compute done, resuming...\n", __func__); ggml_threading_resume(ctx); - const int loops = 100; + const int loops = 10; printf("[test-ggml-threading] %s: try %d loops of suspend-resume ...\n", __func__, loops); From 6609c229e84e7cb749a5a3902f0123033c06c523 Mon Sep 17 00:00:00 2001 From: mqy Date: Mon, 19 Jun 2023 01:05:34 +0800 Subject: [PATCH 18/24] fixed OP_OUT_PROD and OP_NONE --- ggml.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/ggml.c b/ggml.c index 5a9e0b33e8cea..75a5624818a19 100644 --- a/ggml.c +++ b/ggml.c @@ -15655,8 +15655,7 @@ int ggml_get_task_profiles( p[0].stages[1].valid = true; p[0].stages[1].parallel = true; } break; - case GGML_OP_MUL_MAT: - case GGML_OP_OUT_PROD: { // FIXME: is this correct? 
+    case GGML_OP_MUL_MAT: {
         enum ggml_type src0_t = tensor->src0->type;
         if (src0_t == GGML_TYPE_F32) {
             p[0].stages[1].valid = true;
@@ -15673,6 +15672,15 @@ int ggml_get_task_profiles(
             GGML_ASSERT(false);
         }
     } break;
+    case GGML_OP_OUT_PROD: {
+        enum ggml_type src0_t = tensor->src0->type;
+        if (src0_t == GGML_TYPE_F32) {
+            p[0].stages[1].valid = true;
+            p[0].stages[1].parallel = true;
+        } else {
+            GGML_ASSERT(false);
+        }
+    } break;
     case GGML_OP_SCALE: {
         p[0].stages[1].valid = true;
         p[0].stages[1].parallel = true;
@@ -15810,13 +15818,12 @@ static void ggml_optimize_tensor_task_profile(
     struct ggml_tensor *tensor, struct ggml_task_profile *profiles,
     int n_profiles, struct ggml_mulmat_tune *tune) {
-    if (tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_OUT_PROD) {
+    if (tensor->op != GGML_OP_MUL_MAT) {
         return;
     }
 
     GGML_ASSERT(tensor);
-    GGML_ASSERT(tensor->op == GGML_OP_MUL_MAT ||
-                tensor->op == GGML_OP_OUT_PROD);
+    GGML_ASSERT(tensor->op == GGML_OP_MUL_MAT);
 
     GGML_ASSERT(tensor->task_profile.id == n_profiles);
     GGML_ASSERT(profiles);
@@ -15949,7 +15956,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * node = cgraph->nodes[i];
 
-        GGML_ASSERT (node->op != GGML_OP_NONE);
+        if (node->op == GGML_OP_NONE) {
+            continue;
+        }
 
         if (node->task_profile.id == 0) {
             ggml_set_tensor_task_profile(node, cgraph->tune);
@@ -16031,7 +16040,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                     {
                     } break;
                 case GGML_OP_MUL_MAT:
-                case GGML_OP_OUT_PROD: // FIXME: is this correct?
+                case GGML_OP_OUT_PROD:
                     {
                         size_t cur = 0;
                         GGML_ASSERT(node->src1->type == GGML_TYPE_F32);

From 65fd65e0c1d74d595a8ce14fca289444fc25e345 Mon Sep 17 00:00:00 2001
From: mqy
Date: Mon, 19 Jun 2023 13:50:35 +0800
Subject: [PATCH 19/24] tune: update readme

---
 examples/mulmat-tune/README.md | 77 ++++++++++++++++++++++------------
 1 file changed, 50 insertions(+), 27 deletions(-)

diff --git a/examples/mulmat-tune/README.md b/examples/mulmat-tune/README.md
index 4e521211d968e..4ba1a1d38adb7 100644
--- a/examples/mulmat-tune/README.md
+++ b/examples/mulmat-tune/README.md
@@ -19,21 +19,60 @@ run bench ahead of time (saving tens of seconds), but there are two
 shortcomings outdated format. So I integrated mulmat tune into `main` and
 `perplexity` as a complementary solution.
 
-## Build into main and perplexity
+The `load` mode tries to validate at least the following fields:
+- version
+- model
+- ftype
+- n_threads
+- n_profiles
+- profiles
+
+`n_threads` is critical to performance, so the best value should be selected.
+When running `main` or `perplexity`, `n_threads` is set automatically and the
+default value generally works well. Example:
+```
+system_info: n_threads = 4 / 12
+```
+This reads as: 4 of the 12 logical cores (6 physical cores) are used.
+
+## Build
+
+Compile options:
+- `LLAMA_TUNE` for CMake (default ON)
+- `LLAMA_NO_TUNE` for Make (default undefined)
+
+`GGML_USE_TUNE` and `GGML_TUNE_NDEBUG` are defined when llama tune is enabled.
+
+When `GGML_USE_TUNE` is defined, the mulmat_tune functionality is compiled into
+main and perplexity:
+- the CLI args `--tune` and `--tune-file` are available.
+- it tries to select the fastest mul_mat task profile according to the tune result.
+
+The standalone tool `mulmat-tune` is always built; it needs no compile options.
+
+**Makefile**
+
+To use tune, at least one of the following vendors has to be built:
+- BLAS (Accelerate, OpenBLAS, BLIS)
+- CLBlast
+- CUDA (may not run)
+
+To enable debug output, comment out `-DGGML_TUNE_NDEBUG` in the Makefile.
 
-Makefile:
 ```
 make clean && make
 ```
 
-CMake (with BLAS):
+**CMake**
+
 ```
-cmake --build . --target clean
-cmake .. -DLLAMA_BLAS=ON
+rm -rf build/*
+cd build
+cmake ..
 cmake --build . --config Release
 ```
 
-Run examples:
+## Run main or perplexity
 
 ```
 # bench and run:
@@ -48,21 +87,7 @@ Run examples:
 ./main -m ./models/3B/open-llama-3b-q4-0.bin -c 512 -b 1024 -n 256 --keep 48 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt -t 4 --tune-file
 ```
 
-# Build the standalone `mulmat-tune`
-
-Makefile:
-```
-make clean && make
-```
-
-CMake (with BLAS)
-```
-cmake --build . --target clean
-cmake .. -DLLAMA_BLAS=ON
-cmake --build . --config Release
-```
-
-Run examples:
+## Run mulmat-tune tool
 
 ```
 ./mulmat-tune -h
@@ -82,8 +107,8 @@ Run examples:
 # customized n_pass: run 1 pass only instead of the default 3.
 ./mulmat-tune --n_pass 1
 
-# customized n_threads instead of the default 1.
-./mulmat-tune --n_threads 4
+# customized n_threads instead of the default 4.
+./mulmat-tune --n_threads 6
 
 # save to file
 ./mulmat-tune --file
 
 ```
 
-# End to End Test
-
-## Compare With Master
+## Example: Compare With Master
 
 You may want to run the following commands. Make sure the tune result file is
 setup properly.
@@ -103,7 +126,7 @@ setup properly.
 
 General steps:
 
 1. run `./mulmat-tune -h` to see how to build for misc vendors.
-   To enable the debug, comment out `-DGGML_TUNE_NDEBUG` from makefile then run:
+   then run:
 
    ```
    make clean; make

From 44b831dc59f8d6e7b3bbfc4ccd9cc2a121684339 Mon Sep 17 00:00:00 2001
From: mqy
Date: Mon, 19 Jun 2023 13:54:20 +0800
Subject: [PATCH 20/24] tune: extract ggml_mulmat_tune_bench_wrapper

---
 ggml-tune.c | 45 ++++++++++++++++++++++++++++++++++++++++++++
 ggml-tune.h |  6 ++++++
 llama.cpp   | 54 +++--------------------------------------------------
 3 files changed, 54 insertions(+), 51 deletions(-)

diff --git a/ggml-tune.c b/ggml-tune.c
index 2e292e98e7bb6..36c44e1dc53d2 100644
--- a/ggml-tune.c
+++ b/ggml-tune.c
@@ -935,3 +935,48 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune,
 
     return true;
 }
+
+bool ggml_mulmat_tune_bench_wrapper(struct ggml_mulmat_tune *mulmat_tune,
+                                    struct ggml_mulmat_tune_params *params,
+                                    bool run_bench) {
+    printf("\n");
+    bool empty_fname = !params->fname || strcmp(params->fname, "") == 0;
+
+    if (!ggml_cpu_has_blas()) {
+        fprintf(stderr, "[tune] this program is not built with BLAS, abort.\n");
+        return false;
+    }
+
+    if (run_bench) {
+        return ggml_mulmat_tune_bench(mulmat_tune, params);
+    }
+
+    if (!empty_fname) {
+        FILE *fp = fopen(params->fname, "r");
+        if (!fp) {
+            fprintf(stderr, "[tune] failed to open file %s.\n", params->fname);
+            return false;
+        } else {
+            int rc = ggml_mulmat_tune_read_data(mulmat_tune, fp);
+            fclose(fp);
+
+            if (rc != 0) {
+                fprintf(stderr,
+                        "[tune] failed to read data from %s, error code: %d\n",
+                        params->fname, rc);
+                return false;
+            }
+
+            fprintf(stderr, "[tune] loaded data from %s\n", params->fname);
+
+            bool ok = ggml_mulmat_tune_validate(mulmat_tune, mulmat_tune->model,
+                                                params->model.ftype,
+                                                params->n_threads);
+            if (!ok) {
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
diff --git a/ggml-tune.h b/ggml-tune.h
index addcd34dbbd62..633f92697050a 100644
--- a/ggml-tune.h
+++ b/ggml-tune.h
@@ -132,6 +132,12 @@ void ggml_mulmat_tune_estimate_time(const
struct ggml_mulmat_tune_shape *shape, bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, struct ggml_mulmat_tune_params *params); +// This API is intended to be called by llama, etc. +// Three modes: bench and run; bench(save) then exit; load and run +bool ggml_mulmat_tune_bench_wrapper(struct ggml_mulmat_tune *mulmat_tune, + struct ggml_mulmat_tune_params *params, + bool run_bench); + #ifdef __cplusplus } #endif diff --git a/llama.cpp b/llama.cpp index e6bddffd5edaa..a3c3586e399f2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2748,8 +2748,6 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, const char *fname) { GGML_ASSERT(ctx->model.n_gpu_layers == 0); - printf("\n"); - const char *model_name = llama_model_type_name(ctx->model.type); llama_hparams *hparams = &ctx->model.hparams; @@ -2820,71 +2818,25 @@ bool llama_mulmat_tune(struct llama_context *ctx, int n_threads, bool tune, /* .m_num =*/8, /* .n_pass =*/1, /* .n_threads =*/n_threads, - /* .prrogress =*/true, + /* .progress =*/true, /* .output_console =*/false, /* .fname =*/fname, }; - bool empty_fname = !fname || strcmp(fname, "") == 0; - ctx->tune = new (struct ggml_mulmat_tune); if (!ctx->tune) { fprintf(stderr, "[tune] failed to allocate memory for tune\n"); return false; } - if (!ggml_cpu_has_blas()) { - fprintf(stderr, "[tune] this program is not built with BLAS, abort.\n"); - return false; - } - - if (tune) { - bool ok = ggml_mulmat_tune_bench(ctx->tune, ¶ms); - if (!ok) { - ggml_mulmat_tune_free(ctx->tune); - return false; - } - if (!empty_fname) { - ggml_mulmat_tune_free(ctx->tune); - return true; - } - } else if (empty_fname) { - return false; - } - - if (!empty_fname) { - FILE *fp = fopen(fname, "r"); - if (!fp) { - fprintf(stderr, "[tune] failed to open file %s.\n", fname); - return false; - } else { - int rc = ggml_mulmat_tune_read_data(ctx->tune, fp); - fclose(fp); - - if (rc != 0) { - fprintf(stderr, - "[tune] failed to read data from %s, error code: %d\n", - fname, rc); - return false; - } - - fprintf(stderr, "[tune] loaded data from %s\n", fname); - - bool ok = ggml_mulmat_tune_validate(ctx->tune, model_name, ggml_ftype, - params.n_threads); - if (!ok) { - return false; - } - } - } - - return true; + return ggml_mulmat_tune_bench_wrapper(ctx->tune, ¶ms, tune); } #endif void llama_free(struct llama_context * ctx) { #ifdef GGML_USE_TUNE if (ctx->tune) { + ggml_mulmat_tune_free(ctx->tune); delete(ctx->tune); } #endif From 4d32b4088e10ad37733cda7979706a3b8ffa1cb2 Mon Sep 17 00:00:00 2001 From: mqy Date: Mon, 19 Jun 2023 14:05:30 +0800 Subject: [PATCH 21/24] threading test: decrease a threshold value to avoid timeout --- tests/test-ggml-threading.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c index d34be3ac6f2b5..30dddbeabf288 100644 --- a/tests/test-ggml-threading.c +++ b/tests/test-ggml-threading.c @@ -253,7 +253,7 @@ static int test_lifecycle(bool wait_on_done) { features | GGML_THREADING_FEATURE_PERF, /*stages_time*/ NULL); int elapsed = (int)ggml_time_ms() - start_time; - if (elapsed > 5 * n_threads) { + if (elapsed > 1 * n_threads) { printf("[test-ggml-threading] %s: it took %d ms to start %d worker " "thread(s), skip\n", __func__, elapsed, n_threads - 1); @@ -396,7 +396,7 @@ int main(void) { ggml_threading_stop(ctx); int elapsed_ms = t1 - t0; - if (elapsed_ms > 5 * n_threads) { + if (elapsed_ms > 1 * n_threads) { printf("[test-ggml-threading] warning: it took took %7.3f " "ms to start %2d worker 
thread(s). Too slow, skip.\n", 1.0 * elapsed_ms, n_threads - 1); From cc8a375bc411c153ec0771faf852316aa5e42f83 Mon Sep 17 00:00:00 2001 From: mqy Date: Mon, 19 Jun 2023 16:17:48 +0800 Subject: [PATCH 22/24] threading: fix deadlock by reverting part of changes from commit 286c5b30 --- ggml-threading.c | 16 ++++++++++++---- tests/test-ggml-threading.c | 4 ++-- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/ggml-threading.c b/ggml-threading.c index 2a5cfa096dd4b..4c1bc0714fede 100644 --- a/ggml-threading.c +++ b/ggml-threading.c @@ -199,8 +199,6 @@ struct ggml_threading_context { int64_t *stages_time; }; -// NOTE: ggml_spin_lock and ggml_spin_unlock may can be noop if -// feature wait_on_done is off. static inline void ggml_spin_lock(volatile atomic_flag *obj) { while (atomic_flag_test_and_set(obj)) { ggml_spin_pause(); @@ -262,7 +260,10 @@ void ggml_threading_suspend(struct ggml_threading_context *ctx) { PRINT_DEBUG("[main] wait_now will be set, expect %d workers wait\n", n_worker_threads); + + ggml_spin_lock(&ctx->shared.spin); ctx->shared.wait_now = true; + ggml_spin_unlock(&ctx->shared.spin); const int n_worker_threads = ctx->n_threads - 1; while (ctx->shared.n_waiting != n_worker_threads) { @@ -270,7 +271,9 @@ void ggml_threading_suspend(struct ggml_threading_context *ctx) { } PRINT_DEBUG("[main] saw %d workers waiting\n", n_worker_threads); + ggml_spin_lock(&ctx->shared.spin); ctx->suspending = true; + ggml_spin_unlock(&ctx->shared.spin); } // Wakeup all workers. @@ -278,8 +281,6 @@ void ggml_threading_suspend(struct ggml_threading_context *ctx) { // Workers takes some time to wakeup, and has to lock spin after wakeup. Yield // is used to avoid signal frequently. Current implementation is highly // experimental. See tests/test-ggml-threading.c for details. -// -// NOTE: must be protected by shared->spin void ggml_threading_resume(struct ggml_threading_context *ctx) { if (ctx->n_threads == 1) { return; @@ -298,9 +299,12 @@ void ggml_threading_resume(struct ggml_threading_context *ctx) { int loop_counter = 0; int64_t last_signal_time = 0; + ggml_spin_lock(&shared->spin); shared->wait_now = false; while (shared->n_waiting != 0) { + ggml_spin_unlock(&shared->spin); + if (loop_counter > 0) { ggml_spin_pause(); if (loop_counter > 3) { @@ -318,6 +322,8 @@ void ggml_threading_resume(struct ggml_threading_context *ctx) { GGML_ASSERT(pthread_cond_broadcast(&shared->cond) == 0); GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0); last_signal_time = ggml_time_us(); + + ggml_spin_lock(&shared->spin); } ctx->suspending = false; @@ -326,6 +332,8 @@ void ggml_threading_resume(struct ggml_threading_context *ctx) { ggml_perf_collect(&shared->ctx->wakeup_perf, perf_cycles_0, perf_time_0); }; + + ggml_spin_unlock(&shared->spin); } bool ggml_threading_is_suspending(struct ggml_threading_context *ctx) { diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c index 30dddbeabf288..886c5ee672794 100644 --- a/tests/test-ggml-threading.c +++ b/tests/test-ggml-threading.c @@ -550,13 +550,13 @@ int main(void) { // lifecycle. 
for (int i = 0; i < 2; i++) { bool wait_on_done = (i == 1); - printf("[test-ggml-threading] test lifecycle (want_on_done = %d) ...\n", + printf("[test-ggml-threading] test lifecycle (wait_on_done = %d) ...\n", wait_on_done); ++n_tests; if (test_lifecycle(wait_on_done) == 0) { ++n_passed; - printf("[test-ggml-threading] test lifecycle (want_on_done = %d): " + printf("[test-ggml-threading] test lifecycle (wait_on_done = %d): " "ok\n\n", wait_on_done); } From aac7f7cc0407a6eb830079377a34e885d0a0e1e2 Mon Sep 17 00:00:00 2001 From: mqy Date: Mon, 19 Jun 2023 18:15:32 +0800 Subject: [PATCH 23/24] threading: try to fix a deadlock, also added critical deadlock detection --- ggml-threading.c | 55 ++++++++++++++++++++++--------------- tests/test-ggml-threading.c | 4 +-- 2 files changed, 35 insertions(+), 24 deletions(-) diff --git a/ggml-threading.c b/ggml-threading.c index 4c1bc0714fede..4a9cf622fafb0 100644 --- a/ggml-threading.c +++ b/ggml-threading.c @@ -261,19 +261,22 @@ void ggml_threading_suspend(struct ggml_threading_context *ctx) { PRINT_DEBUG("[main] wait_now will be set, expect %d workers wait\n", n_worker_threads); - ggml_spin_lock(&ctx->shared.spin); - ctx->shared.wait_now = true; + struct ggml_compute_state_shared *shared = &ctx->shared; + + ggml_spin_lock(&shared->spin); + shared->wait_now = true; ggml_spin_unlock(&ctx->shared.spin); const int n_worker_threads = ctx->n_threads - 1; - while (ctx->shared.n_waiting != n_worker_threads) { + while (shared->n_waiting != n_worker_threads) { ggml_spin_pause(); } PRINT_DEBUG("[main] saw %d workers waiting\n", n_worker_threads); - ggml_spin_lock(&ctx->shared.spin); + + ggml_spin_lock(&shared->spin); ctx->suspending = true; - ggml_spin_unlock(&ctx->shared.spin); + ggml_spin_unlock(&shared->spin); } // Wakeup all workers. @@ -296,44 +299,52 @@ void ggml_threading_resume(struct ggml_threading_context *ctx) { perf_time_0 = ggml_time_us(); } - int loop_counter = 0; - int64_t last_signal_time = 0; + // Dead lock detection. + int counter = 0; + int64_t last_notify_ms = 0; + const int max_notify_count = ctx->n_threads - 1; + const int max_duration_ms = 100 * max_notify_count; ggml_spin_lock(&shared->spin); shared->wait_now = false; while (shared->n_waiting != 0) { - ggml_spin_unlock(&shared->spin); - if (loop_counter > 0) { - ggml_spin_pause(); - if (loop_counter > 3) { - sched_yield(); - } + GGML_ASSERT(pthread_mutex_lock(&shared->mutex) == 0); + if (shared->n_waiting == 0) { + GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0); + ggml_spin_unlock(&shared->spin); + break; } - ++loop_counter; - // TODO: should bench actual average wait/wakeup time. - if (last_signal_time > 0 && (ggml_time_us() - last_signal_time) < 10) { - continue; - } + ggml_spin_unlock(&shared->spin); - GGML_ASSERT(pthread_mutex_lock(&shared->mutex) == 0); GGML_ASSERT(pthread_cond_broadcast(&shared->cond) == 0); GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0); - last_signal_time = ggml_time_us(); + last_notify_ms = ggml_time_ms(); + + sched_yield(); + + int elapsed = last_notify_ms > 0 ? 
ggml_time_ms() - last_notify_ms : 0; + + if ((counter > max_notify_count) || elapsed > max_duration_ms) { + fprintf(stderr, + "[ggml-threading] potential dead lock detected, notified " + "for %d times, elapsed time: %d ms, abort\n", + counter, elapsed); + abort(); + } ggml_spin_lock(&shared->spin); } ctx->suspending = false; + ggml_spin_unlock(&shared->spin); if (shared->ctx->features & GGML_THREADING_FEATURE_PERF) { ggml_perf_collect(&shared->ctx->wakeup_perf, perf_cycles_0, perf_time_0); }; - - ggml_spin_unlock(&shared->spin); } bool ggml_threading_is_suspending(struct ggml_threading_context *ctx) { diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c index 886c5ee672794..8fc705a6bd197 100644 --- a/tests/test-ggml-threading.c +++ b/tests/test-ggml-threading.c @@ -33,7 +33,7 @@ #define UNUSED(x) (void)(x) -#define MAX_N_THREADS 16 +#define MAX_N_THREADS 64 static const int n_repeat = 10; @@ -353,7 +353,7 @@ int main(void) { // average time, thus greatly punishes those small workloads. // - wait_on_done is general faster than wait_now, can be 10x faster. - int threads_arr[] = {1, 2, 4, 6, 8, 16}; + int threads_arr[] = {1, 2, 4, 6, 8, 16, 32, 64}; int threads_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]); // millions of loops. From 08972d2aee2021461b589b8e71bb29275bff7436 Mon Sep 17 00:00:00 2001 From: mqy Date: Mon, 19 Jun 2023 19:15:00 +0800 Subject: [PATCH 24/24] threading: removed feature wait_on_done to figure out causes of deadlock in windows AVX --- ggml-threading.c | 56 +++++++++++++------------------------ ggml-threading.h | 3 +- ggml-tune.c | 2 +- ggml.c | 4 +-- tests/test-ggml-threading.c | 55 ++++++++++-------------------------- 5 files changed, 38 insertions(+), 82 deletions(-) diff --git a/ggml-threading.c b/ggml-threading.c index 4a9cf622fafb0..1468318599ac9 100644 --- a/ggml-threading.c +++ b/ggml-threading.c @@ -167,7 +167,6 @@ struct ggml_compute_state_shared { // commands. atomic_bool wait_now; - atomic_bool wait_on_done; atomic_bool stop; // Default task runner, can be overriden by node.task_profile.runner. @@ -263,11 +262,12 @@ void ggml_threading_suspend(struct ggml_threading_context *ctx) { struct ggml_compute_state_shared *shared = &ctx->shared; + const int n_worker_threads = ctx->n_threads - 1; + ggml_spin_lock(&shared->spin); shared->wait_now = true; ggml_spin_unlock(&ctx->shared.spin); - const int n_worker_threads = ctx->n_threads - 1; while (shared->n_waiting != n_worker_threads) { ggml_spin_pause(); } @@ -281,8 +281,8 @@ void ggml_threading_suspend(struct ggml_threading_context *ctx) { // Wakeup all workers. // -// Workers takes some time to wakeup, and has to lock spin after wakeup. Yield -// is used to avoid signal frequently. Current implementation is highly +// Workers takes some time to wakeup. +// Yield is used to avoid notify frequently. Current implementation is highly // experimental. See tests/test-ggml-threading.c for details. void ggml_threading_resume(struct ggml_threading_context *ctx) { if (ctx->n_threads == 1) { @@ -302,14 +302,20 @@ void ggml_threading_resume(struct ggml_threading_context *ctx) { // Dead lock detection. 
int counter = 0; int64_t last_notify_ms = 0; - const int max_notify_count = ctx->n_threads - 1; - const int max_duration_ms = 100 * max_notify_count; + const int max_notify_count = 50; + const int max_duration_ms = 1000; + + if (shared->n_waiting != 0 && shared->n_waiting != ctx->n_threads - 1) { + fprintf(stderr, + "[ggml-threading] expected n_waiting is 0 or %d, actual %d, abort\n", + ctx->n_threads - 1, shared->n_waiting); + abort(); + } ggml_spin_lock(&shared->spin); shared->wait_now = false; while (shared->n_waiting != 0) { - GGML_ASSERT(pthread_mutex_lock(&shared->mutex) == 0); if (shared->n_waiting == 0) { GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0); @@ -323,7 +329,11 @@ void ggml_threading_resume(struct ggml_threading_context *ctx) { GGML_ASSERT(pthread_mutex_unlock(&shared->mutex) == 0); last_notify_ms = ggml_time_ms(); - sched_yield(); + if (counter % 1 == 0) { + ggml_spin_pause(); + } else { + sched_yield(); + } int elapsed = last_notify_ms > 0 ? ggml_time_ms() - last_notify_ms : 0; @@ -372,24 +382,6 @@ static void ggml_threading_setup_workers(struct ggml_threading_context *ctx, ggml_threading_resume(ctx); ggml_spin_lock(&shared->spin); } - - if ((ctx->features & GGML_THREADING_FEATURE_WAIT_ON_DONE) > 0) { - // Optimize energy: wait_on_done. We MAY also check following nodes, - // but that's a bit complicated. - shared->wait_on_done = false; - for (int i = type + 1; i <= GGML_TASK_FINALIZE; i++) { - struct ggml_task_stage *next = &profile->stages[i]; - if (next->parallel) { - break; - } - if (next->wait) { - shared->wait_on_done = true; - PRINT_DEBUG("[main] wait_on_done is enabled for " - "current task stage\n"); - break; - } - } - } } else if (current->wait) { if (shared->n_waiting < n_worker_threads) { ggml_spin_unlock(&ctx->shared.spin); @@ -437,17 +429,7 @@ ggml_thread_ret_t ggml_threading_graph_compute_thread(void *data) { state->has_work = false; shared->n_tasks--; - bool wait = shared->wait_on_done && !state->has_work; - if (wait) { - ggml_threading_cond_wait(state); - } - ggml_spin_unlock(&shared->spin); - - // no need to pause. - if (wait) { - continue; - } } ggml_spin_pause(); @@ -594,11 +576,11 @@ ggml_threading_start(int n_threads, ggml_threading_thread_runner *thread_runner, .n_tasks = 0, .n_waiting = 0, .wait_now = false, - .wait_on_done = false, .stop = false, .task_runner = task_runner, .ctx = ctx, }; + atomic_flag_clear(&ctx->shared.spin); PRINT_DEBUG("[main] thread start, features: %d\n", features); diff --git a/ggml-threading.h b/ggml-threading.h index 012c9cd504308..f226b00196ff6 100644 --- a/ggml-threading.h +++ b/ggml-threading.h @@ -17,8 +17,7 @@ struct ggml_threading_context; // Optional (experimental) features. enum ggml_threading_features { GGML_THREADING_FEATURE_NONE = 0, - GGML_THREADING_FEATURE_WAIT_ON_DONE = 1 << 0, - GGML_THREADING_FEATURE_PERF = 1 << 1, + GGML_THREADING_FEATURE_PERF = 1, }; // The thread runner to feed into OS threads. 
diff --git a/ggml-tune.c b/ggml-tune.c index 36c44e1dc53d2..7aa9c217c7b29 100644 --- a/ggml-tune.c +++ b/ggml-tune.c @@ -802,7 +802,7 @@ bool ggml_mulmat_tune_bench(struct ggml_mulmat_tune *tune, struct ggml_threading_context *thrd_ctx = ggml_threading_start(tune->n_threads, NULL, NULL, - GGML_THREADING_FEATURE_WAIT_ON_DONE, stages_time); + GGML_THREADING_FEATURE_NONE, stages_time); for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { const struct ggml_mulmat_tune_shape *shape = &tune->shapes[i_shape]; diff --git a/ggml.c b/ggml.c index 75a5624818a19..614212b677226 100644 --- a/ggml.c +++ b/ggml.c @@ -16203,14 +16203,14 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } // ~ 50 us - //printf("=== prepare computing took %d us\n", (int)(ggml_time_us() - t0)); + // printf("=== prepare computing took %d us\n", (int)(ggml_time_us() - t0)); } const int64_t perf_start_cycles = ggml_perf_cycles(); const int64_t perf_start_time_us = ggml_perf_time_us(); struct ggml_threading_context *thrd_ctx = ggml_threading_start(n_threads, - NULL, ggml_compute_forward, GGML_THREADING_FEATURE_WAIT_ON_DONE, NULL); + NULL, ggml_compute_forward, GGML_THREADING_FEATURE_NONE, NULL); for (int i = 0; i < cgraph->n_nodes; i++) { GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes); diff --git a/tests/test-ggml-threading.c b/tests/test-ggml-threading.c index 8fc705a6bd197..cfb5b7a8e7223 100644 --- a/tests/test-ggml-threading.c +++ b/tests/test-ggml-threading.c @@ -71,12 +71,7 @@ static int test_driver(int id, struct ggml_tensor *node, int n_threads) { work_done_arr[i] = 0; } - bool wait_on_done = (node->task_profile.dev_flags[0] > 0u); - enum ggml_threading_features features = GGML_THREADING_FEATURE_PERF; - if (wait_on_done) { - features |= GGML_THREADING_FEATURE_WAIT_ON_DONE; - } int t0 = (int)ggml_time_us(); @@ -118,9 +113,9 @@ static int test_driver(int id, struct ggml_tensor *node, int n_threads) { } printf("\tstage-0: parallel: %d, wait: %d\n\tstage-1: parallel: %d, wait: " - "%d, wait_on_done: %d %s\n", + "%d\n", stages[0].parallel, stages[0].wait, stages[1].parallel, - stages[1].wait, wait_on_done, stages[1].wait ? "<--------" : ""); + stages[1].wait); if (actual == expect) { printf("\tthreading: init %6.3f ms, compute %6.3f ms, cleanup %6.3f " @@ -214,7 +209,7 @@ lifecycle_runner(const struct ggml_compute_params *params, } // Test thread lifecycle: start -> suspend -> resume -> stop -static int test_lifecycle(bool wait_on_done) { +static int test_lifecycle() { struct ggml_tensor node; memset(&node, 0, sizeof(struct ggml_tensor)); @@ -243,9 +238,7 @@ static int test_lifecycle(bool wait_on_done) { int threads_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]); int n_threads = 1; - enum ggml_threading_features features = - wait_on_done ? GGML_THREADING_FEATURE_NONE - : GGML_THREADING_FEATURE_WAIT_ON_DONE; + enum ggml_threading_features features = GGML_THREADING_FEATURE_NONE; for (int i = 0; i < threads_arr_len; i++) { n_threads = threads_arr[i]; int start_time = (int)ggml_time_ms(); @@ -351,7 +344,6 @@ int main(void) { // physical cores: // - the wait/wakeup time varies much: can be up to tens or hundreds of the // average time, thus greatly punishes those small workloads. - // - wait_on_done is general faster than wait_now, can be 10x faster. 
int threads_arr[] = {1, 2, 4, 6, 8, 16, 32, 64}; int threads_arr_len = sizeof(threads_arr) / sizeof(threads_arr[0]); @@ -408,7 +400,7 @@ int main(void) { } } - // node.task_profile.dev_flags: byte 0 for wait_on_done, byte 1 for loops. + // node.task_profile.dev_flags: byte 0 unused, byte 1 for loops. for (int x = 0; x < workload_arr_len; x++) { node.task_profile.dev_flags[1] = workload_arr[x]; @@ -423,7 +415,7 @@ int main(void) { "n_threads: %2d ====\n", workload_arr[x], n_threads); - // multi-threads: parallel + wait_now/wait_on_done + // multi-threads: parallel + wait if (n_threads == 1) { stages[0].parallel = false; @@ -489,22 +481,11 @@ int main(void) { abort(); } - { // disable wait_on_done - node.task_profile.dev_flags[0] = 0u; // wait now. - - n_tests++; - if (test_driver(n_tests, &node, n_threads) == 0) { - n_passed++; - } - } - - { // enable wait_on_done - node.task_profile.dev_flags[0] = 1u; // wait on done + node.task_profile.dev_flags[0] = 0u; // wait now. - n_tests++; - if (test_driver(n_tests, &node, n_threads) == 0) { - n_passed++; - } + n_tests++; + if (test_driver(n_tests, &node, n_threads) == 0) { + n_passed++; } } } @@ -548,18 +529,12 @@ int main(void) { } // lifecycle. - for (int i = 0; i < 2; i++) { - bool wait_on_done = (i == 1); - printf("[test-ggml-threading] test lifecycle (wait_on_done = %d) ...\n", - wait_on_done); - ++n_tests; + printf("[test-ggml-threading] test lifecycle ...\n"); + ++n_tests; - if (test_lifecycle(wait_on_done) == 0) { - ++n_passed; - printf("[test-ggml-threading] test lifecycle (wait_on_done = %d): " - "ok\n\n", - wait_on_done); - } + if (test_lifecycle() == 0) { + ++n_passed; + printf("[test-ggml-threading] test lifecycle: ok\n\n"); } printf("[test-ggml-threading] %d/%d passed.\n", n_passed, n_tests);
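For reference, a minimal sketch of how the start/suspend/resume/stop lifecycle exercised by these tests is meant to be driven from caller code; it assumes that passing NULL for both the thread runner and the task runner selects the built-in defaults, as ggml-tune.c does, and that `ggml_time_init()` has already been called:

```c
#include <stdio.h>

#include "ggml.h"
#include "ggml-threading.h"

// Lifecycle sketch (assumption: NULL runners select the defaults, as in
// ggml-tune.c). Mirrors test_lifecycle in tests/test-ggml-threading.c.
static void threading_lifecycle_sketch(int n_threads) {
    struct ggml_threading_context *ctx = ggml_threading_start(
        n_threads, /*thread_runner*/ NULL, /*task_runner*/ NULL,
        GGML_THREADING_FEATURE_NONE, /*stages_time*/ NULL);

    // Park the workers between compute graphs.
    ggml_threading_suspend(ctx);
    if (!ggml_threading_is_suspending(ctx)) {
        fprintf(stderr, "workers did not suspend\n");
    }

    // Wake the workers before the next graph; tensors would then be fed
    // through ggml_threading_compute_tensor().
    ggml_threading_resume(ctx);

    ggml_threading_stop(ctx);
}
```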