From 7dd261f3e965348cb631c625fd9e29dd8c48c1f6 Mon Sep 17 00:00:00 2001 From: Djip007 <3705339+Djip007@users.noreply.github.com> Date: Sat, 16 Nov 2024 20:58:30 +0100 Subject: [PATCH 1/3] extract llamafile in new tinyblas backend --- Makefile | 21 +- ggml/include/ggml-cpu.h | 1 - ggml/include/ggml-tinyblas.h | 17 + ggml/src/ggml-backend-reg.cpp | 7 + ggml/src/ggml-cpu/ggml-cpu.c | 8 - ggml/src/ggml-cpu/ggml-cpu.cpp | 3 - ggml/src/ggml-cpu/llamafile/sgemm.h | 14 - ggml/src/ggml-tinyblas/CMakeLists.txt | 230 +++++++ ggml/src/ggml-tinyblas/ggml-tinyblas.cpp | 472 ++++++++++++++ .../llamafile => ggml-tinyblas}/sgemm.cpp | 604 +++++++++++------- ggml/src/ggml-tinyblas/sgemm.h | 51 ++ src/llama.cpp | 1 - 12 files changed, 1183 insertions(+), 246 deletions(-) create mode 100644 ggml/include/ggml-tinyblas.h delete mode 100644 ggml/src/ggml-cpu/llamafile/sgemm.h create mode 100644 ggml/src/ggml-tinyblas/CMakeLists.txt create mode 100644 ggml/src/ggml-tinyblas/ggml-tinyblas.cpp rename ggml/src/{ggml-cpu/llamafile => ggml-tinyblas}/sgemm.cpp (80%) create mode 100644 ggml/src/ggml-tinyblas/sgemm.h diff --git a/Makefile b/Makefile index 539370e0639f4..fa94c3bf39072 100644 --- a/Makefile +++ b/Makefile @@ -568,8 +568,8 @@ ifdef GGML_NVPL endif # GGML_NVPL ifndef GGML_NO_LLAMAFILE - MK_CPPFLAGS += -DGGML_USE_LLAMAFILE - OBJ_GGML_EXT += ggml/src/ggml-cpu/llamafile/sgemm.o + MK_CPPFLAGS += -DGGML_USE_TINYBLAS + OBJ_GGML_EXT += ggml/src/ggml-tinyblas/ggml-tinyblas.o ggml/src/ggml-tinyblas/sgemm.o endif ifndef GGML_NO_AMX @@ -1153,6 +1153,23 @@ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \ ggml/src/ggml-impl.h $(CXX) $(CXXFLAGS) -c $< -o $@ +# TODO: renomer en GGML_NO_TINYBLAS +# needed for c++17 build +ifndef GGML_NO_LLAMAFILE +ggml/src/ggml-tinyblas/ggml-tinyblas.o: \ + ggml/src/ggml-tinyblas/ggml-tinyblas.cpp \ + ggml/include/ggml-tinyblas.h \ + ggml/src/ggml-tinyblas/sgemm.h \ + ggml/include/ggml.h + $(CXX) $(CXXFLAGS) -std=c++17 -c $< -o $@ + +ggml/src/ggml-tinyblas/sgemm.o: \ + ggml/src/ggml-tinyblas/sgemm.cpp \ + ggml/src/ggml-tinyblas/sgemm.h \ + ggml/include/ggml.h + $(CXX) $(CXXFLAGS) -std=c++17 -c $< -o $@ +endif # GGML_NO_LLAMAFILE + # Rules for building object files $(DIR_GGML)/%.o: $(DIR_GGML)/%.c $(CC) $(CFLAGS) -MMD -c $< -o $@ diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index 7571ef9798364..49a18ba37cb98 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -124,7 +124,6 @@ extern "C" { GGML_BACKEND_API int ggml_cpu_has_riscv_v (void); GGML_BACKEND_API int ggml_cpu_has_vsx (void); GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void); - GGML_BACKEND_API int ggml_cpu_has_llamafile (void); // Internal types and functions exposed for tests and benchmarks diff --git a/ggml/include/ggml-tinyblas.h b/ggml/include/ggml-tinyblas.h new file mode 100644 index 0000000000000..4c0075327003e --- /dev/null +++ b/ggml/include/ggml-tinyblas.h @@ -0,0 +1,17 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +// backend register +GGML_API ggml_backend_reg_t ggml_backend_tinyblas_reg(void); + + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 63e9d82017457..78bcb6c5ca5d9 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -27,6 +27,10 @@ #include "ggml-blas.h" #endif +#ifdef GGML_USE_TINYBLAS +#include "ggml-tinyblas.h" +#endif + #ifdef GGML_USE_RPC #include "ggml-rpc.h" #endif @@ -66,6 +70,9 @@ struct ggml_backend_registry { 
#ifdef GGML_USE_BLAS register_backend(ggml_backend_blas_reg()); #endif +#ifdef GGML_USE_TINYBLAS + register_backend(ggml_backend_tinyblas_reg()); +#endif #ifdef GGML_USE_RPC register_backend(ggml_backend_rpc_reg()); #endif diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 61f53cd0145a6..7f5c465df068c 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -13868,14 +13868,6 @@ int ggml_cpu_has_wasm_simd(void) { #endif } -int ggml_cpu_has_llamafile(void) { -#if defined(GGML_USE_LLAMAFILE) - return 1; -#else - return 0; -#endif -} - int ggml_cpu_has_sse3(void) { #if defined(__SSE3__) return 1; diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 573b7c5b9b375..a131f5e28a65a 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -616,9 +616,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_wasm_simd()) { features.push_back({ "WASM_SIMD", "1" }); } - if (ggml_cpu_has_llamafile()) { - features.push_back({ "LLAMAFILE", "1" }); - } features.push_back({ nullptr, nullptr }); diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.h b/ggml/src/ggml-cpu/llamafile/sgemm.h deleted file mode 100644 index caf6dd5567b3a..0000000000000 --- a/ggml/src/ggml-cpu/llamafile/sgemm.h +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once -#include -#include -#ifdef __cplusplus -extern "C" { -#endif - -bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t, - const void *, int64_t, void *, int64_t, int, int, - int, int, int); - -#ifdef __cplusplus -} -#endif diff --git a/ggml/src/ggml-tinyblas/CMakeLists.txt b/ggml/src/ggml-tinyblas/CMakeLists.txt new file mode 100644 index 0000000000000..c8c4fd04e0f29 --- /dev/null +++ b/ggml/src/ggml-tinyblas/CMakeLists.txt @@ -0,0 +1,230 @@ +add_library(ggml-tinyblas + ggml-tinyblas.cpp + ) + +target_link_libraries(ggml-tinyblas PRIVATE ggml-base) +target_include_directories(ggml-tinyblas PRIVATE . ..) 
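+# NOTE: the Accelerate/OpenMP options and the per-architecture flag detection below
+#       mirror ggml/src/ggml-cpu/CMakeLists.txt; keep the two in sync.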
+
+if (APPLE AND GGML_ACCELERATE)
+    find_library(ACCELERATE_FRAMEWORK Accelerate)
+    if (ACCELERATE_FRAMEWORK)
+        message(STATUS "Accelerate framework found")
+
+        add_compile_definitions(GGML_USE_ACCELERATE)
+        add_compile_definitions(ACCELERATE_NEW_LAPACK)
+        add_compile_definitions(ACCELERATE_LAPACK_ILP64)
+
+        target_link_libraries(ggml-tinyblas PRIVATE ${ACCELERATE_FRAMEWORK})
+    else()
+        message(WARNING "Accelerate framework not found")
+    endif()
+endif()
+
+if (GGML_OPENMP)
+    find_package(OpenMP)
+    if (OpenMP_FOUND)
+        message(STATUS "OpenMP found")
+
+        add_compile_definitions(GGML_USE_OPENMP)
+
+        target_link_libraries(ggml-tinyblas PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+
+    else()
+        message(WARNING "OpenMP not found")
+    endif()
+endif()
+
+target_sources(ggml-tinyblas PRIVATE
+    sgemm.cpp
+    sgemm.h)
+
+if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
+    CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
+    (NOT CMAKE_OSX_ARCHITECTURES AND
+     NOT CMAKE_GENERATOR_PLATFORM_LWR AND
+     CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
+
+    message(STATUS "ARM detected")
+
+    if (MSVC)
+        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
+        add_compile_definitions(__ARM_NEON)
+        add_compile_definitions(__ARM_FEATURE_FMA)
+
+        set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
+        string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
+
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
+        if (GGML_COMPILER_SUPPORT_DOTPROD)
+            add_compile_definitions(__ARM_FEATURE_DOTPROD)
+        endif ()
+
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
+
+        if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
+            add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
+        endif ()
+
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
+        if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
+            add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+        endif ()
+
+        set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
+    else()
+        check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
+        if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
+            list(APPEND ARCH_FLAGS -mfp16-format=ieee)
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
+            # Raspberry Pi 1, Zero
+            list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
+            if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
+                # Android armeabi-v7a
+                list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
+            else()
+                # Raspberry Pi 2
+                list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
+            endif()
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
+            # Android arm64-v8a
+            # Raspberry Pi 3, 4, Zero 2 (32-bit)
+            list(APPEND ARCH_FLAGS -mno-unaligned-access)
+        endif()
+        if (GGML_SVE)
+            list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
+        endif()
+    endif()
+elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
+        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
+        CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
+    message(STATUS "x86 detected")
+    if (MSVC)
+        # instruction set detection for MSVC only
+        if (GGML_NATIVE)
+            # TODO: improve, should not reference files from the parent folder
+            include(../ggml-cpu/cmake/FindSIMD.cmake)
+        endif ()
+        if (GGML_AVX512)
+            list(APPEND ARCH_FLAGS /arch:AVX512)
+            # MSVC has no compile-time flags enabling specific
+            # AVX512 extensions, neither it defines the
+            # macros corresponding to the extensions.
+            # Do it manually.
+            if (GGML_AVX512_VBMI)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
+                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                    list(APPEND ARCH_FLAGS -mavx512vbmi)
+                endif()
+            endif()
+            if (GGML_AVX512_VNNI)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
+                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                    list(APPEND ARCH_FLAGS -mavx512vnni)
+                endif()
+            endif()
+            if (GGML_AVX512_BF16)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
+                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                    list(APPEND ARCH_FLAGS -mavx512bf16)
+                endif()
+            endif()
+            if (GGML_AMX_TILE)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
+            endif()
+            if (GGML_AMX_INT8)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
+            endif()
+            if (GGML_AMX_BF16)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
+            endif()
+        elseif (GGML_AVX2)
+            list(APPEND ARCH_FLAGS /arch:AVX2)
+        elseif (GGML_AVX)
+            list(APPEND ARCH_FLAGS /arch:AVX)
+        endif()
+    else()
+        if (GGML_NATIVE)
+            list(APPEND ARCH_FLAGS -march=native)
+        endif()
+        if (GGML_F16C)
+            list(APPEND ARCH_FLAGS -mf16c)
+        endif()
+        if (GGML_FMA)
+            list(APPEND ARCH_FLAGS -mfma)
+        endif()
+        if (GGML_AVX)
+            list(APPEND ARCH_FLAGS -mavx)
+        endif()
+        if (GGML_AVX2)
+            list(APPEND ARCH_FLAGS -mavx2)
+        endif()
+        if (GGML_AVX512)
+            list(APPEND ARCH_FLAGS -mavx512f)
+            list(APPEND ARCH_FLAGS -mavx512dq)
+            list(APPEND ARCH_FLAGS -mavx512bw)
+        endif()
+        if (GGML_AVX512_VBMI)
+            list(APPEND ARCH_FLAGS -mavx512vbmi)
+        endif()
+        if (GGML_AVX512_VNNI)
+            list(APPEND ARCH_FLAGS -mavx512vnni)
+        endif()
+        if (GGML_AVX512_BF16)
+            list(APPEND ARCH_FLAGS -mavx512bf16)
+        endif()
+        if (GGML_AMX_TILE)
+            list(APPEND ARCH_FLAGS -mamx-tile)
+        endif()
+        if (GGML_AMX_INT8)
+            list(APPEND ARCH_FLAGS -mamx-int8)
+        endif()
+        if (GGML_AMX_BF16)
+            list(APPEND ARCH_FLAGS -mamx-bf16)
+        endif()
+    endif()
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
+    message(STATUS "PowerPC detected")
+    execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M)
+    string(FIND "${POWER10_M}" "POWER10" substring_index)
+    if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "")
+        set(substring_index -1)
+    endif()
+
+    if (${substring_index} GREATER_EQUAL 0)
+        list(APPEND ARCH_FLAGS -mcpu=power10)
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
+        list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
+    else()
+        list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
+        #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
+    endif()
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
+    message(STATUS "loongarch64 detected")
+
+    list(APPEND ARCH_FLAGS -march=loongarch64)
+    if (GGML_LASX)
+        list(APPEND ARCH_FLAGS -mlasx)
+    endif()
+    if (GGML_LSX)
+        list(APPEND ARCH_FLAGS -mlsx)
+    endif()
+else()
+    message(STATUS "Unknown architecture")
+endif()
+
+target_compile_options(ggml-tinyblas PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
+target_compile_options(ggml-tinyblas PRIVATE "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
+
+if (EMSCRIPTEN)
+    set_target_properties(ggml-tinyblas PROPERTIES COMPILE_FLAGS "-msimd128")
+endif()
diff --git a/ggml/src/ggml-tinyblas/ggml-tinyblas.cpp b/ggml/src/ggml-tinyblas/ggml-tinyblas.cpp
new file mode 100644
index 0000000000000..7317b5dd352dc
--- /dev/null
+++ b/ggml/src/ggml-tinyblas/ggml-tinyblas.cpp
@@ -0,0 +1,472 @@
+#include "ggml-cpu.h"
+#include "ggml-impl.h"
+#include "ggml-tinyblas.h"
+#include "ggml-backend-impl.h"
+
+#include "sgemm.h"
+
+#include <cstring>
+#include <cstdint>
+#include <memory>
+
+#ifdef GGML_USE_OPENMP
+#include <omp.h>
+#endif
+
+namespace ggml::backend::tinyblas {
+
+    static const char* NAME = "tinyBLAS";
+
+    struct context {
+        int n_threads = GGML_DEFAULT_N_THREADS;
+        std::unique_ptr<char[]> work_data;
+        size_t work_size = 0;
+        //int pp_threads = GGML_DEFAULT_N_THREADS;
+        //int tg_threads = GGML_DEFAULT_N_THREADS;
+    };
+
+    template <bool RUN>
+    static bool mul_mat(int64_t m, int64_t n, int64_t k,
+            const void *A, int64_t lda, const void *B, int64_t ldb, void *C, int64_t ldc,
+            int ith, int nth,
+            const enum ggml_type Atype, const enum ggml_type Btype, const enum ggml_type Ctype)
+    {
+        GGML_ASSERT(Ctype == GGML_TYPE_F32);
+        switch (Atype) {
+        case GGML_TYPE_F32:
+            if (Btype != GGML_TYPE_F32) return false;
+            return gemm<RUN>(m, n, k, (const float*)A, lda, (const float*)B, ldb, (float*)C, ldc, ith, nth);
+            break;
+        case GGML_TYPE_F16:
+            switch (Btype) {
+            case GGML_TYPE_F32:
+                return gemm<RUN>(m, n, k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, (float*)C, ldc, ith, nth);
+            case GGML_TYPE_F16:
+                return gemm<RUN>(m, n, k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, (float*)C, ldc, ith, nth);
+            default:
+                return false;
+            }
+            break;
+        case GGML_TYPE_BF16:
+            switch (Btype) {
+            case GGML_TYPE_F32:
+                return gemm<RUN>(m, n, k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, (float*)C, ldc, ith, nth);
+            case GGML_TYPE_BF16:
+                return gemm<RUN>(m, n, k, (const ggml_bf16_t*)A, lda, (const ggml_bf16_t*)B, ldb, (float*)C, ldc, ith, nth);
+            default:
+                return false;
+            }
+            break;
+        case GGML_TYPE_Q8_0:
+            if (Btype != GGML_TYPE_Q8_0) return false;
+            return gemm<RUN>(m, n, k, (const block_q8_0*)A, lda, (const block_q8_0*)B, ldb, (float*)C, ldc, ith, nth);
+            break;
+        case GGML_TYPE_Q4_0:
+            if (Btype != GGML_TYPE_Q8_0) return false;
+            return gemm<RUN>(m, n, k, (const block_q4_0*)A, lda, (const block_q8_0*)B, ldb, (float*)C, ldc, ith, nth);
+            break;
+        case GGML_TYPE_Q5_0:
+            if (Btype != GGML_TYPE_Q8_0) return false;
+            return gemm<RUN>(m, n, k, (const block_q5_0*)A, lda, (const block_q8_0*)B, ldb, (float*)C, ldc, ith, nth);
+            break;
+        case GGML_TYPE_IQ4_NL:
+            if (Btype != GGML_TYPE_Q8_0) return false;
+            return gemm<RUN>(m, n, k, (const block_iq4_nl*)A, lda, (const block_q8_0*)B, ldb, (float*)C, ldc, ith, nth);
+            break;
+        default:
+            return false;
+        }
+        return false;
+    }
+
+    static bool supports_mul_mat(ggml_backend_dev_t, const struct ggml_tensor * dst) {
+        const struct ggml_tensor * src0 = dst->src[0];
+        const struct ggml_tensor * src1 = dst->src[1];
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        if (dst->type != GGML_TYPE_F32) return false;
+
+        if (ne0 != ne01) return false;
+        if (ne1 != ne11) return false;
+        if (ne2 != ne12) return false;
+        if (ne3 != ne13) return false;
+
+        // we don't support permuted src0 or src1
+        if (nb00 != ggml_type_size(src0->type)) return false;
+        if (nb10 != ggml_type_size(src1->type)) return false;
+
+        // dst cannot be transposed or permuted
+        if (nb0 != sizeof(float)) return false;
+        if (nb0 > nb1) return false;
+        if (nb1 > nb2) return false;
+        if (nb2 > nb3) return false;
+
+        if (ggml_is_contiguous(src1)) {
+            if (mul_mat<false>(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                        src0->data, nb01/ggml_type_size(src0->type),
+                        src1->data, nb11/ggml_type_size(src1->type),
+                        dst->data, nb1/ggml_type_size(dst->type),
+                        0, 1, src0->type, src1->type, GGML_TYPE_F32)) {
+                return true;
+            }
+        }
+
+        // after converting B: FP32 => src0->vec_dot_type
+        enum ggml_type const vec_dot_type = ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
+        if ((src1->type != vec_dot_type) && (src1->type == GGML_TYPE_F32)) {
+            if (mul_mat<false>(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                        src0->data, nb01/ggml_type_size(src0->type),
+                        src1->data, nb11/ggml_type_size(src1->type),
+                        dst->data, nb1/ggml_type_size(dst->type),
+                        0, 1, src0->type, vec_dot_type, GGML_TYPE_F32)) {
+                // note: it would have been nice to resize work_data here..
+                return true;
+            }
+        }
+        return false;
+    }
+
+    static void mul_mat(ggml::backend::tinyblas::context * ctx, struct ggml_tensor * dst, const int ith, const int nth) {
+        const struct ggml_tensor * src0 = dst->src[0];
+        const struct ggml_tensor * src1 = dst->src[1];
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        const enum ggml_type type0 = src0->type;
+        const enum ggml_type type1 = src1->type;
+
+        // the "direct" types (B used as-is, no conversion)
+        // broadcast factors
+        const int64_t r2 = ne12 / ne02;
+        const int64_t r3 = ne13 / ne03;
+
+        if (ggml_is_contiguous(src1)) {
+            for (int64_t i13 = 0; i13 < ne13; i13++) {
+                for (int64_t i12 = 0; i12 < ne12; i12++) {
+                    const void* data0 = (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03;
+                    const void* data1 = (const char *)src1->data + i12*nb12 + i13*nb13;
+                    void* data = (char *)dst->data + i12*nb2 + i13*nb3;
+                    if (!mul_mat<true>(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                data0, nb01/ggml_type_size(src0->type),
+                                data1, nb11/ggml_type_size(src1->type),
+                                data, nb1/ggml_type_size(dst->type),
+                                ith, nth, type0, type1, GGML_TYPE_F32)) {
+                        goto UseGgmlGemm1;
+                    }
+                }
+            }
+            return;
+        }
+        UseGgmlGemm1:;
+
+        // after converting B?
+        GGML_ASSERT(src1->type == GGML_TYPE_F32);  // needed to use 'from_float'
+        enum ggml_type const vec_dot_type = ggml_get_type_traits_cpu(type0)->vec_dot_type;
+        ggml_from_float_t const from_float = ggml_get_type_traits_cpu(vec_dot_type)->from_float;
+        // auto const type_size = ggml_get_type_traits(vec_dot_type)->type_size;
+
+        if (src1->type != vec_dot_type) {
+            // OK, at least try to convert the type of B
+
+            const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
+            // const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+            const size_t nbw2 = nbw1*ne11;
+            const size_t nbw3 = nbw2*ne12;
+
+            // TODO: see if this can be done in supports_mul_mat
+            if ((ith == 0) && (ctx->work_size < ne13*nbw3)) {
+                ctx->work_data.reset(new char[ne13*nbw3]);
+                ctx->work_size = ne13*nbw3;
+            }
+#ifdef GGML_USE_OPENMP
+#pragma omp barrier
+#else
+            static_assert(false, "Not implemented: use GGML_USE_OPENMP");
+#endif
+            char * wdata = ctx->work_data.get();
+
+            for (int64_t i13 = 0; i13 < ne13; ++i13) {
+                for (int64_t i12 = 0; i12 < ne12; ++i12) {
+                    for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
+                        from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
+                                   (void *)               (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
+                                   ne10);
+                    }
+                }
+            }
+
+            // synchronize all threads!
+#ifdef GGML_USE_OPENMP
+#pragma omp barrier
+#else
+            static_assert(false, "Not implemented: use GGML_USE_OPENMP");
+#endif
+            // mat-mul again, this time on the converted B...
+            for (int64_t i13 = 0; i13 < ne13; i13++)
+                for (int64_t i12 = 0; i12 < ne12; i12++) {
+                    const void* data0 = (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03;
+                    const void* data1 = (const char *)wdata + i12*nbw2 + i13*nbw3;
+
+                    void* data = (char *)dst->data + i12*nb2 + i13*nb3;
+                    if (!mul_mat<true>(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                data0, nb01/ggml_type_size(src0->type),
+                                data1, nbw1/ggml_type_size(vec_dot_type),
+                                data, nb1/ggml_type_size(dst->type),
+                                ith, nth, type0, vec_dot_type, GGML_TYPE_F32)) {
+                        goto UseGgmlGemm2;
+                    }
+                }
+            return;
+        }
+        UseGgmlGemm2:;
+    }
+
+    static const char * get_name(ggml_backend_t /*backend*/) {
+        return NAME;
+    }
+
+    static void free(ggml_backend_t backend) {
+        context * ctx = (context *)backend->context;
+        delete ctx;
+        delete backend;
+    }
+
+    // TODO: decide how to manage threads / a thread pool ... for all the backends that need one ...
+    //  - see ggml_graph_compute / ggml_threadpool
+    //    https://github.com/ggerganov/llama.cpp/pull/1999
+    //
+    static enum ggml_status graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+        context * ctx = (context *)backend->context;
+
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            struct ggml_tensor * node = cgraph->nodes[i];
+
+            switch (node->op) {
+            case GGML_OP_MUL_MAT:
+#ifdef GGML_USE_OPENMP
+#pragma omp parallel num_threads(ctx->n_threads)
+                {
+                    int ith = omp_get_thread_num();
+                    int nth = ctx->n_threads;
+                    mul_mat(ctx, node, ith, nth);
+                }
+#else
+                static_assert(false, "Not implemented: use GGML_USE_OPENMP");
+                mul_mat(ctx, node, 0, 1);
+#endif
+                break;
+            case GGML_OP_NONE:
+            case GGML_OP_RESHAPE:
+            case GGML_OP_VIEW:
+            case GGML_OP_PERMUTE:
+            case GGML_OP_TRANSPOSE:
+                break;
+
+            default:
+                GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node));
+            }
+        }
+
+        return GGML_STATUS_SUCCESS;
+    }
+
+    static struct ggml_backend_i interface = {
+        /* .get_name           = */ get_name,
+        /* .free               = */ free,
+        /* .set_tensor_async   = */ NULL,
+        /* .get_tensor_async   = */ NULL,
+        /* .cpy_tensor_async   = */ NULL,
+        /* .synchronize        = */ NULL,
+        /* .graph_plan_create  = */ NULL,
+        /* .graph_plan_free    = */ NULL,
+        /* .graph_plan_update  = */ NULL,
+        /* .graph_plan_compute = */ NULL,
+        /* .graph_compute      = */ graph_compute,
+        /* .event_record       = */ NULL,
+        /* .event_wait         = */ NULL,
+    };
+
+    static ggml_guid_t guid(void) {
+        static ggml_guid guid = { 0x23, 0xf5, 0x9f, 0xa2, 0xb1, 0x48, 0x39, 0x25, 0x83, 0xcd, 0x79, 0x16, 0xb7, 0x23, 0x94, 0xde };
+        return &guid;
+    }
+
+    static ggml_backend_t init(void) {
+        context * ctx = new context;
+
+        ggml_backend_t backend = new ggml_backend {
+            /* .guid      = */ guid(),
+            /* .interface = */ interface,
+            /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_tinyblas_reg(), 0),
+            /* .context   = */ ctx,
+        };
+
+        return backend;
+    }
+
+    static bool is_tinyblas(ggml_backend_t backend) {
+        return backend != NULL && ggml_guid_matches(backend->guid, guid());
+    }
+
+    // number of threads to use for compute
+    static void set_pp_threads(ggml_backend_t backend, int n_threads) {
+        GGML_ASSERT(is_tinyblas(backend));
+        context * ctx = (context *)backend->context;
+        //ctx->pp_threads = n_threads;
+    }
+
+    static void set_tg_threads(ggml_backend_t backend, int n_threads) {
+        GGML_ASSERT(is_tinyblas(backend));
+        context * ctx = (context *)backend->context;
+        //ctx->tg_threads = n_threads;
+    }
+
+    static void set_n_threads(ggml_backend_t backend, int n_threads) {
+        GGML_ASSERT(is_tinyblas(backend));
+        context * ctx = (context *)backend->context;
+        ctx->n_threads = n_threads;
+        //ctx->tg_threads
= n_threads; + //ctx->pp_threads = n_threads; + } + +} + +// device interface +namespace ggml::backend::tinyblas::device { + static const char * get_name(ggml_backend_dev_t) { + return "BLAS"; + } + + static const char * get_description(ggml_backend_dev_t) { + return "tinyBLAS"; + } + + static void get_memory(ggml_backend_dev_t, size_t * free, size_t * total) { + // TODO + *free = 0; + *total = 0; + } + + static enum ggml_backend_dev_type get_type(ggml_backend_dev_t) { + return GGML_BACKEND_DEVICE_TYPE_ACCEL; + } + + static void get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + props->name = get_name(dev); + props->description = get_description(dev); + props->type = get_type(dev); + get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* .async = */ false, + /* .host_buffer = */ false, + /* .buffer_from_host_ptr = */ true, + /* .events = */ false, + }; + } + + static ggml_backend_t init_backend(ggml_backend_dev_t, const char *) { + return ggml::backend::tinyblas::init(); + } + + static ggml_backend_buffer_type_t get_buffer_type(ggml_backend_dev_t) { + return ggml_backend_cpu_buffer_type(); + } + + static ggml_backend_buffer_t buffer_from_host_ptr(ggml_backend_dev_t, void * ptr, size_t size, size_t) { + return ggml_backend_cpu_buffer_from_ptr(ptr, size); + } + + static bool supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) { + //const struct ggml_tensor * src0 = op->src[0]; + //const struct ggml_tensor * src1 = op->src[1]; + + switch (op->op) { + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + return true; + case GGML_OP_MUL_MAT: + return supports_mul_mat(device, op); + default: + return false; + } + } + + static bool supports_buft(ggml_backend_dev_t, ggml_backend_buffer_type_t buft) { + return ggml_backend_buft_is_host(buft); + } + + static const struct ggml_backend_device_i interface = { + /* .get_name = */ get_name, + /* .get_description = */ get_description, + /* .get_memory = */ get_memory, + /* .get_type = */ get_type, + /* .get_props = */ get_props, + /* .init_backend = */ init_backend, + /* .get_buffer_type = */ get_buffer_type, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ buffer_from_host_ptr, + /* .supports_op = */ supports_op, + /* .supports_buft = */ supports_buft, + /* .offload_op = */ NULL, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, + }; + +} + +// backend reg interface +namespace ggml::backend::tinyblas::reg { + static const char * get_name(ggml_backend_reg_t) { + return ggml::backend::tinyblas::NAME; + } + + static size_t get_device_count(ggml_backend_reg_t) { + return 1; + } + + static ggml_backend_dev_t get_device(ggml_backend_reg_t reg, size_t index) { + GGML_ASSERT(index == 0); + + static ggml_backend_device device = { + /* .iface = */ ggml::backend::tinyblas::device::interface, + /* .reg = */ reg, + /* .context = */ nullptr, + }; + + return &device; + } + + static void * get_proc_address(ggml_backend_reg_t, const char * name) { + if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) { + return (void *)ggml::backend::tinyblas::set_n_threads; + } + if (std::strcmp(name, "ggml_backend_set_pp_threads") == 0) { + return (void *)ggml::backend::tinyblas::set_pp_threads; + } + if (std::strcmp(name, "ggml_backend_set_tg_threads") == 0) { + return (void *)ggml::backend::tinyblas::set_tg_threads; + } + return NULL; + } + + static const struct ggml_backend_reg_i 
interface = { + /* .get_name = */ get_name, + /* .get_device_count = */ get_device_count, + /* .get_device = */ get_device, + /* .get_proc_address = */ get_proc_address, + }; + +} + +ggml_backend_reg_t ggml_backend_tinyblas_reg(void) { + static struct ggml_backend_reg backend_reg = { + /* .iface = */ ggml::backend::tinyblas::reg::interface, + /* .context = */ NULL, + }; + return &backend_reg; +} diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-tinyblas/sgemm.cpp similarity index 80% rename from ggml/src/ggml-cpu/llamafile/sgemm.cpp rename to ggml/src/ggml-tinyblas/sgemm.cpp index b2ce2e6649479..5c7a3c357ee9f 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-tinyblas/sgemm.cpp @@ -50,8 +50,6 @@ #include "sgemm.h" #include "ggml-impl.h" -// hack until moved into the CPU backend -#include "../ggml-cpu-impl.h" #include "ggml-quants.h" #ifdef _MSC_VER @@ -135,6 +133,16 @@ inline __m512 madd(__m512 a, __m512 b, __m512 c) { return _mm512_fmadd_ps(a, b, c); } #endif +#if defined(__AVX512BF16__) +template <> +inline __m512 madd(__m512bh a, __m512bh b, __m512 c) { + return _mm512_dpbf16_ps(c, a, b); +} +template <> +inline __m256 madd(__m256bh a, __m256bh b, __m256 c) { + return _mm256_dpbf16_ps(c, a, b); +} +#endif #endif #if defined(__ARM_FEATURE_FMA) @@ -226,6 +234,13 @@ template <> inline __m256 load(const float *p) { } #endif // __AVX__ +#if defined(__AVX2__) || defined(__AVX512F__) +template <> inline __m256 load(const ggml_bf16_t *p) { + return _mm256_castsi256_ps( + _mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)p)), 16)); +} +#endif // __AVX2__ + #if defined(__F16C__) template <> inline __m256 load(const ggml_fp16_t *p) { return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)p)); @@ -239,8 +254,27 @@ template <> inline __m512 load(const float *p) { template <> inline __m512 load(const ggml_fp16_t *p) { return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)p)); } +template <> inline __m512 load(const ggml_bf16_t *p) { + return _mm512_castsi512_ps( + _mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)p)), 16)); +} #endif // __AVX512F__ +#if defined(__AVX512BF16__) +template <> inline __m512bh load(const ggml_bf16_t *p) { + return (__m512bh)_mm512_loadu_ps((const float *)p); +} +template <> inline __m256bh load(const ggml_bf16_t *p) { + return (__m256bh)_mm256_loadu_ps((const float *)p); +} +template <> inline __m512bh load(const float *p) { + return _mm512_cvtne2ps_pbh(_mm512_loadu_ps(p + 16), _mm512_loadu_ps(p)); +} +template <> inline __m256bh load(const float *p) { + return _mm512_cvtneps_pbh(_mm512_loadu_ps(p)); +} +#endif + //////////////////////////////////////////////////////////////////////////////////////////////////// // CONSTANTS @@ -1627,259 +1661,395 @@ class tinyBLAS_PPC { #endif } // namespace -/** - * Performs optimized matrix multiplication on CPU. - * - * This subroutine may compute C = Aᵀ * B with column major ordering. - * Despite its name, this isn't a generalized implementation. Work is - * only performed when a handwritten kernel is written and available. - * Otherwise the caller should fall back to a general matmul routine. 
- * - * For example, for single-threaded single-precision GEMM you can say - * - * llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc, - * 0, 1, - * GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32); - * - * @param m is rows in `A` and `C` - * @param n is cols in `B` and `C` - * @param k is cols in `A` and rows in `B` - * @param A is first input matrix (always transposed) - * @param lda is row stride of `A` - * @param B is second input matrix (never transposed) - * @param ldb is row stride of `B` - * @param C is input/output array of output matrices - * @param ldc is row stride of `C` - * @param ith is thread id (must be less than `nth`) - * @param nth is number of threads (must be greater than zero) - * @param Atype is GGML data type of `A` - * @param Btype is GGML data type of `B` - * @param Ctype is GGML data type of `C` - * @return true if this function was able to service the matmul request - */ -bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C, - int64_t ldc, int ith, int nth, int Atype, int Btype, int Ctype) { - - assert(m >= 0); - assert(n >= 0); - assert(k >= 0); - assert(lda >= k); - assert(ldb >= k); - assert(ldc >= m); - assert(nth > 0); - assert(ith < nth); - - // only enable sgemm for prompt processing - if (n < 2) - return false; - - if (Ctype != GGML_TYPE_F32) - return false; - - switch (Atype) { - - case GGML_TYPE_F32: { - if (Btype != GGML_TYPE_F32) - return false; +namespace ggml::backend::tinyblas { + + /** + * Performs optimized matrix multiplication on CPU. + * + * This subroutine may compute C = Aᵀ * B with column major ordering. + * Despite its name, this isn't a generalized implementation. Work is + * only performed when a handwritten kernel is written and available. + * Otherwise the caller should fall back to a general matmul routine. 
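+     *
+     * The template parameter RUN selects between only checking whether a
+     * suitable kernel exists for the given argument types (RUN == false) and
+     * actually performing the multiplication (RUN == true).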
+ * + * For example, for single-threaded single-precision GEMM you can say + * + * llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc, 0, 1); + * + * @param m is rows in `A` and `C` + * @param n is cols in `B` and `C` + * @param k is cols in `A` and rows in `B` + * @param A is first input matrix (always transposed) + * @param lda is row stride of `A` + * @param B is second input matrix (never transposed) + * @param ldb is row stride of `B` + * @param C is input/output array of output matrices + * @param ldc is row stride of `C` + * @param ith is thread id (must be less than `nth`) + * @param nth is number of threads (must be greater than zero) + * @return true if this function was able to service the matmul request + */ + + template + bool gemm(int64_t m, int64_t n, int64_t k, + const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth) { + assert(m >= 0); + assert(n >= 0); + assert(k >= 0); + assert(lda >= k); + assert(ldb >= k); + assert(ldc >= m); + assert(nth > 0); + assert(ith < nth); #if defined(__AVX512F__) - if (k % 16) - return false; - tinyBLAS<16, __m512, __m512, float, float, float> tb{ - k, (const float *)A, lda, - (const float *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; -#elif defined(__AVX__) || defined(__AVX2__) - if (k % 8) - return false; - tinyBLAS<8, __m256, __m256, float, float, float> tb{ - k, (const float *)A, lda, - (const float *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; -#elif defined(__ARM_NEON) - if (n < 4) - return false; - if (k % 4) - return false; - tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{ - k, (const float *)A, lda, - (const float *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; -#elif defined(__MMA__) - if (k % 8) - return false; - tinyBLAS_PPC tb{ - k, (const float *)A, lda, - (const float *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; -#else + if ((k % 16) == 0) { + if constexpr (RUN) { + tinyBLAS<16, __m512, __m512, float, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } +#endif +#if defined(__AVX__) || defined(__AVX2__) + if ((k % 8)==0) { + if constexpr (RUN) { + tinyBLAS<8, __m256, __m256, float, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } +#endif +#if defined(__ARM_NEON) + if ((k % 4) == 0) { + if constexpr (RUN) { + tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } +#endif + // TODO: voir a mettre ca dans un autre fichier... 
+#if defined(__MMA__) + if ((k % 8) == 0) { + if constexpr (RUN) { + tinyBLAS_PPC tb{ k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } +#endif return false; + } + template bool gemm(int64_t m, int64_t n, int64_t k, + const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + template bool gemm(int64_t m, int64_t n, int64_t k, + const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + + template + bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_fp16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth) { + assert(m >= 0); + assert(n >= 0); + assert(k >= 0); + assert(lda >= k); + assert(ldb >= k); + assert(ldc >= m); + assert(nth > 0); + assert(ith < nth); +#if defined(__AVX512F__) + if ((k % 16) == 0) { + if constexpr (RUN) { + tinyBLAS<16, __m512, __m512, ggml_fp16_t, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } #endif +#if (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__) + if ((k % 8) == 0) { + if constexpr (RUN) { + tinyBLAS<8, __m256, __m256, ggml_fp16_t, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } +#endif +#if defined(__ARM_NEON) && !defined(_MSC_VER) + if ((k % 4) == 0) { + if constexpr (RUN) { + tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } +#endif + return false; } - - case GGML_TYPE_F16: { + template bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_fp16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + template bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_fp16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + + template + bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_fp16_t *A, int64_t lda, const ggml_fp16_t *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth) { + assert(m >= 0); + assert(n >= 0); + assert(k >= 0); + assert(lda >= k); + assert(ldb >= k); + assert(ldc >= m); + assert(nth > 0); + assert(ith < nth); #if defined(__AVX512F__) - if (k % 16) - return false; - if (Btype != GGML_TYPE_F32) - return false; - tinyBLAS<16, __m512, __m512, ggml_fp16_t, float, float> tb{ - k, (const ggml_fp16_t *)A, lda, - (const float *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; + if ((k % 16) == 0) { + if constexpr (RUN) { + tinyBLAS<16, __m512, __m512, ggml_fp16_t, ggml_fp16_t, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } +#endif +#if (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__) + if ((k % 8) == 0) { + if constexpr (RUN) { + tinyBLAS<8, __m256, __m256, ggml_fp16_t, ggml_fp16_t, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } +#endif +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER) + if ((k % 8) == 0) { + if constexpr (RUN) { + tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } +#endif + return false; + } + template bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_fp16_t *A, int64_t lda, const ggml_fp16_t *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + template bool gemm(int64_t m, int64_t n, int64_t k, + const 
ggml_fp16_t *A, int64_t lda, const ggml_fp16_t *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + + template + bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_bf16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth) { + assert(m >= 0); + assert(n >= 0); + assert(k >= 0); + assert(lda >= k); + assert(ldb >= k); + assert(ldc >= m); + assert(nth > 0); + assert(ith < nth); +#if defined(__AVX512BF16__) + // wait for convert B => bf16? + //if ((k % 32) == 0) { + // if constexpr (RUN) { + // tinyBLAS<32, __m512, __m512bh, ggml_bf16_t, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + // tb.matmul(m, n); + // } + // return true; + //} +#elif defined(__AVX512F__) + if ((k % 16) == 0) { + if constexpr (RUN) { + tinyBLAS<16, __m512, __m512, ggml_bf16_t, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } #elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__) - if (k % 8) - return false; - if (Btype != GGML_TYPE_F32) - return false; - tinyBLAS<8, __m256, __m256, ggml_fp16_t, float, float> tb{ - k, (const ggml_fp16_t *)A, lda, - (const float *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; -#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER) - if (n < 8) - return false; - if (k % 8) - return false; - if (Btype != GGML_TYPE_F16) - return false; - tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{ - k, (const ggml_fp16_t *)A, lda, - (const ggml_fp16_t *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; -#elif defined(__ARM_NEON) && !defined(_MSC_VER) - if (k % 4) - return false; - if (Btype != GGML_TYPE_F32) - return false; - tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{ - k, (const ggml_fp16_t *)A, lda, - (const float *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; -#else + // TODO +#endif return false; + } + template bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_bf16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + template bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_bf16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + + template + bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_bf16_t *A, int64_t lda, const ggml_bf16_t *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth) { + assert(m >= 0); + assert(n >= 0); + assert(k >= 0); + assert(lda >= k); + assert(ldb >= k); + assert(ldc >= m); + assert(nth > 0); + assert(ith < nth); +#if defined(__AVX512BF16__) + if ((k % 32) == 0) { + if constexpr (RUN) { + tinyBLAS<32, __m512, __m512bh, ggml_bf16_t, ggml_bf16_t, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } + // 2eme chance... 
+ if ((k % 16) == 0) { + if constexpr (RUN) { + tinyBLAS<16, __m256, __m256bh, ggml_bf16_t, ggml_bf16_t, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } #endif + return false; } + template bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_bf16_t *A, int64_t lda, const ggml_bf16_t *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + template bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_bf16_t *A, int64_t lda, const ggml_bf16_t *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + + template + bool gemm(int64_t m, int64_t n, int64_t k, + const block_q8_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth) { + assert(m >= 0); + assert(n >= 0); + assert(k >= 0); + assert(lda >= k); + assert(ldb >= k); + assert(ldc >= m); + assert(nth > 0); + assert(ith < nth); - case GGML_TYPE_Q8_0: { - if (Btype != GGML_TYPE_Q8_0) - return false; #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__) - tinyBLAS_Q0_AVX tb{ - k, (const block_q8_0 *)A, lda, - (const block_q8_0 *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); + if constexpr (RUN) { + tinyBLAS_Q0_AVX tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } return true; #elif defined(__ARM_FEATURE_DOTPROD) - tinyBLAS_Q0_ARM tb{ - k, (const block_q8_0 *)A, lda, - (const block_q8_0 *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); + if constexpr (RUN) { + tinyBLAS_Q0_ARM tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } return true; #else return false; #endif } + template bool gemm(int64_t m, int64_t n, int64_t k, + const block_q8_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + template bool gemm(int64_t m, int64_t n, int64_t k, + const block_q8_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + + template + bool gemm(int64_t m, int64_t n, int64_t k, + const block_q4_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth) { + assert(m >= 0); + assert(n >= 0); + assert(k >= 0); + assert(lda >= k); + assert(ldb >= k); + assert(ldc >= m); + assert(nth > 0); + assert(ith < nth); - case GGML_TYPE_Q4_0: { - if (Btype != GGML_TYPE_Q8_0) - return false; #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__) - tinyBLAS_Q0_AVX tb{ - k, (const block_q4_0 *)A, lda, - (const block_q8_0 *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); + if constexpr (RUN) { + tinyBLAS_Q0_AVX tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } return true; #elif defined(__ARM_FEATURE_DOTPROD) - tinyBLAS_Q0_ARM tb{ - k, (const block_q4_0 *)A, lda, - (const block_q8_0 *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); + if constexpr (RUN) { + tinyBLAS_Q0_ARM tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } return true; #else return false; #endif } + template bool gemm(int64_t m, int64_t n, int64_t k, + const block_q4_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + template bool gemm(int64_t m, int64_t n, int64_t k, + const block_q4_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + + template + bool gemm(int64_t m, int64_t n, int64_t k, + const block_q5_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth) { + assert(m >= 0); + assert(n >= 0); + assert(k >= 0); + assert(lda >= 
k);
+        assert(ldb >= k);
+        assert(ldc >= m);
+        assert(nth > 0);
+        assert(ith < nth);
-    case GGML_TYPE_Q5_0: {
-        if (Btype != GGML_TYPE_Q8_0)
-            return false;
 #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
-        tinyBLAS_Q0_AVX tb{
-            k, (const block_q5_0 *)A, lda,
-            (const block_q8_0 *)B, ldb,
-            (float *)C, ldc,
-            ith, nth};
-        tb.matmul(m, n);
+        if constexpr (RUN) {
+            tinyBLAS_Q0_AVX tb{k, A, lda, B, ldb, C, ldc, ith, nth};
+            tb.matmul(m, n);
+        }
         return true;
 #else
         return false;
 #endif
     }
-
-    case GGML_TYPE_IQ4_NL: {
-        if (Btype != GGML_TYPE_Q8_0)
-            return false;
+    template bool gemm<true>(int64_t m, int64_t n, int64_t k,
+              const block_q5_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
+              int ith, int nth);
+    template bool gemm<false>(int64_t m, int64_t n, int64_t k,
+              const block_q5_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
+              int ith, int nth);
+
+    template <bool RUN>
+    bool gemm(int64_t m, int64_t n, int64_t k,
+              const block_iq4_nl *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
+              int ith, int nth) {
+        assert(m >= 0);
+        assert(n >= 0);
+        assert(k >= 0);
+        assert(lda >= k);
+        assert(ldb >= k);
+        assert(ldc >= m);
+        assert(nth > 0);
+        assert(ith < nth);
 #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
-        tinyBLAS_Q0_AVX tb{
-            k, (const block_iq4_nl *)A, lda,
-            (const block_q8_0 *)B, ldb,
-            (float *)C, ldc,
-            ith, nth};
-        tb.matmul(m, n);
+        if constexpr (RUN) {
+            tinyBLAS_Q0_AVX tb{k, A, lda, B, ldb, C, ldc, ith, nth};
+            tb.matmul(m, n);
+        }
         return true;
 #else
         return false;
 #endif
     }
-
-    default:
-        return false;
-    }
-
-    (void)m;
-    (void)n;
-    (void)k;
-    (void)A;
-    (void)lda;
-    (void)B;
-    (void)ldb;
-    (void)C;
-    (void)ldc;
-    (void)ith;
-    (void)nth;
-    (void)Atype;
-    (void)Btype;
-    (void)Ctype;
+    template bool gemm<true>(int64_t m, int64_t n, int64_t k,
+              const block_iq4_nl *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
+              int ith, int nth);
+    template bool gemm<false>(int64_t m, int64_t n, int64_t k,
+              const block_iq4_nl *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
+              int ith, int nth);
 }
diff --git a/ggml/src/ggml-tinyblas/sgemm.h b/ggml/src/ggml-tinyblas/sgemm.h
new file mode 100644
index 0000000000000..88c014e3ebb1e
--- /dev/null
+++ b/ggml/src/ggml-tinyblas/sgemm.h
@@ -0,0 +1,51 @@
+#pragma once
+//#include
+#include "ggml.h"
+#define GGML_COMMON_DECL_C
+//#define GGML_COMMON_DECL_CPP
+#include "ggml-common.h"
+
+// only called from C++ (the tinyBLAS backend)
+
+namespace ggml::backend::tinyblas {
+
+    // we are in C++
+    // => we can have as many functions as there are types.
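+    // gemm<false>(...) only reports whether a kernel is available for the given
+    // types and k; gemm<true>(...) actually performs the multiplication.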
+    // computes C = Aᵀ * B
+    template <bool RUN>
+    bool gemm(int64_t m, int64_t n, int64_t k,
+              const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
+              int ith=0, int nth=1);
+    template <bool RUN>
+    bool gemm(int64_t m, int64_t n, int64_t k,
+              const ggml_fp16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
+              int ith=0, int nth=1);
+    template <bool RUN>
+    bool gemm(int64_t m, int64_t n, int64_t k,
+              const ggml_fp16_t *A, int64_t lda, const ggml_fp16_t *B, int64_t ldb, float *C, int64_t ldc,
+              int ith=0, int nth=1);
+    template <bool RUN>
+    bool gemm(int64_t m, int64_t n, int64_t k,
+              const ggml_bf16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
+              int ith=0, int nth=1);
+    template <bool RUN>
+    bool gemm(int64_t m, int64_t n, int64_t k,
+              const ggml_bf16_t *A, int64_t lda, const ggml_bf16_t *B, int64_t ldb, float *C, int64_t ldc,
+              int ith=0, int nth=1);
+    template <bool RUN>
+    bool gemm(int64_t m, int64_t n, int64_t k,
+              const block_q8_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
+              int ith=0, int nth=1);
+    template <bool RUN>
+    bool gemm(int64_t m, int64_t n, int64_t k,
+              const block_q4_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
+              int ith=0, int nth=1);
+    template <bool RUN>
+    bool gemm(int64_t m, int64_t n, int64_t k,
+              const block_q5_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
+              int ith=0, int nth=1);
+    template <bool RUN>
+    bool gemm(int64_t m, int64_t n, int64_t k,
+              const block_iq4_nl *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
+              int ith=0, int nth=1);
+}
diff --git a/src/llama.cpp b/src/llama.cpp
index 1703104fb3680..d6c7cd08ef2f1 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -22034,7 +22034,6 @@ const char * llama_print_system_info(void) {
     s += "SSSE3 = "       + std::to_string(ggml_cpu_has_ssse3())       + " | ";
     s += "VSX = "         + std::to_string(ggml_cpu_has_vsx())         + " | ";
     s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
-    s += "LLAMAFILE = "   + std::to_string(ggml_cpu_has_llamafile())   + " | ";

     return s.c_str();
 }

From dda8847636a60078bc0ffb3314ed9b2198d4faa1 Mon Sep 17 00:00:00 2001
From: Djip007 <3705339+Djip007@users.noreply.github.com>
Date: Sat, 16 Nov 2024 22:30:02 +0100
Subject: [PATCH 2/3] some cleanup with tinyblas backend

---
 CMakeLists.txt                           |  4 +-
 docs/android.md                          |  2 +-
 docs/build.md                            |  4 +-
 ggml/CMakeLists.txt                      |  8 +-
 ggml/src/CMakeLists.txt                  |  1 +
 ggml/src/ggml-backend-reg.cpp            |  8 +-
 ggml/src/ggml-common.h                   | 58 ++++++++++----
 ggml/src/ggml-cpu/CMakeLists.txt         | 10 ---
 ggml/src/ggml-cpu/ggml-cpu.c             | 59 --------------
 ggml/src/ggml-tinyblas/CMakeLists.txt    |  6 ++
 ggml/src/ggml-tinyblas/ggml-tinyblas.cpp | 96 +++++++++++++----------
 ggml/src/ggml-tinyblas/sgemm.cpp         | 99 ++++++++++++++++++++++++
 ggml/src/ggml-tinyblas/sgemm.h           | 55 +++++++++++--
 13 files changed, 262 insertions(+), 148 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 93c60ef431d66..ff62b3cba602c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -84,8 +84,8 @@ set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
 set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})

 # change the default for these ggml options
-if (NOT DEFINED GGML_LLAMAFILE)
-    set(GGML_LLAMAFILE_DEFAULT ON)
+if (NOT DEFINED GGML_TINYBLAS)
+    set(GGML_TINYBLAS ON)
 endif()

 if (NOT DEFINED GGML_AMX)
diff --git a/docs/android.md b/docs/android.md
index 320b62240382f..ac3ecb43ddd22 100644
--- a/docs/android.md
+++ b/docs/android.md
@@ -45,7 +45,7 @@ $ cmake \
   -DCMAKE_C_FLAGS="-march=armv8.7a" \
   -DCMAKE_CXX_FLAGS="-march=armv8.7a" \
-DGGML_OPENMP=OFF \ - -DGGML_LLAMAFILE=OFF \ + -DGGML_TINYBLAS=OFF \ -B build-android ``` diff --git a/docs/build.md b/docs/build.md index 52de2b4e2c224..538490a17b493 100644 --- a/docs/build.md +++ b/docs/build.md @@ -42,7 +42,7 @@ In order to build llama.cpp you have four different options. **Notes**: - - For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`. + - For `Q4_0_4_4` quantization type build, add the `-DGGML_TINYBLAS=OFF` cmake option. For example, use `cmake -B build -DGGML_TINYBLAS=OFF`. - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel. - For faster repeated compilation, install [ccache](https://ccache.dev/). - For debug builds, there are two cases: @@ -393,4 +393,4 @@ To read documentation for how to build on Android, [click here](./android.md) Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats. -To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`). +To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_TINYBLAS=OFF` (`cmake`). 
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 4fb78e59fa72c..cc9d277a9c81e 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -57,8 +57,8 @@ else() endif() # defaults -if (NOT GGML_LLAMAFILE_DEFAULT) - set(GGML_LLAMAFILE_DEFAULT OFF) +if (NOT GGML_TINYBLAS_DEFAULT) + set(GGML_TINYBLAS_DEFAULT OFF) endif() if (NOT GGML_CUDA_GRAPHS_DEFAULT) @@ -124,8 +124,7 @@ option(GGML_ACCELERATE "ggml: enable Accelerate framework" option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT}) set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING "ggml: BLAS library vendor") -option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT}) - +option(GGML_TINYBLAS "ggml: use TINYBLAS" OFF) option(GGML_CUDA "ggml: use CUDA" OFF) option(GGML_MUSA "ggml: use MUSA" OFF) option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF) @@ -231,6 +230,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-metal.h include/ggml-rpc.h include/ggml-sycl.h + include/ggml-tinyblas.h include/ggml-vulkan.h) set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 71934c6791e32..33d494dd7a524 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -256,6 +256,7 @@ ggml_add_backend(Kompute) ggml_add_backend(METAL) ggml_add_backend(RPC) ggml_add_backend(SYCL) +ggml_add_backend(TINYBLAS) ggml_add_backend(Vulkan) ggml_add_backend(MUSA) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 78bcb6c5ca5d9..233debb12bda4 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -91,10 +91,12 @@ struct ggml_backend_registry { return; } -#ifndef NDEBUG - GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n", + GGML_LOG_INFO("%s: registered backend %s (%zu devices)\n", __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg)); -#endif +//#ifndef NDEBUG +// GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n", +// __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg)); +//#endif backends.push_back(reg); for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) { register_device(ggml_backend_reg_dev_get(reg, i)); diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index 050161393456e..51c36ff571fde 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -6,7 +6,20 @@ typedef uint16_t ggml_half; typedef uint32_t ggml_half2; -#define GGML_COMMON_AGGR +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S + +#define GGML_COMMON_DECL +#elif defined(GGML_COMMON_DECL_CPP) +#include + +typedef uint16_t ggml_half; +typedef uint32_t ggml_half2; + +// std-c++ allow anonymous unions but some compiler warn on it +#define GGML_COMMON_AGGR_U data +// std-c++ do not allow it. 
+#define GGML_COMMON_AGGR_S data #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_METAL) @@ -15,7 +28,8 @@ typedef uint32_t ggml_half2; typedef half ggml_half; typedef half2 ggml_half2; -#define GGML_COMMON_AGGR +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_CUDA) @@ -29,7 +43,8 @@ typedef half2 ggml_half2; typedef half ggml_half; typedef half2 ggml_half2; -#define GGML_COMMON_AGGR data +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S data #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_HIP) @@ -39,7 +54,8 @@ typedef half2 ggml_half2; typedef half ggml_half; typedef half2 ggml_half2; -#define GGML_COMMON_AGGR data +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S data #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_SYCL) @@ -49,7 +65,8 @@ typedef half2 ggml_half2; typedef sycl::half ggml_half; typedef sycl::half2 ggml_half2; -#define GGML_COMMON_AGGR data +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S data #define GGML_COMMON_DECL #endif @@ -154,9 +171,9 @@ typedef struct { struct { ggml_half d; // delta ggml_half m; // min - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; uint8_t qs[QK4_1 / 2]; // nibbles / quants } block_q4_1; static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding"); @@ -175,9 +192,9 @@ typedef struct { struct { ggml_half d; // delta ggml_half m; // min - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; uint8_t qh[4]; // 5-th bit of quants uint8_t qs[QK5_1 / 2]; // nibbles / quants } block_q5_1; @@ -196,9 +213,9 @@ typedef struct { struct { ggml_half d; // delta ggml_half s; // d * sum(qs[i]) - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 ds; - }; + } GGML_COMMON_AGGR_U; int8_t qs[QK8_1]; // quants } block_q8_1; static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding"); @@ -261,9 +278,9 @@ typedef struct { struct { ggml_half d; // super-block scale for quantized scales ggml_half dmin; // super-block scale for quantized mins - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; } block_q2_K; static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); @@ -288,9 +305,9 @@ typedef struct { struct { ggml_half d; // super-block scale for quantized scales ggml_half dmin; // super-block scale for quantized mins - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits uint8_t qs[QK_K/2]; // 4--bit quants } block_q4_K; @@ -305,9 +322,9 @@ typedef struct { struct { ggml_half d; // super-block scale for quantized scales ggml_half dmin; // super-block scale for quantized mins - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits uint8_t qh[QK_K/8]; // quants, high bit uint8_t qs[QK_K/2]; // quants, low 4 bits @@ -431,6 +448,13 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_ #define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = { #define GGML_TABLE_END() }; +#define GGML_COMMON_IMPL +#elif defined(GGML_COMMON_IMPL_CPP) +#include + +#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = { +#define GGML_TABLE_END() }; + #define 
GGML_COMMON_IMPL #elif defined(GGML_COMMON_IMPL_METAL) #include diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index cef41a0743cef..03c7607b533d2 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -44,16 +44,6 @@ if (GGML_OPENMP) endif() endif() -if (GGML_LLAMAFILE) - message(STATUS "Using llamafile") - - add_compile_definitions(GGML_USE_LLAMAFILE) - - target_sources(ggml-cpu PRIVATE - llamafile/sgemm.cpp - llamafile/sgemm.h) -endif() - if (GGML_CPU_HBM) find_library(memkind memkind REQUIRED) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 7f5c465df068c..37a11449c155c 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -39,14 +39,6 @@ #include #endif -#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8) -#undef GGML_USE_LLAMAFILE -#endif - -#ifdef GGML_USE_LLAMAFILE -#include "llamafile/sgemm.h" -#endif - #if defined(_MSC_VER) // disable "possible loss of data" to avoid hundreds of casts // we should just be careful :) @@ -7466,33 +7458,6 @@ static void ggml_compute_forward_mul_mat( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if GGML_USE_LLAMAFILE - // broadcast factors - const int64_t r2 = ne12 / ne02; - const int64_t r3 = ne13 / ne03; - - const bool src1_cont = ggml_is_contiguous(src1); - - if (src1_cont) { - for (int64_t i13 = 0; i13 < ne13; i13++) - for (int64_t i12 = 0; i12 < ne12; i12++) - if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type), - (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, - nb01/ggml_type_size(type), - (const char *)src1->data + i12*nb12 + i13*nb13, - nb11/ggml_type_size(src1->type), - (char *)dst->data + i12*nb2 + i13*nb3, - nb1/ggml_type_size(dst->type), - ith, nth, - type, - src1->type, - dst->type)) - goto UseGgmlGemm1; - return; - } -UseGgmlGemm1:; -#endif - if (src1->type != vec_dot_type) { char * wdata = params->wdata; @@ -7530,30 +7495,6 @@ UseGgmlGemm1:; ggml_barrier(params->threadpool); -#if GGML_USE_LLAMAFILE - if (src1->type != vec_dot_type) { - const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; - const size_t row_size = ggml_row_size(vec_dot_type, ne10); - - for (int64_t i13 = 0; i13 < ne13; i13++) - for (int64_t i12 = 0; i12 < ne12; i12++) - if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type), - (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, - nb01/ggml_type_size(type), - (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size, - row_size/ggml_type_size(vec_dot_type), - (char *)dst->data + i12*nb2 + i13*nb3, - nb1/ggml_type_size(dst->type), - ith, nth, - type, - vec_dot_type, - dst->type)) - goto UseGgmlGemm2; - return; - } -UseGgmlGemm2:; -#endif - // This is the size of the first dimension of the result, so we can iterate that way. 
(see the ASSERT above, these are the same numbers) const int64_t nr0 = ne0; diff --git a/ggml/src/ggml-tinyblas/CMakeLists.txt b/ggml/src/ggml-tinyblas/CMakeLists.txt index c8c4fd04e0f29..2b197f511743b 100644 --- a/ggml/src/ggml-tinyblas/CMakeLists.txt +++ b/ggml/src/ggml-tinyblas/CMakeLists.txt @@ -1,3 +1,5 @@ +message(STATUS "Using TINYBLAS") + add_library(ggml-tinyblas ggml-tinyblas.cpp ) @@ -225,6 +227,10 @@ endif() target_compile_options(ggml-tinyblas PRIVATE "$<$:${ARCH_FLAGS}>") target_compile_options(ggml-tinyblas PRIVATE "$<$:${ARCH_FLAGS}>") +#set_source_files_properties( ${GGML_SOURCES_FP8} PROPERTIES CXX_STANDARD 17) +#set_source_files_properties( ${GGML_SOURCES_FP8} PROPERTIES COMPILE_FLAGS "-std=c++17") +target_compile_features (ggml-tinyblas PRIVATE cxx_std_17) + if (EMSCRIPTEN) set_target_properties(ggml-tinyblas PROPERTIES COMPILE_FLAGS "-msimd128") endif() diff --git a/ggml/src/ggml-tinyblas/ggml-tinyblas.cpp b/ggml/src/ggml-tinyblas/ggml-tinyblas.cpp index 7317b5dd352dc..5d0704289534e 100644 --- a/ggml/src/ggml-tinyblas/ggml-tinyblas.cpp +++ b/ggml/src/ggml-tinyblas/ggml-tinyblas.cpp @@ -1,3 +1,48 @@ +// Copyright 2024 Mozilla Foundation +// +// Permission is hereby granted, free of charge, to any person obtaining +// a copy of this software and associated documentation files (the +// "Software"), to deal in the Software without restriction, including +// without limitation the rights to use, copy, modify, merge, publish, +// distribute, sublicense, and/or sell copies of the Software, and to +// permit persons to whom the Software is furnished to do so, subject to +// the following conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +// +// _ _ ___ _ _ ___ +// | |_(_)_ _ _ _| _ ) | /_\ / __| +// | _| | ' \ || | _ \ |__ / _ \\__ \. +// \__|_|_||_\_, |___/____/_/ \_\___/ +// |__/ +// +// BASIC LINEAR ALGEBRA SUBPROGRAMS +// +// +// This file implements multithreaded CPU matrix multiplication for the +// common contiguous use case C = Aᵀ * B. These kernels are designed to +// have excellent performance[1] for matrices that fit in the CPU cache +// without imposing any overhead such as cache filling or malloc calls. +// +// This implementation does not guarantee any upper bound with rounding +// errors, which grow along with k. Our goal's to maximally exploit the +// hardware for performance, and then use whatever resources remain for +// improving numerical accuracy. +// +// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online]. +// Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024]. 
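[Editor's note, not part of the patch] For readers new to this code path, below is a minimal standalone C++ sketch of the C = Aᵀ * B convention the header comment above describes, under the storage layout these kernels assume: A holds m rows of length k with row stride lda, B holds n rows of length k with row stride ldb, and C[j*ldc + i] receives the dot product of row i of A and row j of B. The name sgemm_reference and the simple row striping across ith/nth are illustrative only; the real kernels in sgemm.cpp tile and vectorize this loop nest.

#include <cstdint>

// Illustrative reference only (assumed layout, not the optimized kernel):
//   C[j*ldc + i] = sum over l of A[i*lda + l] * B[j*ldb + l]
// Thread ith of nth fills a strided subset of the rows of C.
static void sgemm_reference(int64_t m, int64_t n, int64_t k,
                            const float *A, int64_t lda,
                            const float *B, int64_t ldb,
                            float       *C, int64_t ldc,
                            int ith, int nth) {
    for (int64_t j = 0; j < n; ++j) {                // columns of C
        for (int64_t i = ith; i < m; i += nth) {     // rows of C, striped across threads
            float sum = 0.0f;
            for (int64_t l = 0; l < k; ++l) {
                sum += A[i*lda + l] * B[j*ldb + l];  // dot(row i of A, row j of B)
            }
            C[j*ldc + i] = sum;
        }
    }
}

Note that the actual gemm templates further down return false when no specialized kernel is available for the current build and types, so the caller (mul_mat in ggml-tinyblas.cpp) can fall back to its generic path.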
+ #include "ggml-cpu.h" #include "ggml-impl.h" #include "ggml-tinyblas.h" @@ -7,8 +52,9 @@ #include #include -#include +// TODO: see how to use threads/pool for all backend: ggml_graph_compute / ggml_threadpool +// https://github.com/ggerganov/llama.cpp/pull/1999 #ifdef GGML_USE_OPENMP #include #endif @@ -21,8 +67,6 @@ namespace ggml::backend::tinyblas { int n_threads = GGML_DEFAULT_N_THREADS; std::unique_ptr work_data; size_t work_size = 0; - //int pp_threads = GGML_DEFAULT_N_THREADS; - //int tg_threads = GGML_DEFAULT_N_THREADS; }; template @@ -112,7 +156,7 @@ namespace ggml::backend::tinyblas { } } - // apres conversion de B: FP32 => src0->vec_dot_type + // after convert B: FP32 => src0->vec_dot_type enum ggml_type const vec_dot_type = ggml_get_type_traits_cpu(src0->type)->vec_dot_type; if ((src1->type != vec_dot_type) && (src1->type == GGML_TYPE_F32)) { if (mul_mat(ne01, ne11, ne00/ggml_blck_size(src0->type), @@ -120,7 +164,7 @@ namespace ggml::backend::tinyblas { src1->data, nb11/ggml_type_size(src1->type), dst->data, nb1/ggml_type_size(dst->type), 0, 1, src0->type, vec_dot_type, GGML_TYPE_F32)) { - // @ voir ca aurait etait bien de redimensioner work_data ici.. + // TODO: how to resize work_data here return true; } } @@ -136,7 +180,6 @@ namespace ggml::backend::tinyblas { const enum ggml_type type0 = src0->type; const enum ggml_type type1 = src1->type; - // les type "directs" // broadcast factors const int64_t r2 = ne12 / ne02; const int64_t r3 = ne13 / ne03; @@ -160,21 +203,18 @@ namespace ggml::backend::tinyblas { } UseGgmlGemm1:; - // apres conversion de B ? + // with B converted from FP32 -> vec_dot_type GGML_ASSERT(src1->type == GGML_TYPE_F32); // for use 'from_float' enum ggml_type const vec_dot_type = ggml_get_type_traits_cpu(type0)->vec_dot_type; ggml_from_float_t const from_float = ggml_get_type_traits_cpu(vec_dot_type)->from_float; - // auto const type_size = ggml_get_type_traits(vec_dot_type)->type_size; if (src1->type != vec_dot_type) { - // OK on va au moins essayer de changer le type de B - const size_t nbw1 = ggml_row_size(vec_dot_type, ne10); // const size_t row_size = ggml_row_size(vec_dot_type, ne10); const size_t nbw2 = nbw1*ne11; const size_t nbw3 = nbw2*ne12; - // TOD0: vor si on peu caller ca dans supports_mul_mat + // TODO: move to: supports_mul_mat if ((ith == 0) && (ctx->work_size < ne13*nbw3)) { ctx->work_data.reset(new char[ne13*nbw3]); ctx->work_size = ne13*nbw3; @@ -182,7 +222,7 @@ namespace ggml::backend::tinyblas { #ifdef GGML_USE_OPENMP #pragma omp barrier #else - static_assert(false, "Note implemented: use GGML_USE_OPENMP"); + static_assert(false, "Not implemented: use GGML_USE_OPENMP"); #endif char * wdata = ctx->work_data.get(); @@ -200,7 +240,7 @@ namespace ggml::backend::tinyblas { #ifdef GGML_USE_OPENMP #pragma omp barrier #else - static_assert(false, "Note implemented: use GGML_USE_OPENMP"); + static_assert(false, "Not implemented: use GGML_USE_OPENMP"); #endif // mat-mul bis... for (int64_t i13 = 0; i13 < ne13; i13++) @@ -232,10 +272,6 @@ namespace ggml::backend::tinyblas { delete backend; } - // TODO: voir comment gerer les threads / pool ... pour tous les backends qui en ont besoin... 
- // - voir ggml_graph_compute / ggml_threadpool - // https://github.com/ggerganov/llama.cpp/pull/1999 - // static enum ggml_status graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { context * ctx = (context *)backend->context; @@ -252,7 +288,7 @@ namespace ggml::backend::tinyblas { mul_mat(ctx, node, ith, nth); } #else - static_assert(false, "Note implemented: use GGML_USE_OPENMP"); + static_assert(false, "Not implemented: use GGML_USE_OPENMP"); mul_mat(ctx, node, 0, 1); #endif break; @@ -309,25 +345,10 @@ namespace ggml::backend::tinyblas { return backend != NULL && ggml_guid_matches(backend->guid, guid()); } - // number of threads to use for compute - static void set_pp_threads(ggml_backend_t backend, int n_threads) { - GGML_ASSERT(is_tinyblas(backend)); - context * ctx = (context *)backend->context; - //ctx->pp_threads = n_threads; - } - - static void set_tg_threads(ggml_backend_t backend, int n_threads) { - GGML_ASSERT(is_tinyblas(backend)); - context * ctx = (context *)backend->context; - //ctx->tg_threads = n_threads; - } - static void set_n_threads(ggml_backend_t backend, int n_threads) { GGML_ASSERT(is_tinyblas(backend)); context * ctx = (context *)backend->context; ctx->n_threads = n_threads; - //ctx->tg_threads = n_threads; - //ctx->pp_threads = n_threads; } } @@ -378,9 +399,6 @@ namespace ggml::backend::tinyblas::device { } static bool supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) { - //const struct ggml_tensor * src0 = op->src[0]; - //const struct ggml_tensor * src1 = op->src[1]; - switch (op->op) { case GGML_OP_NONE: case GGML_OP_RESHAPE: @@ -445,12 +463,6 @@ namespace ggml::backend::tinyblas::reg { if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) { return (void *)ggml::backend::tinyblas::set_n_threads; } - if (std::strcmp(name, "ggml_backend_set_pp_threads") == 0) { - return (void *)ggml::backend::tinyblas::set_pp_threads; - } - if (std::strcmp(name, "ggml_backend_set_tg_threads") == 0) { - return (void *)ggml::backend::tinyblas::set_tg_threads; - } return NULL; } diff --git a/ggml/src/ggml-tinyblas/sgemm.cpp b/ggml/src/ggml-tinyblas/sgemm.cpp index 5c7a3c357ee9f..b82ae3f84be49 100644 --- a/ggml/src/ggml-tinyblas/sgemm.cpp +++ b/ggml/src/ggml-tinyblas/sgemm.cpp @@ -1739,6 +1739,17 @@ namespace ggml::backend::tinyblas { } #endif return false; + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, @@ -1787,6 +1798,17 @@ namespace ggml::backend::tinyblas { } #endif return false; + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const ggml_fp16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, @@ -1835,6 +1857,17 @@ namespace ggml::backend::tinyblas { } #endif return false; + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const ggml_fp16_t *A, int64_t lda, const ggml_fp16_t *B, int64_t ldb, float *C, int64_t 
ldc, @@ -1876,6 +1909,17 @@ namespace ggml::backend::tinyblas { // TODO #endif return false; + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const ggml_bf16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, @@ -1914,6 +1958,17 @@ namespace ggml::backend::tinyblas { } #endif return false; + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const ggml_bf16_t *A, int64_t lda, const ggml_bf16_t *B, int64_t ldb, float *C, int64_t ldc, @@ -1950,6 +2005,17 @@ namespace ggml::backend::tinyblas { #else return false; #endif + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const block_q8_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, @@ -1986,6 +2052,17 @@ namespace ggml::backend::tinyblas { #else return false; #endif + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const block_q4_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, @@ -2016,6 +2093,17 @@ namespace ggml::backend::tinyblas { #else return false; #endif + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const block_q5_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, @@ -2045,6 +2133,17 @@ namespace ggml::backend::tinyblas { #else return false; #endif + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const block_iq4_nl *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, diff --git a/ggml/src/ggml-tinyblas/sgemm.h b/ggml/src/ggml-tinyblas/sgemm.h index 88c014e3ebb1e..18bf5230e333d 100644 --- a/ggml/src/ggml-tinyblas/sgemm.h +++ b/ggml/src/ggml-tinyblas/sgemm.h @@ -1,17 +1,56 @@ +// Copyright 2024 Mozilla Foundation +// +// Permission is hereby granted, free of charge, to any person obtaining +// a copy of this software and associated documentation files (the +// "Software"), to deal in the Software without restriction, including +// without limitation the rights to use, copy, modify, merge, publish, +// distribute, sublicense, and/or sell copies of the Software, and to +// permit persons to whom the Software is furnished to do so, subject to +// the following conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +// +// _ _ ___ _ _ ___ +// | |_(_)_ _ _ _| _ ) | /_\ / __| +// | _| | ' \ || | _ \ |__ / _ \\__ \. +// \__|_|_||_\_, |___/____/_/ \_\___/ +// |__/ +// +// BASIC LINEAR ALGEBRA SUBPROGRAMS +// +// +// This file implements multithreaded CPU matrix multiplication for the +// common contiguous use case C = Aᵀ * B. These kernels are designed to +// have excellent performance[1] for matrices that fit in the CPU cache +// without imposing any overhead such as cache filling or malloc calls. +// +// This implementation does not guarantee any upper bound with rounding +// errors, which grow along with k. Our goal's to maximally exploit the +// hardware for performance, and then use whatever resources remain for +// improving numerical accuracy. +// +// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online]. +// Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024]. + #pragma once -//#include #include "ggml.h" -#define GGML_COMMON_DECL_C -//#define GGML_COMMON_DECL_CPP +#define GGML_COMMON_DECL_CPP #include "ggml-common.h" -// appelé que depuis du c++ (le tinyBLAS backend) - namespace ggml::backend::tinyblas { - // on est en C++ - // => on peu avoir autant de fonction que de type. - // calcule C = Aᵀ * B + // compute: C = Aᵀ * B template bool gemm(int64_t m, int64_t n, int64_t k, const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, From a3822fb59b9649a7dc1b916cd9c05950b5adfba7 Mon Sep 17 00:00:00 2001 From: Djip007 <3705339+Djip007@users.noreply.github.com> Date: Sun, 17 Nov 2024 00:49:52 +0100 Subject: [PATCH 3/3] update Makefile --- Makefile | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/Makefile b/Makefile index fa94c3bf39072..5a35d05617999 100644 --- a/Makefile +++ b/Makefile @@ -569,7 +569,7 @@ endif # GGML_NVPL ifndef GGML_NO_LLAMAFILE MK_CPPFLAGS += -DGGML_USE_TINYBLAS - OBJ_GGML_EXT += ggml/src/ggml-tinyblas/ggml-tinyblas.o ggml/src/ggml-tinyblas/sgemm.o + OBJ_GGML_EXT += ggml/src/ggml-tinyblas/ggml-tinyblas-cpp17.o ggml/src/ggml-tinyblas/sgemm-cpp17.o endif ifndef GGML_NO_AMX @@ -1153,22 +1153,9 @@ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \ ggml/src/ggml-impl.h $(CXX) $(CXXFLAGS) -c $< -o $@ -# TODO: renomer en GGML_NO_TINYBLAS -# needed for c++17 build -ifndef GGML_NO_LLAMAFILE -ggml/src/ggml-tinyblas/ggml-tinyblas.o: \ - ggml/src/ggml-tinyblas/ggml-tinyblas.cpp \ - ggml/include/ggml-tinyblas.h \ - ggml/src/ggml-tinyblas/sgemm.h \ - ggml/include/ggml.h - $(CXX) $(CXXFLAGS) -std=c++17 -c $< -o $@ - -ggml/src/ggml-tinyblas/sgemm.o: \ - ggml/src/ggml-tinyblas/sgemm.cpp \ - ggml/src/ggml-tinyblas/sgemm.h \ - ggml/include/ggml.h - $(CXX) $(CXXFLAGS) -std=c++17 -c $< -o $@ -endif # GGML_NO_LLAMAFILE +# for c++17 build +$(DIR_GGML)/%-cpp17.o: $(DIR_GGML)/%.cpp + $(CXX) $(CXXFLAGS) -MMD -std=c++17 -c $< -o $@ # Rules for building object files $(DIR_GGML)/%.o: $(DIR_GGML)/%.c