From 7dd261f3e965348cb631c625fd9e29dd8c48c1f6 Mon Sep 17 00:00:00 2001 From: Djip007 <3705339+Djip007@users.noreply.github.com> Date: Sat, 16 Nov 2024 20:58:30 +0100 Subject: [PATCH 1/3] extract llamafile in new tinyblas backend --- Makefile | 21 +- ggml/include/ggml-cpu.h | 1 - ggml/include/ggml-tinyblas.h | 17 + ggml/src/ggml-backend-reg.cpp | 7 + ggml/src/ggml-cpu/ggml-cpu.c | 8 - ggml/src/ggml-cpu/ggml-cpu.cpp | 3 - ggml/src/ggml-cpu/llamafile/sgemm.h | 14 - ggml/src/ggml-tinyblas/CMakeLists.txt | 230 +++++++ ggml/src/ggml-tinyblas/ggml-tinyblas.cpp | 472 ++++++++++++++ .../llamafile => ggml-tinyblas}/sgemm.cpp | 604 +++++++++++------- ggml/src/ggml-tinyblas/sgemm.h | 51 ++ src/llama.cpp | 1 - 12 files changed, 1183 insertions(+), 246 deletions(-) create mode 100644 ggml/include/ggml-tinyblas.h delete mode 100644 ggml/src/ggml-cpu/llamafile/sgemm.h create mode 100644 ggml/src/ggml-tinyblas/CMakeLists.txt create mode 100644 ggml/src/ggml-tinyblas/ggml-tinyblas.cpp rename ggml/src/{ggml-cpu/llamafile => ggml-tinyblas}/sgemm.cpp (80%) create mode 100644 ggml/src/ggml-tinyblas/sgemm.h diff --git a/Makefile b/Makefile index 539370e0639f4..fa94c3bf39072 100644 --- a/Makefile +++ b/Makefile @@ -568,8 +568,8 @@ ifdef GGML_NVPL endif # GGML_NVPL ifndef GGML_NO_LLAMAFILE - MK_CPPFLAGS += -DGGML_USE_LLAMAFILE - OBJ_GGML_EXT += ggml/src/ggml-cpu/llamafile/sgemm.o + MK_CPPFLAGS += -DGGML_USE_TINYBLAS + OBJ_GGML_EXT += ggml/src/ggml-tinyblas/ggml-tinyblas.o ggml/src/ggml-tinyblas/sgemm.o endif ifndef GGML_NO_AMX @@ -1153,6 +1153,23 @@ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \ ggml/src/ggml-impl.h $(CXX) $(CXXFLAGS) -c $< -o $@ +# TODO: renomer en GGML_NO_TINYBLAS +# needed for c++17 build +ifndef GGML_NO_LLAMAFILE +ggml/src/ggml-tinyblas/ggml-tinyblas.o: \ + ggml/src/ggml-tinyblas/ggml-tinyblas.cpp \ + ggml/include/ggml-tinyblas.h \ + ggml/src/ggml-tinyblas/sgemm.h \ + ggml/include/ggml.h + $(CXX) $(CXXFLAGS) -std=c++17 -c $< -o $@ + +ggml/src/ggml-tinyblas/sgemm.o: \ + ggml/src/ggml-tinyblas/sgemm.cpp \ + ggml/src/ggml-tinyblas/sgemm.h \ + ggml/include/ggml.h + $(CXX) $(CXXFLAGS) -std=c++17 -c $< -o $@ +endif # GGML_NO_LLAMAFILE + # Rules for building object files $(DIR_GGML)/%.o: $(DIR_GGML)/%.c $(CC) $(CFLAGS) -MMD -c $< -o $@ diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index 7571ef9798364..49a18ba37cb98 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -124,7 +124,6 @@ extern "C" { GGML_BACKEND_API int ggml_cpu_has_riscv_v (void); GGML_BACKEND_API int ggml_cpu_has_vsx (void); GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void); - GGML_BACKEND_API int ggml_cpu_has_llamafile (void); // Internal types and functions exposed for tests and benchmarks diff --git a/ggml/include/ggml-tinyblas.h b/ggml/include/ggml-tinyblas.h new file mode 100644 index 0000000000000..4c0075327003e --- /dev/null +++ b/ggml/include/ggml-tinyblas.h @@ -0,0 +1,17 @@ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +// backend register +GGML_API ggml_backend_reg_t ggml_backend_tinyblas_reg(void); + + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 63e9d82017457..78bcb6c5ca5d9 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -27,6 +27,10 @@ #include "ggml-blas.h" #endif +#ifdef GGML_USE_TINYBLAS +#include "ggml-tinyblas.h" +#endif + #ifdef GGML_USE_RPC #include "ggml-rpc.h" #endif @@ -66,6 +70,9 @@ struct ggml_backend_registry { 
#ifdef GGML_USE_BLAS register_backend(ggml_backend_blas_reg()); #endif +#ifdef GGML_USE_TINYBLAS + register_backend(ggml_backend_tinyblas_reg()); +#endif #ifdef GGML_USE_RPC register_backend(ggml_backend_rpc_reg()); #endif diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 61f53cd0145a6..7f5c465df068c 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -13868,14 +13868,6 @@ int ggml_cpu_has_wasm_simd(void) { #endif } -int ggml_cpu_has_llamafile(void) { -#if defined(GGML_USE_LLAMAFILE) - return 1; -#else - return 0; -#endif -} - int ggml_cpu_has_sse3(void) { #if defined(__SSE3__) return 1; diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 573b7c5b9b375..a131f5e28a65a 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -616,9 +616,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_wasm_simd()) { features.push_back({ "WASM_SIMD", "1" }); } - if (ggml_cpu_has_llamafile()) { - features.push_back({ "LLAMAFILE", "1" }); - } features.push_back({ nullptr, nullptr }); diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.h b/ggml/src/ggml-cpu/llamafile/sgemm.h deleted file mode 100644 index caf6dd5567b3a..0000000000000 --- a/ggml/src/ggml-cpu/llamafile/sgemm.h +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once -#include -#include -#ifdef __cplusplus -extern "C" { -#endif - -bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t, - const void *, int64_t, void *, int64_t, int, int, - int, int, int); - -#ifdef __cplusplus -} -#endif diff --git a/ggml/src/ggml-tinyblas/CMakeLists.txt b/ggml/src/ggml-tinyblas/CMakeLists.txt new file mode 100644 index 0000000000000..c8c4fd04e0f29 --- /dev/null +++ b/ggml/src/ggml-tinyblas/CMakeLists.txt @@ -0,0 +1,230 @@ +add_library(ggml-tinyblas + ggml-tinyblas.cpp + ) + +target_link_libraries(ggml-tinyblas PRIVATE ggml-base) +target_include_directories(ggml-tinyblas PRIVATE . ..) 
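+# NOTE: the Accelerate/OpenMP options and the per-architecture flag detection below
+#       mirror ggml/src/ggml-cpu/CMakeLists.txt; keep the two in sync.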
+
+if (APPLE AND GGML_ACCELERATE)
+    find_library(ACCELERATE_FRAMEWORK Accelerate)
+    if (ACCELERATE_FRAMEWORK)
+        message(STATUS "Accelerate framework found")
+
+        add_compile_definitions(GGML_USE_ACCELERATE)
+        add_compile_definitions(ACCELERATE_NEW_LAPACK)
+        add_compile_definitions(ACCELERATE_LAPACK_ILP64)
+
+        target_link_libraries(ggml-tinyblas PRIVATE ${ACCELERATE_FRAMEWORK})
+    else()
+        message(WARNING "Accelerate framework not found")
+    endif()
+endif()
+
+if (GGML_OPENMP)
+    find_package(OpenMP)
+    if (OpenMP_FOUND)
+        message(STATUS "OpenMP found")
+
+        add_compile_definitions(GGML_USE_OPENMP)
+
+        target_link_libraries(ggml-tinyblas PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+
+    else()
+        message(WARNING "OpenMP not found")
+    endif()
+endif()
+
+target_sources(ggml-tinyblas PRIVATE
+    sgemm.cpp
+    sgemm.h)
+
+if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
+    CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
+    (NOT CMAKE_OSX_ARCHITECTURES AND
+     NOT CMAKE_GENERATOR_PLATFORM_LWR AND
+     CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
+
+    message(STATUS "ARM detected")
+
+    if (MSVC)
+        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
+        add_compile_definitions(__ARM_NEON)
+        add_compile_definitions(__ARM_FEATURE_FMA)
+
+        set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
+        string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
+
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
+        if (GGML_COMPILER_SUPPORT_DOTPROD)
+            add_compile_definitions(__ARM_FEATURE_DOTPROD)
+        endif ()
+
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
+
+        if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
+            add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
+        endif ()
+
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
+        if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
+            add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+        endif ()
+
+        set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
+    else()
+        check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
+        if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
+            list(APPEND ARCH_FLAGS -mfp16-format=ieee)
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
+            # Raspberry Pi 1, Zero
+            list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
+            if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
+                # Android armeabi-v7a
+                list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
+            else()
+                # Raspberry Pi 2
+                list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
+            endif()
+        endif()
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
+            # Android arm64-v8a
+            # Raspberry Pi 3, 4, Zero 2 (32-bit)
+            list(APPEND ARCH_FLAGS -mno-unaligned-access)
+        endif()
+        if (GGML_SVE)
+            list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
+        endif()
+    endif()
+elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
+        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
+        CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
+    message(STATUS "x86 detected")
+    if (MSVC)
+        # instruction set detection for MSVC only
+        if (GGML_NATIVE)
+            # TODO: improve, should not reference files from the parent folder
+            include(../ggml-cpu/cmake/FindSIMD.cmake)
+        endif ()
+        if (GGML_AVX512)
+            list(APPEND ARCH_FLAGS /arch:AVX512)
+            # MSVC has no compile-time flags enabling specific
+            # AVX512 extensions, neither it defines the
+            # macros corresponding to the extensions.
+            # Do it manually.
+            if (GGML_AVX512_VBMI)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
+                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                    list(APPEND ARCH_FLAGS -mavx512vbmi)
+                endif()
+            endif()
+            if (GGML_AVX512_VNNI)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
+                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                    list(APPEND ARCH_FLAGS -mavx512vnni)
+                endif()
+            endif()
+            if (GGML_AVX512_BF16)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
+                if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+                    list(APPEND ARCH_FLAGS -mavx512bf16)
+                endif()
+            endif()
+            if (GGML_AMX_TILE)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
+            endif()
+            if (GGML_AMX_INT8)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
+            endif()
+            if (GGML_AMX_BF16)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
+            endif()
+        elseif (GGML_AVX2)
+            list(APPEND ARCH_FLAGS /arch:AVX2)
+        elseif (GGML_AVX)
+            list(APPEND ARCH_FLAGS /arch:AVX)
+        endif()
+    else()
+        if (GGML_NATIVE)
+            list(APPEND ARCH_FLAGS -march=native)
+        endif()
+        if (GGML_F16C)
+            list(APPEND ARCH_FLAGS -mf16c)
+        endif()
+        if (GGML_FMA)
+            list(APPEND ARCH_FLAGS -mfma)
+        endif()
+        if (GGML_AVX)
+            list(APPEND ARCH_FLAGS -mavx)
+        endif()
+        if (GGML_AVX2)
+            list(APPEND ARCH_FLAGS -mavx2)
+        endif()
+        if (GGML_AVX512)
+            list(APPEND ARCH_FLAGS -mavx512f)
+            list(APPEND ARCH_FLAGS -mavx512dq)
+            list(APPEND ARCH_FLAGS -mavx512bw)
+        endif()
+        if (GGML_AVX512_VBMI)
+            list(APPEND ARCH_FLAGS -mavx512vbmi)
+        endif()
+        if (GGML_AVX512_VNNI)
+            list(APPEND ARCH_FLAGS -mavx512vnni)
+        endif()
+        if (GGML_AVX512_BF16)
+            list(APPEND ARCH_FLAGS -mavx512bf16)
+        endif()
+        if (GGML_AMX_TILE)
+            list(APPEND ARCH_FLAGS -mamx-tile)
+        endif()
+        if (GGML_AMX_INT8)
+            list(APPEND ARCH_FLAGS -mamx-int8)
+        endif()
+        if (GGML_AMX_BF16)
+            list(APPEND ARCH_FLAGS -mamx-bf16)
+        endif()
+    endif()
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
+    message(STATUS "PowerPC detected")
+    execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M)
+    string(FIND "${POWER10_M}" "POWER10" substring_index)
+    if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "")
+        set(substring_index -1)
+    endif()
+
+    if (${substring_index} GREATER_EQUAL 0)
+        list(APPEND ARCH_FLAGS -mcpu=power10)
+    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
+        list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
+    else()
+        list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
+        #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
+    endif()
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
+    message(STATUS "loongarch64 detected")
+
+    list(APPEND ARCH_FLAGS -march=loongarch64)
+    if (GGML_LASX)
+        list(APPEND ARCH_FLAGS -mlasx)
+    endif()
+    if (GGML_LSX)
+        list(APPEND ARCH_FLAGS -mlsx)
+    endif()
+else()
+    message(STATUS "Unknown architecture")
+endif()
+
+target_compile_options(ggml-tinyblas PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
+target_compile_options(ggml-tinyblas PRIVATE "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
+
+if (EMSCRIPTEN)
+    set_target_properties(ggml-tinyblas PROPERTIES COMPILE_FLAGS "-msimd128")
+endif()
diff --git a/ggml/src/ggml-tinyblas/ggml-tinyblas.cpp b/ggml/src/ggml-tinyblas/ggml-tinyblas.cpp
new file mode 100644
index 0000000000000..7317b5dd352dc
--- /dev/null
+++ b/ggml/src/ggml-tinyblas/ggml-tinyblas.cpp
@@ -0,0 +1,472 @@
+#include "ggml-cpu.h"
+#include "ggml-impl.h"
+#include "ggml-tinyblas.h"
+#include "ggml-backend-impl.h"
+
+#include "sgemm.h"
+
+#include <cstring>
+#include <cstdint>
+#include <memory>
+
+#ifdef GGML_USE_OPENMP
+#include <omp.h>
+#endif
+
+namespace ggml::backend::tinyblas {
+
+    static const char* NAME = "tinyBLAS";
+
+    struct context {
+        int n_threads = GGML_DEFAULT_N_THREADS;
+        std::unique_ptr<char[]> work_data;
+        size_t work_size = 0;
+        //int pp_threads = GGML_DEFAULT_N_THREADS;
+        //int tg_threads = GGML_DEFAULT_N_THREADS;
+    };
+
+    template <bool RUN>
+    static bool mul_mat(int64_t m, int64_t n, int64_t k,
+            const void *A, int64_t lda, const void *B, int64_t ldb, void *C, int64_t ldc,
+            int ith, int nth,
+            const enum ggml_type Atype, const enum ggml_type Btype, const enum ggml_type Ctype)
+    {
+        GGML_ASSERT(Ctype == GGML_TYPE_F32);
+        switch (Atype) {
+        case GGML_TYPE_F32:
+            if (Btype != GGML_TYPE_F32) return false;
+            return gemm<RUN>(m, n, k, (const float*)A, lda, (const float*)B, ldb, (float*)C, ldc, ith, nth);
+            break;
+        case GGML_TYPE_F16:
+            switch (Btype) {
+            case GGML_TYPE_F32:
+                return gemm<RUN>(m, n, k, (const ggml_fp16_t*)A, lda, (const float*)B, ldb, (float*)C, ldc, ith, nth);
+            case GGML_TYPE_F16:
+                return gemm<RUN>(m, n, k, (const ggml_fp16_t*)A, lda, (const ggml_fp16_t*)B, ldb, (float*)C, ldc, ith, nth);
+            default:
+                return false;
+            }
+            break;
+        case GGML_TYPE_BF16:
+            switch (Btype) {
+            case GGML_TYPE_F32:
+                return gemm<RUN>(m, n, k, (const ggml_bf16_t*)A, lda, (const float*)B, ldb, (float*)C, ldc, ith, nth);
+            case GGML_TYPE_BF16:
+                return gemm<RUN>(m, n, k, (const ggml_bf16_t*)A, lda, (const ggml_bf16_t*)B, ldb, (float*)C, ldc, ith, nth);
+            default:
+                return false;
+            }
+            break;
+        case GGML_TYPE_Q8_0:
+            if (Btype != GGML_TYPE_Q8_0) return false;
+            return gemm<RUN>(m, n, k, (const block_q8_0*)A, lda, (const block_q8_0*)B, ldb, (float*)C, ldc, ith, nth);
+            break;
+        case GGML_TYPE_Q4_0:
+            if (Btype != GGML_TYPE_Q8_0) return false;
+            return gemm<RUN>(m, n, k, (const block_q4_0*)A, lda, (const block_q8_0*)B, ldb, (float*)C, ldc, ith, nth);
+            break;
+        case GGML_TYPE_Q5_0:
+            if (Btype != GGML_TYPE_Q8_0) return false;
+            return gemm<RUN>(m, n, k, (const block_q5_0*)A, lda, (const block_q8_0*)B, ldb, (float*)C, ldc, ith, nth);
+            break;
+        case GGML_TYPE_IQ4_NL:
+            if (Btype != GGML_TYPE_Q8_0) return false;
+            return gemm<RUN>(m, n, k, (const block_iq4_nl*)A, lda, (const block_q8_0*)B, ldb, (float*)C, ldc, ith, nth);
+            break;
+        default:
+            return false;
+        }
+        return false;
+    }
+
+    static bool supports_mul_mat(ggml_backend_dev_t, const struct ggml_tensor * dst) {
+        const struct ggml_tensor * src0 = dst->src[0];
+        const struct ggml_tensor * src1 = dst->src[1];
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        if (dst->type != GGML_TYPE_F32) return false;
+
+        if (ne0 != ne01) return false;
+        if (ne1 != ne11) return false;
+        if (ne2 != ne12) return false;
+        if (ne3 != ne13) return false;
+
+        // we don't support permuted src0 or src1
+        if (nb00 != ggml_type_size(src0->type)) return false;
+        if (nb10 != ggml_type_size(src1->type)) return false;
+
+        // dst cannot be transposed or permuted
+        if (nb0 != sizeof(float)) return false;
+        if (nb0 > nb1) return false;
+        if (nb1 > nb2) return false;
+        if (nb2 > nb3) return false;
+
+        if (ggml_is_contiguous(src1)) {
+            if (mul_mat<false>(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                        src0->data, nb01/ggml_type_size(src0->type),
+                        src1->data, nb11/ggml_type_size(src1->type),
+                        dst->data, nb1/ggml_type_size(dst->type),
+                        0, 1, src0->type, src1->type, GGML_TYPE_F32)) {
+                return true;
+            }
+        }
+
+        // after converting B: FP32 => src0->vec_dot_type
+        enum ggml_type const vec_dot_type = ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
+        if ((src1->type != vec_dot_type) && (src1->type == GGML_TYPE_F32)) {
+            if (mul_mat<false>(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                        src0->data, nb01/ggml_type_size(src0->type),
+                        src1->data, nb11/ggml_type_size(src1->type),
+                        dst->data, nb1/ggml_type_size(dst->type),
+                        0, 1, src0->type, vec_dot_type, GGML_TYPE_F32)) {
+                // note: it would have been nice to resize work_data here..
+                return true;
+            }
+        }
+        return false;
+    }
+
+    static void mul_mat(ggml::backend::tinyblas::context * ctx, struct ggml_tensor * dst, const int ith, const int nth) {
+        const struct ggml_tensor * src0 = dst->src[0];
+        const struct ggml_tensor * src1 = dst->src[1];
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        const enum ggml_type type0 = src0->type;
+        const enum ggml_type type1 = src1->type;
+
+        // the "direct" types (B used as-is, no conversion)
+        // broadcast factors
+        const int64_t r2 = ne12 / ne02;
+        const int64_t r3 = ne13 / ne03;
+
+        if (ggml_is_contiguous(src1)) {
+            for (int64_t i13 = 0; i13 < ne13; i13++) {
+                for (int64_t i12 = 0; i12 < ne12; i12++) {
+                    const void* data0 = (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03;
+                    const void* data1 = (const char *)src1->data + i12*nb12 + i13*nb13;
+                    void* data = (char *)dst->data + i12*nb2 + i13*nb3;
+                    if (!mul_mat<true>(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                data0, nb01/ggml_type_size(src0->type),
+                                data1, nb11/ggml_type_size(src1->type),
+                                data, nb1/ggml_type_size(dst->type),
+                                ith, nth, type0, type1, GGML_TYPE_F32)) {
+                        goto UseGgmlGemm1;
+                    }
+                }
+            }
+            return;
+        }
+        UseGgmlGemm1:;
+
+        // after converting B?
+        GGML_ASSERT(src1->type == GGML_TYPE_F32);  // needed to use 'from_float'
+        enum ggml_type const vec_dot_type = ggml_get_type_traits_cpu(type0)->vec_dot_type;
+        ggml_from_float_t const from_float = ggml_get_type_traits_cpu(vec_dot_type)->from_float;
+        // auto const type_size = ggml_get_type_traits(vec_dot_type)->type_size;
+
+        if (src1->type != vec_dot_type) {
+            // OK, at least try to convert the type of B
+
+            const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
+            // const size_t row_size = ggml_row_size(vec_dot_type, ne10);
+            const size_t nbw2 = nbw1*ne11;
+            const size_t nbw3 = nbw2*ne12;
+
+            // TODO: see if this can be done in supports_mul_mat
+            if ((ith == 0) && (ctx->work_size < ne13*nbw3)) {
+                ctx->work_data.reset(new char[ne13*nbw3]);
+                ctx->work_size = ne13*nbw3;
+            }
+#ifdef GGML_USE_OPENMP
+#pragma omp barrier
+#else
+            static_assert(false, "Not implemented: use GGML_USE_OPENMP");
+#endif
+            char * wdata = ctx->work_data.get();
+
+            for (int64_t i13 = 0; i13 < ne13; ++i13) {
+                for (int64_t i12 = 0; i12 < ne12; ++i12) {
+                    for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
+                        from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
+                                   (void *)               (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
+                                   ne10);
+                    }
+                }
+            }
+
+            // synchronize all threads!
+#ifdef GGML_USE_OPENMP
+#pragma omp barrier
+#else
+            static_assert(false, "Not implemented: use GGML_USE_OPENMP");
+#endif
+            // mat-mul again, this time on the converted B...
+            for (int64_t i13 = 0; i13 < ne13; i13++)
+                for (int64_t i12 = 0; i12 < ne12; i12++) {
+                    const void* data0 = (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03;
+                    const void* data1 = (const char *)wdata + i12*nbw2 + i13*nbw3;
+
+                    void* data = (char *)dst->data + i12*nb2 + i13*nb3;
+                    if (!mul_mat<true>(ne01, ne11, ne00/ggml_blck_size(src0->type),
+                                data0, nb01/ggml_type_size(src0->type),
+                                data1, nbw1/ggml_type_size(vec_dot_type),
+                                data, nb1/ggml_type_size(dst->type),
+                                ith, nth, type0, vec_dot_type, GGML_TYPE_F32)) {
+                        goto UseGgmlGemm2;
+                    }
+                }
+            return;
+        }
+        UseGgmlGemm2:;
+    }
+
+    static const char * get_name(ggml_backend_t /*backend*/) {
+        return NAME;
+    }
+
+    static void free(ggml_backend_t backend) {
+        context * ctx = (context *)backend->context;
+        delete ctx;
+        delete backend;
+    }
+
+    // TODO: decide how to manage threads / a thread pool ... for all the backends that need one ...
+    //  - see ggml_graph_compute / ggml_threadpool
+    //    https://github.com/ggerganov/llama.cpp/pull/1999
+    //
+    static enum ggml_status graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+        context * ctx = (context *)backend->context;
+
+        for (int i = 0; i < cgraph->n_nodes; i++) {
+            struct ggml_tensor * node = cgraph->nodes[i];
+
+            switch (node->op) {
+            case GGML_OP_MUL_MAT:
+#ifdef GGML_USE_OPENMP
+#pragma omp parallel num_threads(ctx->n_threads)
+                {
+                    int ith = omp_get_thread_num();
+                    int nth = ctx->n_threads;
+                    mul_mat(ctx, node, ith, nth);
+                }
+#else
+                static_assert(false, "Not implemented: use GGML_USE_OPENMP");
+                mul_mat(ctx, node, 0, 1);
+#endif
+                break;
+            case GGML_OP_NONE:
+            case GGML_OP_RESHAPE:
+            case GGML_OP_VIEW:
+            case GGML_OP_PERMUTE:
+            case GGML_OP_TRANSPOSE:
+                break;
+
+            default:
+                GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node));
+            }
+        }
+
+        return GGML_STATUS_SUCCESS;
+    }
+
+    static struct ggml_backend_i interface = {
+        /* .get_name           = */ get_name,
+        /* .free               = */ free,
+        /* .set_tensor_async   = */ NULL,
+        /* .get_tensor_async   = */ NULL,
+        /* .cpy_tensor_async   = */ NULL,
+        /* .synchronize        = */ NULL,
+        /* .graph_plan_create  = */ NULL,
+        /* .graph_plan_free    = */ NULL,
+        /* .graph_plan_update  = */ NULL,
+        /* .graph_plan_compute = */ NULL,
+        /* .graph_compute      = */ graph_compute,
+        /* .event_record       = */ NULL,
+        /* .event_wait         = */ NULL,
+    };
+
+    static ggml_guid_t guid(void) {
+        static ggml_guid guid = { 0x23, 0xf5, 0x9f, 0xa2, 0xb1, 0x48, 0x39, 0x25, 0x83, 0xcd, 0x79, 0x16, 0xb7, 0x23, 0x94, 0xde };
+        return &guid;
+    }
+
+    static ggml_backend_t init(void) {
+        context * ctx = new context;
+
+        ggml_backend_t backend = new ggml_backend {
+            /* .guid      = */ guid(),
+            /* .interface = */ interface,
+            /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_tinyblas_reg(), 0),
+            /* .context   = */ ctx,
+        };
+
+        return backend;
+    }
+
+    static bool is_tinyblas(ggml_backend_t backend) {
+        return backend != NULL && ggml_guid_matches(backend->guid, guid());
+    }
+
+    // number of threads to use for compute
+    static void set_pp_threads(ggml_backend_t backend, int n_threads) {
+        GGML_ASSERT(is_tinyblas(backend));
+        context * ctx = (context *)backend->context;
+        //ctx->pp_threads = n_threads;
+    }
+
+    static void set_tg_threads(ggml_backend_t backend, int n_threads) {
+        GGML_ASSERT(is_tinyblas(backend));
+        context * ctx = (context *)backend->context;
+        //ctx->tg_threads = n_threads;
+    }
+
+    static void set_n_threads(ggml_backend_t backend, int n_threads) {
+        GGML_ASSERT(is_tinyblas(backend));
+        context * ctx = (context *)backend->context;
+        ctx->n_threads = n_threads;
+        //ctx->tg_threads
= n_threads; + //ctx->pp_threads = n_threads; + } + +} + +// device interface +namespace ggml::backend::tinyblas::device { + static const char * get_name(ggml_backend_dev_t) { + return "BLAS"; + } + + static const char * get_description(ggml_backend_dev_t) { + return "tinyBLAS"; + } + + static void get_memory(ggml_backend_dev_t, size_t * free, size_t * total) { + // TODO + *free = 0; + *total = 0; + } + + static enum ggml_backend_dev_type get_type(ggml_backend_dev_t) { + return GGML_BACKEND_DEVICE_TYPE_ACCEL; + } + + static void get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + props->name = get_name(dev); + props->description = get_description(dev); + props->type = get_type(dev); + get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* .async = */ false, + /* .host_buffer = */ false, + /* .buffer_from_host_ptr = */ true, + /* .events = */ false, + }; + } + + static ggml_backend_t init_backend(ggml_backend_dev_t, const char *) { + return ggml::backend::tinyblas::init(); + } + + static ggml_backend_buffer_type_t get_buffer_type(ggml_backend_dev_t) { + return ggml_backend_cpu_buffer_type(); + } + + static ggml_backend_buffer_t buffer_from_host_ptr(ggml_backend_dev_t, void * ptr, size_t size, size_t) { + return ggml_backend_cpu_buffer_from_ptr(ptr, size); + } + + static bool supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) { + //const struct ggml_tensor * src0 = op->src[0]; + //const struct ggml_tensor * src1 = op->src[1]; + + switch (op->op) { + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + return true; + case GGML_OP_MUL_MAT: + return supports_mul_mat(device, op); + default: + return false; + } + } + + static bool supports_buft(ggml_backend_dev_t, ggml_backend_buffer_type_t buft) { + return ggml_backend_buft_is_host(buft); + } + + static const struct ggml_backend_device_i interface = { + /* .get_name = */ get_name, + /* .get_description = */ get_description, + /* .get_memory = */ get_memory, + /* .get_type = */ get_type, + /* .get_props = */ get_props, + /* .init_backend = */ init_backend, + /* .get_buffer_type = */ get_buffer_type, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ buffer_from_host_ptr, + /* .supports_op = */ supports_op, + /* .supports_buft = */ supports_buft, + /* .offload_op = */ NULL, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, + }; + +} + +// backend reg interface +namespace ggml::backend::tinyblas::reg { + static const char * get_name(ggml_backend_reg_t) { + return ggml::backend::tinyblas::NAME; + } + + static size_t get_device_count(ggml_backend_reg_t) { + return 1; + } + + static ggml_backend_dev_t get_device(ggml_backend_reg_t reg, size_t index) { + GGML_ASSERT(index == 0); + + static ggml_backend_device device = { + /* .iface = */ ggml::backend::tinyblas::device::interface, + /* .reg = */ reg, + /* .context = */ nullptr, + }; + + return &device; + } + + static void * get_proc_address(ggml_backend_reg_t, const char * name) { + if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) { + return (void *)ggml::backend::tinyblas::set_n_threads; + } + if (std::strcmp(name, "ggml_backend_set_pp_threads") == 0) { + return (void *)ggml::backend::tinyblas::set_pp_threads; + } + if (std::strcmp(name, "ggml_backend_set_tg_threads") == 0) { + return (void *)ggml::backend::tinyblas::set_tg_threads; + } + return NULL; + } + + static const struct ggml_backend_reg_i 
interface = { + /* .get_name = */ get_name, + /* .get_device_count = */ get_device_count, + /* .get_device = */ get_device, + /* .get_proc_address = */ get_proc_address, + }; + +} + +ggml_backend_reg_t ggml_backend_tinyblas_reg(void) { + static struct ggml_backend_reg backend_reg = { + /* .iface = */ ggml::backend::tinyblas::reg::interface, + /* .context = */ NULL, + }; + return &backend_reg; +} diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-tinyblas/sgemm.cpp similarity index 80% rename from ggml/src/ggml-cpu/llamafile/sgemm.cpp rename to ggml/src/ggml-tinyblas/sgemm.cpp index b2ce2e6649479..5c7a3c357ee9f 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-tinyblas/sgemm.cpp @@ -50,8 +50,6 @@ #include "sgemm.h" #include "ggml-impl.h" -// hack until moved into the CPU backend -#include "../ggml-cpu-impl.h" #include "ggml-quants.h" #ifdef _MSC_VER @@ -135,6 +133,16 @@ inline __m512 madd(__m512 a, __m512 b, __m512 c) { return _mm512_fmadd_ps(a, b, c); } #endif +#if defined(__AVX512BF16__) +template <> +inline __m512 madd(__m512bh a, __m512bh b, __m512 c) { + return _mm512_dpbf16_ps(c, a, b); +} +template <> +inline __m256 madd(__m256bh a, __m256bh b, __m256 c) { + return _mm256_dpbf16_ps(c, a, b); +} +#endif #endif #if defined(__ARM_FEATURE_FMA) @@ -226,6 +234,13 @@ template <> inline __m256 load(const float *p) { } #endif // __AVX__ +#if defined(__AVX2__) || defined(__AVX512F__) +template <> inline __m256 load(const ggml_bf16_t *p) { + return _mm256_castsi256_ps( + _mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)p)), 16)); +} +#endif // __AVX2__ + #if defined(__F16C__) template <> inline __m256 load(const ggml_fp16_t *p) { return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)p)); @@ -239,8 +254,27 @@ template <> inline __m512 load(const float *p) { template <> inline __m512 load(const ggml_fp16_t *p) { return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)p)); } +template <> inline __m512 load(const ggml_bf16_t *p) { + return _mm512_castsi512_ps( + _mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)p)), 16)); +} #endif // __AVX512F__ +#if defined(__AVX512BF16__) +template <> inline __m512bh load(const ggml_bf16_t *p) { + return (__m512bh)_mm512_loadu_ps((const float *)p); +} +template <> inline __m256bh load(const ggml_bf16_t *p) { + return (__m256bh)_mm256_loadu_ps((const float *)p); +} +template <> inline __m512bh load(const float *p) { + return _mm512_cvtne2ps_pbh(_mm512_loadu_ps(p + 16), _mm512_loadu_ps(p)); +} +template <> inline __m256bh load(const float *p) { + return _mm512_cvtneps_pbh(_mm512_loadu_ps(p)); +} +#endif + //////////////////////////////////////////////////////////////////////////////////////////////////// // CONSTANTS @@ -1627,259 +1661,395 @@ class tinyBLAS_PPC { #endif } // namespace -/** - * Performs optimized matrix multiplication on CPU. - * - * This subroutine may compute C = Aᵀ * B with column major ordering. - * Despite its name, this isn't a generalized implementation. Work is - * only performed when a handwritten kernel is written and available. - * Otherwise the caller should fall back to a general matmul routine. 
- * - * For example, for single-threaded single-precision GEMM you can say - * - * llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc, - * 0, 1, - * GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32); - * - * @param m is rows in `A` and `C` - * @param n is cols in `B` and `C` - * @param k is cols in `A` and rows in `B` - * @param A is first input matrix (always transposed) - * @param lda is row stride of `A` - * @param B is second input matrix (never transposed) - * @param ldb is row stride of `B` - * @param C is input/output array of output matrices - * @param ldc is row stride of `C` - * @param ith is thread id (must be less than `nth`) - * @param nth is number of threads (must be greater than zero) - * @param Atype is GGML data type of `A` - * @param Btype is GGML data type of `B` - * @param Ctype is GGML data type of `C` - * @return true if this function was able to service the matmul request - */ -bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C, - int64_t ldc, int ith, int nth, int Atype, int Btype, int Ctype) { - - assert(m >= 0); - assert(n >= 0); - assert(k >= 0); - assert(lda >= k); - assert(ldb >= k); - assert(ldc >= m); - assert(nth > 0); - assert(ith < nth); - - // only enable sgemm for prompt processing - if (n < 2) - return false; - - if (Ctype != GGML_TYPE_F32) - return false; - - switch (Atype) { - - case GGML_TYPE_F32: { - if (Btype != GGML_TYPE_F32) - return false; +namespace ggml::backend::tinyblas { + + /** + * Performs optimized matrix multiplication on CPU. + * + * This subroutine may compute C = Aᵀ * B with column major ordering. + * Despite its name, this isn't a generalized implementation. Work is + * only performed when a handwritten kernel is written and available. + * Otherwise the caller should fall back to a general matmul routine. 
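+     *
+     * The template parameter RUN selects between only checking whether a
+     * suitable kernel exists for the given argument types (RUN == false) and
+     * actually performing the multiplication (RUN == true).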
+ * + * For example, for single-threaded single-precision GEMM you can say + * + * llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc, 0, 1); + * + * @param m is rows in `A` and `C` + * @param n is cols in `B` and `C` + * @param k is cols in `A` and rows in `B` + * @param A is first input matrix (always transposed) + * @param lda is row stride of `A` + * @param B is second input matrix (never transposed) + * @param ldb is row stride of `B` + * @param C is input/output array of output matrices + * @param ldc is row stride of `C` + * @param ith is thread id (must be less than `nth`) + * @param nth is number of threads (must be greater than zero) + * @return true if this function was able to service the matmul request + */ + + template + bool gemm(int64_t m, int64_t n, int64_t k, + const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth) { + assert(m >= 0); + assert(n >= 0); + assert(k >= 0); + assert(lda >= k); + assert(ldb >= k); + assert(ldc >= m); + assert(nth > 0); + assert(ith < nth); #if defined(__AVX512F__) - if (k % 16) - return false; - tinyBLAS<16, __m512, __m512, float, float, float> tb{ - k, (const float *)A, lda, - (const float *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; -#elif defined(__AVX__) || defined(__AVX2__) - if (k % 8) - return false; - tinyBLAS<8, __m256, __m256, float, float, float> tb{ - k, (const float *)A, lda, - (const float *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; -#elif defined(__ARM_NEON) - if (n < 4) - return false; - if (k % 4) - return false; - tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{ - k, (const float *)A, lda, - (const float *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; -#elif defined(__MMA__) - if (k % 8) - return false; - tinyBLAS_PPC tb{ - k, (const float *)A, lda, - (const float *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; -#else + if ((k % 16) == 0) { + if constexpr (RUN) { + tinyBLAS<16, __m512, __m512, float, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } +#endif +#if defined(__AVX__) || defined(__AVX2__) + if ((k % 8)==0) { + if constexpr (RUN) { + tinyBLAS<8, __m256, __m256, float, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } +#endif +#if defined(__ARM_NEON) + if ((k % 4) == 0) { + if constexpr (RUN) { + tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } +#endif + // TODO: voir a mettre ca dans un autre fichier... 
+#if defined(__MMA__) + if ((k % 8) == 0) { + if constexpr (RUN) { + tinyBLAS_PPC tb{ k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } +#endif return false; + } + template bool gemm(int64_t m, int64_t n, int64_t k, + const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + template bool gemm(int64_t m, int64_t n, int64_t k, + const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + + template + bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_fp16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth) { + assert(m >= 0); + assert(n >= 0); + assert(k >= 0); + assert(lda >= k); + assert(ldb >= k); + assert(ldc >= m); + assert(nth > 0); + assert(ith < nth); +#if defined(__AVX512F__) + if ((k % 16) == 0) { + if constexpr (RUN) { + tinyBLAS<16, __m512, __m512, ggml_fp16_t, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } #endif +#if (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__) + if ((k % 8) == 0) { + if constexpr (RUN) { + tinyBLAS<8, __m256, __m256, ggml_fp16_t, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } +#endif +#if defined(__ARM_NEON) && !defined(_MSC_VER) + if ((k % 4) == 0) { + if constexpr (RUN) { + tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } +#endif + return false; } - - case GGML_TYPE_F16: { + template bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_fp16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + template bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_fp16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + + template + bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_fp16_t *A, int64_t lda, const ggml_fp16_t *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth) { + assert(m >= 0); + assert(n >= 0); + assert(k >= 0); + assert(lda >= k); + assert(ldb >= k); + assert(ldc >= m); + assert(nth > 0); + assert(ith < nth); #if defined(__AVX512F__) - if (k % 16) - return false; - if (Btype != GGML_TYPE_F32) - return false; - tinyBLAS<16, __m512, __m512, ggml_fp16_t, float, float> tb{ - k, (const ggml_fp16_t *)A, lda, - (const float *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; + if ((k % 16) == 0) { + if constexpr (RUN) { + tinyBLAS<16, __m512, __m512, ggml_fp16_t, ggml_fp16_t, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } +#endif +#if (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__) + if ((k % 8) == 0) { + if constexpr (RUN) { + tinyBLAS<8, __m256, __m256, ggml_fp16_t, ggml_fp16_t, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } +#endif +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER) + if ((k % 8) == 0) { + if constexpr (RUN) { + tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } +#endif + return false; + } + template bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_fp16_t *A, int64_t lda, const ggml_fp16_t *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + template bool gemm(int64_t m, int64_t n, int64_t k, + const 
ggml_fp16_t *A, int64_t lda, const ggml_fp16_t *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + + template + bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_bf16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth) { + assert(m >= 0); + assert(n >= 0); + assert(k >= 0); + assert(lda >= k); + assert(ldb >= k); + assert(ldc >= m); + assert(nth > 0); + assert(ith < nth); +#if defined(__AVX512BF16__) + // wait for convert B => bf16? + //if ((k % 32) == 0) { + // if constexpr (RUN) { + // tinyBLAS<32, __m512, __m512bh, ggml_bf16_t, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + // tb.matmul(m, n); + // } + // return true; + //} +#elif defined(__AVX512F__) + if ((k % 16) == 0) { + if constexpr (RUN) { + tinyBLAS<16, __m512, __m512, ggml_bf16_t, float, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } #elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__) - if (k % 8) - return false; - if (Btype != GGML_TYPE_F32) - return false; - tinyBLAS<8, __m256, __m256, ggml_fp16_t, float, float> tb{ - k, (const ggml_fp16_t *)A, lda, - (const float *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; -#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER) - if (n < 8) - return false; - if (k % 8) - return false; - if (Btype != GGML_TYPE_F16) - return false; - tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{ - k, (const ggml_fp16_t *)A, lda, - (const ggml_fp16_t *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; -#elif defined(__ARM_NEON) && !defined(_MSC_VER) - if (k % 4) - return false; - if (Btype != GGML_TYPE_F32) - return false; - tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{ - k, (const ggml_fp16_t *)A, lda, - (const float *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; -#else + // TODO +#endif return false; + } + template bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_bf16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + template bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_bf16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + + template + bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_bf16_t *A, int64_t lda, const ggml_bf16_t *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth) { + assert(m >= 0); + assert(n >= 0); + assert(k >= 0); + assert(lda >= k); + assert(ldb >= k); + assert(ldc >= m); + assert(nth > 0); + assert(ith < nth); +#if defined(__AVX512BF16__) + if ((k % 32) == 0) { + if constexpr (RUN) { + tinyBLAS<32, __m512, __m512bh, ggml_bf16_t, ggml_bf16_t, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } + // 2eme chance... 
+ if ((k % 16) == 0) { + if constexpr (RUN) { + tinyBLAS<16, __m256, __m256bh, ggml_bf16_t, ggml_bf16_t, float> tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } + return true; + } #endif + return false; } + template bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_bf16_t *A, int64_t lda, const ggml_bf16_t *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + template bool gemm(int64_t m, int64_t n, int64_t k, + const ggml_bf16_t *A, int64_t lda, const ggml_bf16_t *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + + template + bool gemm(int64_t m, int64_t n, int64_t k, + const block_q8_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth) { + assert(m >= 0); + assert(n >= 0); + assert(k >= 0); + assert(lda >= k); + assert(ldb >= k); + assert(ldc >= m); + assert(nth > 0); + assert(ith < nth); - case GGML_TYPE_Q8_0: { - if (Btype != GGML_TYPE_Q8_0) - return false; #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__) - tinyBLAS_Q0_AVX tb{ - k, (const block_q8_0 *)A, lda, - (const block_q8_0 *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); + if constexpr (RUN) { + tinyBLAS_Q0_AVX tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } return true; #elif defined(__ARM_FEATURE_DOTPROD) - tinyBLAS_Q0_ARM tb{ - k, (const block_q8_0 *)A, lda, - (const block_q8_0 *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); + if constexpr (RUN) { + tinyBLAS_Q0_ARM tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } return true; #else return false; #endif } + template bool gemm(int64_t m, int64_t n, int64_t k, + const block_q8_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + template bool gemm(int64_t m, int64_t n, int64_t k, + const block_q8_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + + template + bool gemm(int64_t m, int64_t n, int64_t k, + const block_q4_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth) { + assert(m >= 0); + assert(n >= 0); + assert(k >= 0); + assert(lda >= k); + assert(ldb >= k); + assert(ldc >= m); + assert(nth > 0); + assert(ith < nth); - case GGML_TYPE_Q4_0: { - if (Btype != GGML_TYPE_Q8_0) - return false; #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__) - tinyBLAS_Q0_AVX tb{ - k, (const block_q4_0 *)A, lda, - (const block_q8_0 *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); + if constexpr (RUN) { + tinyBLAS_Q0_AVX tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } return true; #elif defined(__ARM_FEATURE_DOTPROD) - tinyBLAS_Q0_ARM tb{ - k, (const block_q4_0 *)A, lda, - (const block_q8_0 *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); + if constexpr (RUN) { + tinyBLAS_Q0_ARM tb{k, A, lda, B, ldb, C, ldc, ith, nth}; + tb.matmul(m, n); + } return true; #else return false; #endif } + template bool gemm(int64_t m, int64_t n, int64_t k, + const block_q4_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + template bool gemm(int64_t m, int64_t n, int64_t k, + const block_q4_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth); + + template + bool gemm(int64_t m, int64_t n, int64_t k, + const block_q5_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, + int ith, int nth) { + assert(m >= 0); + assert(n >= 0); + assert(k >= 0); + assert(lda >= 
k);
+        assert(ldb >= k);
+        assert(ldc >= m);
+        assert(nth > 0);
+        assert(ith < nth);
-    case GGML_TYPE_Q5_0: {
-        if (Btype != GGML_TYPE_Q8_0)
-            return false;
 #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
-        tinyBLAS_Q0_AVX tb{
-            k, (const block_q5_0 *)A, lda,
-            (const block_q8_0 *)B, ldb,
-            (float *)C, ldc,
-            ith, nth};
-        tb.matmul(m, n);
+        if constexpr (RUN) {
+            tinyBLAS_Q0_AVX tb{k, A, lda, B, ldb, C, ldc, ith, nth};
+            tb.matmul(m, n);
+        }
         return true;
 #else
         return false;
 #endif
     }
-
-    case GGML_TYPE_IQ4_NL: {
-        if (Btype != GGML_TYPE_Q8_0)
-            return false;
+    template bool gemm<true>(int64_t m, int64_t n, int64_t k,
+              const block_q5_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
+              int ith, int nth);
+    template bool gemm<false>(int64_t m, int64_t n, int64_t k,
+              const block_q5_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
+              int ith, int nth);
+
+    template <bool RUN>
+    bool gemm(int64_t m, int64_t n, int64_t k,
+              const block_iq4_nl *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
+              int ith, int nth) {
+        assert(m >= 0);
+        assert(n >= 0);
+        assert(k >= 0);
+        assert(lda >= k);
+        assert(ldb >= k);
+        assert(ldc >= m);
+        assert(nth > 0);
+        assert(ith < nth);
 #if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
-        tinyBLAS_Q0_AVX tb{
-            k, (const block_iq4_nl *)A, lda,
-            (const block_q8_0 *)B, ldb,
-            (float *)C, ldc,
-            ith, nth};
-        tb.matmul(m, n);
+        if constexpr (RUN) {
+            tinyBLAS_Q0_AVX tb{k, A, lda, B, ldb, C, ldc, ith, nth};
+            tb.matmul(m, n);
+        }
         return true;
 #else
         return false;
 #endif
     }
-
-    default:
-        return false;
-    }
-
-    (void)m;
-    (void)n;
-    (void)k;
-    (void)A;
-    (void)lda;
-    (void)B;
-    (void)ldb;
-    (void)C;
-    (void)ldc;
-    (void)ith;
-    (void)nth;
-    (void)Atype;
-    (void)Btype;
-    (void)Ctype;
+    template bool gemm<true>(int64_t m, int64_t n, int64_t k,
+              const block_iq4_nl *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
+              int ith, int nth);
+    template bool gemm<false>(int64_t m, int64_t n, int64_t k,
+              const block_iq4_nl *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
+              int ith, int nth);
 }
diff --git a/ggml/src/ggml-tinyblas/sgemm.h b/ggml/src/ggml-tinyblas/sgemm.h
new file mode 100644
index 0000000000000..88c014e3ebb1e
--- /dev/null
+++ b/ggml/src/ggml-tinyblas/sgemm.h
@@ -0,0 +1,51 @@
+#pragma once
+//#include
+#include "ggml.h"
+#define GGML_COMMON_DECL_C
+//#define GGML_COMMON_DECL_CPP
+#include "ggml-common.h"
+
+// only called from C++ (the tinyBLAS backend)
+
+namespace ggml::backend::tinyblas {
+
+    // we are in C++
+    // => we can have as many functions as there are types.
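+    // gemm<false>(...) only reports whether a kernel is available for the given
+    // types and k; gemm<true>(...) actually performs the multiplication.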
+    // computes C = Aᵀ * B
+    template <bool RUN>
+    bool gemm(int64_t m, int64_t n, int64_t k,
+              const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
+              int ith=0, int nth=1);
+    template <bool RUN>
+    bool gemm(int64_t m, int64_t n, int64_t k,
+              const ggml_fp16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
+              int ith=0, int nth=1);
+    template <bool RUN>
+    bool gemm(int64_t m, int64_t n, int64_t k,
+              const ggml_fp16_t *A, int64_t lda, const ggml_fp16_t *B, int64_t ldb, float *C, int64_t ldc,
+              int ith=0, int nth=1);
+    template <bool RUN>
+    bool gemm(int64_t m, int64_t n, int64_t k,
+              const ggml_bf16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc,
+              int ith=0, int nth=1);
+    template <bool RUN>
+    bool gemm(int64_t m, int64_t n, int64_t k,
+              const ggml_bf16_t *A, int64_t lda, const ggml_bf16_t *B, int64_t ldb, float *C, int64_t ldc,
+              int ith=0, int nth=1);
+    template <bool RUN>
+    bool gemm(int64_t m, int64_t n, int64_t k,
+              const block_q8_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
+              int ith=0, int nth=1);
+    template <bool RUN>
+    bool gemm(int64_t m, int64_t n, int64_t k,
+              const block_q4_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
+              int ith=0, int nth=1);
+    template <bool RUN>
+    bool gemm(int64_t m, int64_t n, int64_t k,
+              const block_q5_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
+              int ith=0, int nth=1);
+    template <bool RUN>
+    bool gemm(int64_t m, int64_t n, int64_t k,
+              const block_iq4_nl *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc,
+              int ith=0, int nth=1);
+}
diff --git a/src/llama.cpp b/src/llama.cpp
index 1703104fb3680..d6c7cd08ef2f1 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -22034,7 +22034,6 @@ const char * llama_print_system_info(void) {
     s += "SSSE3 = "       + std::to_string(ggml_cpu_has_ssse3())       + " | ";
     s += "VSX = "         + std::to_string(ggml_cpu_has_vsx())         + " | ";
     s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
-    s += "LLAMAFILE = "   + std::to_string(ggml_cpu_has_llamafile())   + " | ";

     return s.c_str();
 }

From dda8847636a60078bc0ffb3314ed9b2198d4faa1 Mon Sep 17 00:00:00 2001
From: Djip007 <3705339+Djip007@users.noreply.github.com>
Date: Sat, 16 Nov 2024 22:30:02 +0100
Subject: [PATCH 2/3] some cleanup with tinyblas backend

---
 CMakeLists.txt                           |  4 +-
 docs/android.md                          |  2 +-
 docs/build.md                            |  4 +-
 ggml/CMakeLists.txt                      |  8 +-
 ggml/src/CMakeLists.txt                  |  1 +
 ggml/src/ggml-backend-reg.cpp            |  8 +-
 ggml/src/ggml-common.h                   | 58 ++++++++++----
 ggml/src/ggml-cpu/CMakeLists.txt         | 10 ---
 ggml/src/ggml-cpu/ggml-cpu.c             | 59 --------------
 ggml/src/ggml-tinyblas/CMakeLists.txt    |  6 ++
 ggml/src/ggml-tinyblas/ggml-tinyblas.cpp | 96 +++++++++++++----------
 ggml/src/ggml-tinyblas/sgemm.cpp         | 99 ++++++++++++++++++++++++
 ggml/src/ggml-tinyblas/sgemm.h           | 55 +++++++++++--
 13 files changed, 262 insertions(+), 148 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 93c60ef431d66..ff62b3cba602c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -84,8 +84,8 @@ set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
 set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})

 # change the default for these ggml options
-if (NOT DEFINED GGML_LLAMAFILE)
-    set(GGML_LLAMAFILE_DEFAULT ON)
+if (NOT DEFINED GGML_TINYBLAS)
+    set(GGML_TINYBLAS ON)
 endif()

 if (NOT DEFINED GGML_AMX)
diff --git a/docs/android.md b/docs/android.md
index 320b62240382f..ac3ecb43ddd22 100644
--- a/docs/android.md
+++ b/docs/android.md
@@ -45,7 +45,7 @@ $ cmake \
   -DCMAKE_C_FLAGS="-march=armv8.7a" \
   -DCMAKE_CXX_FLAGS="-march=armv8.7a" \
-DGGML_OPENMP=OFF \ - -DGGML_LLAMAFILE=OFF \ + -DGGML_TINYBLAS=OFF \ -B build-android ``` diff --git a/docs/build.md b/docs/build.md index 52de2b4e2c224..538490a17b493 100644 --- a/docs/build.md +++ b/docs/build.md @@ -42,7 +42,7 @@ In order to build llama.cpp you have four different options. **Notes**: - - For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`. + - For `Q4_0_4_4` quantization type build, add the `-DGGML_TINYBLAS=OFF` cmake option. For example, use `cmake -B build -DGGML_TINYBLAS=OFF`. - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel. - For faster repeated compilation, install [ccache](https://ccache.dev/). - For debug builds, there are two cases: @@ -393,4 +393,4 @@ To read documentation for how to build on Android, [click here](./android.md) Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats. -To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`). +To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_TINYBLAS=OFF` (`cmake`). 
diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 4fb78e59fa72c..cc9d277a9c81e 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -57,8 +57,8 @@ else() endif() # defaults -if (NOT GGML_LLAMAFILE_DEFAULT) - set(GGML_LLAMAFILE_DEFAULT OFF) +if (NOT GGML_TINYBLAS_DEFAULT) + set(GGML_TINYBLAS_DEFAULT OFF) endif() if (NOT GGML_CUDA_GRAPHS_DEFAULT) @@ -124,8 +124,7 @@ option(GGML_ACCELERATE "ggml: enable Accelerate framework" option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT}) set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING "ggml: BLAS library vendor") -option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT}) - +option(GGML_TINYBLAS "ggml: use TINYBLAS" OFF) option(GGML_CUDA "ggml: use CUDA" OFF) option(GGML_MUSA "ggml: use MUSA" OFF) option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF) @@ -231,6 +230,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-metal.h include/ggml-rpc.h include/ggml-sycl.h + include/ggml-tinyblas.h include/ggml-vulkan.h) set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 71934c6791e32..33d494dd7a524 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -256,6 +256,7 @@ ggml_add_backend(Kompute) ggml_add_backend(METAL) ggml_add_backend(RPC) ggml_add_backend(SYCL) +ggml_add_backend(TINYBLAS) ggml_add_backend(Vulkan) ggml_add_backend(MUSA) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 78bcb6c5ca5d9..233debb12bda4 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -91,10 +91,12 @@ struct ggml_backend_registry { return; } -#ifndef NDEBUG - GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n", + GGML_LOG_INFO("%s: registered backend %s (%zu devices)\n", __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg)); -#endif +//#ifndef NDEBUG +// GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n", +// __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg)); +//#endif backends.push_back(reg); for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) { register_device(ggml_backend_reg_dev_get(reg, i)); diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index 050161393456e..51c36ff571fde 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -6,7 +6,20 @@ typedef uint16_t ggml_half; typedef uint32_t ggml_half2; -#define GGML_COMMON_AGGR +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S + +#define GGML_COMMON_DECL +#elif defined(GGML_COMMON_DECL_CPP) +#include + +typedef uint16_t ggml_half; +typedef uint32_t ggml_half2; + +// std-c++ allow anonymous unions but some compiler warn on it +#define GGML_COMMON_AGGR_U data +// std-c++ do not allow it. 
+#define GGML_COMMON_AGGR_S data #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_METAL) @@ -15,7 +28,8 @@ typedef uint32_t ggml_half2; typedef half ggml_half; typedef half2 ggml_half2; -#define GGML_COMMON_AGGR +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_CUDA) @@ -29,7 +43,8 @@ typedef half2 ggml_half2; typedef half ggml_half; typedef half2 ggml_half2; -#define GGML_COMMON_AGGR data +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S data #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_HIP) @@ -39,7 +54,8 @@ typedef half2 ggml_half2; typedef half ggml_half; typedef half2 ggml_half2; -#define GGML_COMMON_AGGR data +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S data #define GGML_COMMON_DECL #elif defined(GGML_COMMON_DECL_SYCL) @@ -49,7 +65,8 @@ typedef half2 ggml_half2; typedef sycl::half ggml_half; typedef sycl::half2 ggml_half2; -#define GGML_COMMON_AGGR data +#define GGML_COMMON_AGGR_U +#define GGML_COMMON_AGGR_S data #define GGML_COMMON_DECL #endif @@ -154,9 +171,9 @@ typedef struct { struct { ggml_half d; // delta ggml_half m; // min - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; uint8_t qs[QK4_1 / 2]; // nibbles / quants } block_q4_1; static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding"); @@ -175,9 +192,9 @@ typedef struct { struct { ggml_half d; // delta ggml_half m; // min - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; uint8_t qh[4]; // 5-th bit of quants uint8_t qs[QK5_1 / 2]; // nibbles / quants } block_q5_1; @@ -196,9 +213,9 @@ typedef struct { struct { ggml_half d; // delta ggml_half s; // d * sum(qs[i]) - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 ds; - }; + } GGML_COMMON_AGGR_U; int8_t qs[QK8_1]; // quants } block_q8_1; static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding"); @@ -261,9 +278,9 @@ typedef struct { struct { ggml_half d; // super-block scale for quantized scales ggml_half dmin; // super-block scale for quantized mins - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; } block_q2_K; static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding"); @@ -288,9 +305,9 @@ typedef struct { struct { ggml_half d; // super-block scale for quantized scales ggml_half dmin; // super-block scale for quantized mins - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits uint8_t qs[QK_K/2]; // 4--bit quants } block_q4_K; @@ -305,9 +322,9 @@ typedef struct { struct { ggml_half d; // super-block scale for quantized scales ggml_half dmin; // super-block scale for quantized mins - } GGML_COMMON_AGGR; + } GGML_COMMON_AGGR_S; ggml_half2 dm; - }; + } GGML_COMMON_AGGR_U; uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits uint8_t qh[QK_K/8]; // quants, high bit uint8_t qs[QK_K/2]; // quants, low 4 bits @@ -431,6 +448,13 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_ #define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = { #define GGML_TABLE_END() }; +#define GGML_COMMON_IMPL +#elif defined(GGML_COMMON_IMPL_CPP) +#include + +#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = { +#define GGML_TABLE_END() }; + #define 
GGML_COMMON_IMPL #elif defined(GGML_COMMON_IMPL_METAL) #include diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index cef41a0743cef..03c7607b533d2 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -44,16 +44,6 @@ if (GGML_OPENMP) endif() endif() -if (GGML_LLAMAFILE) - message(STATUS "Using llamafile") - - add_compile_definitions(GGML_USE_LLAMAFILE) - - target_sources(ggml-cpu PRIVATE - llamafile/sgemm.cpp - llamafile/sgemm.h) -endif() - if (GGML_CPU_HBM) find_library(memkind memkind REQUIRED) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 7f5c465df068c..37a11449c155c 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -39,14 +39,6 @@ #include #endif -#if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8) -#undef GGML_USE_LLAMAFILE -#endif - -#ifdef GGML_USE_LLAMAFILE -#include "llamafile/sgemm.h" -#endif - #if defined(_MSC_VER) // disable "possible loss of data" to avoid hundreds of casts // we should just be careful :) @@ -7466,33 +7458,6 @@ static void ggml_compute_forward_mul_mat( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if GGML_USE_LLAMAFILE - // broadcast factors - const int64_t r2 = ne12 / ne02; - const int64_t r3 = ne13 / ne03; - - const bool src1_cont = ggml_is_contiguous(src1); - - if (src1_cont) { - for (int64_t i13 = 0; i13 < ne13; i13++) - for (int64_t i12 = 0; i12 < ne12; i12++) - if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type), - (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, - nb01/ggml_type_size(type), - (const char *)src1->data + i12*nb12 + i13*nb13, - nb11/ggml_type_size(src1->type), - (char *)dst->data + i12*nb2 + i13*nb3, - nb1/ggml_type_size(dst->type), - ith, nth, - type, - src1->type, - dst->type)) - goto UseGgmlGemm1; - return; - } -UseGgmlGemm1:; -#endif - if (src1->type != vec_dot_type) { char * wdata = params->wdata; @@ -7530,30 +7495,6 @@ UseGgmlGemm1:; ggml_barrier(params->threadpool); -#if GGML_USE_LLAMAFILE - if (src1->type != vec_dot_type) { - const void* wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; - const size_t row_size = ggml_row_size(vec_dot_type, ne10); - - for (int64_t i13 = 0; i13 < ne13; i13++) - for (int64_t i12 = 0; i12 < ne12; i12++) - if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type), - (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, - nb01/ggml_type_size(type), - (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size, - row_size/ggml_type_size(vec_dot_type), - (char *)dst->data + i12*nb2 + i13*nb3, - nb1/ggml_type_size(dst->type), - ith, nth, - type, - vec_dot_type, - dst->type)) - goto UseGgmlGemm2; - return; - } -UseGgmlGemm2:; -#endif - // This is the size of the first dimension of the result, so we can iterate that way. 
(see the ASSERT above, these are the same numbers) const int64_t nr0 = ne0; diff --git a/ggml/src/ggml-tinyblas/CMakeLists.txt b/ggml/src/ggml-tinyblas/CMakeLists.txt index c8c4fd04e0f29..2b197f511743b 100644 --- a/ggml/src/ggml-tinyblas/CMakeLists.txt +++ b/ggml/src/ggml-tinyblas/CMakeLists.txt @@ -1,3 +1,5 @@ +message(STATUS "Using TINYBLAS") + add_library(ggml-tinyblas ggml-tinyblas.cpp ) @@ -225,6 +227,10 @@ endif() target_compile_options(ggml-tinyblas PRIVATE "$<$:${ARCH_FLAGS}>") target_compile_options(ggml-tinyblas PRIVATE "$<$:${ARCH_FLAGS}>") +#set_source_files_properties( ${GGML_SOURCES_FP8} PROPERTIES CXX_STANDARD 17) +#set_source_files_properties( ${GGML_SOURCES_FP8} PROPERTIES COMPILE_FLAGS "-std=c++17") +target_compile_features (ggml-tinyblas PRIVATE cxx_std_17) + if (EMSCRIPTEN) set_target_properties(ggml-tinyblas PROPERTIES COMPILE_FLAGS "-msimd128") endif() diff --git a/ggml/src/ggml-tinyblas/ggml-tinyblas.cpp b/ggml/src/ggml-tinyblas/ggml-tinyblas.cpp index 7317b5dd352dc..5d0704289534e 100644 --- a/ggml/src/ggml-tinyblas/ggml-tinyblas.cpp +++ b/ggml/src/ggml-tinyblas/ggml-tinyblas.cpp @@ -1,3 +1,48 @@ +// Copyright 2024 Mozilla Foundation +// +// Permission is hereby granted, free of charge, to any person obtaining +// a copy of this software and associated documentation files (the +// "Software"), to deal in the Software without restriction, including +// without limitation the rights to use, copy, modify, merge, publish, +// distribute, sublicense, and/or sell copies of the Software, and to +// permit persons to whom the Software is furnished to do so, subject to +// the following conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +// +// _ _ ___ _ _ ___ +// | |_(_)_ _ _ _| _ ) | /_\ / __| +// | _| | ' \ || | _ \ |__ / _ \\__ \. +// \__|_|_||_\_, |___/____/_/ \_\___/ +// |__/ +// +// BASIC LINEAR ALGEBRA SUBPROGRAMS +// +// +// This file implements multithreaded CPU matrix multiplication for the +// common contiguous use case C = Aᵀ * B. These kernels are designed to +// have excellent performance[1] for matrices that fit in the CPU cache +// without imposing any overhead such as cache filling or malloc calls. +// +// This implementation does not guarantee any upper bound with rounding +// errors, which grow along with k. Our goal's to maximally exploit the +// hardware for performance, and then use whatever resources remain for +// improving numerical accuracy. +// +// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online]. +// Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024]. 
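[Editor's note, not part of the patch] For readers new to this code path, below is a minimal standalone C++ sketch of the C = Aᵀ * B convention the header comment above describes, under the storage layout these kernels assume: A holds m rows of length k with row stride lda, B holds n rows of length k with row stride ldb, and C[j*ldc + i] receives the dot product of row i of A and row j of B. The name sgemm_reference and the simple row striping across ith/nth are illustrative only; the real kernels in sgemm.cpp tile and vectorize this loop nest.

#include <cstdint>

// Illustrative reference only (assumed layout, not the optimized kernel):
//   C[j*ldc + i] = sum over l of A[i*lda + l] * B[j*ldb + l]
// Thread ith of nth fills a strided subset of the rows of C.
static void sgemm_reference(int64_t m, int64_t n, int64_t k,
                            const float *A, int64_t lda,
                            const float *B, int64_t ldb,
                            float       *C, int64_t ldc,
                            int ith, int nth) {
    for (int64_t j = 0; j < n; ++j) {                // columns of C
        for (int64_t i = ith; i < m; i += nth) {     // rows of C, striped across threads
            float sum = 0.0f;
            for (int64_t l = 0; l < k; ++l) {
                sum += A[i*lda + l] * B[j*ldb + l];  // dot(row i of A, row j of B)
            }
            C[j*ldc + i] = sum;
        }
    }
}

Note that the actual gemm templates further down return false when no specialized kernel is available for the current build and types, so the caller (mul_mat in ggml-tinyblas.cpp) can fall back to its generic path.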
+ #include "ggml-cpu.h" #include "ggml-impl.h" #include "ggml-tinyblas.h" @@ -7,8 +52,9 @@ #include #include -#include +// TODO: see how to use threads/pool for all backend: ggml_graph_compute / ggml_threadpool +// https://github.com/ggerganov/llama.cpp/pull/1999 #ifdef GGML_USE_OPENMP #include #endif @@ -21,8 +67,6 @@ namespace ggml::backend::tinyblas { int n_threads = GGML_DEFAULT_N_THREADS; std::unique_ptr work_data; size_t work_size = 0; - //int pp_threads = GGML_DEFAULT_N_THREADS; - //int tg_threads = GGML_DEFAULT_N_THREADS; }; template @@ -112,7 +156,7 @@ namespace ggml::backend::tinyblas { } } - // apres conversion de B: FP32 => src0->vec_dot_type + // after convert B: FP32 => src0->vec_dot_type enum ggml_type const vec_dot_type = ggml_get_type_traits_cpu(src0->type)->vec_dot_type; if ((src1->type != vec_dot_type) && (src1->type == GGML_TYPE_F32)) { if (mul_mat(ne01, ne11, ne00/ggml_blck_size(src0->type), @@ -120,7 +164,7 @@ namespace ggml::backend::tinyblas { src1->data, nb11/ggml_type_size(src1->type), dst->data, nb1/ggml_type_size(dst->type), 0, 1, src0->type, vec_dot_type, GGML_TYPE_F32)) { - // @ voir ca aurait etait bien de redimensioner work_data ici.. + // TODO: how to resize work_data here return true; } } @@ -136,7 +180,6 @@ namespace ggml::backend::tinyblas { const enum ggml_type type0 = src0->type; const enum ggml_type type1 = src1->type; - // les type "directs" // broadcast factors const int64_t r2 = ne12 / ne02; const int64_t r3 = ne13 / ne03; @@ -160,21 +203,18 @@ namespace ggml::backend::tinyblas { } UseGgmlGemm1:; - // apres conversion de B ? + // with B converted from FP32 -> vec_dot_type GGML_ASSERT(src1->type == GGML_TYPE_F32); // for use 'from_float' enum ggml_type const vec_dot_type = ggml_get_type_traits_cpu(type0)->vec_dot_type; ggml_from_float_t const from_float = ggml_get_type_traits_cpu(vec_dot_type)->from_float; - // auto const type_size = ggml_get_type_traits(vec_dot_type)->type_size; if (src1->type != vec_dot_type) { - // OK on va au moins essayer de changer le type de B - const size_t nbw1 = ggml_row_size(vec_dot_type, ne10); // const size_t row_size = ggml_row_size(vec_dot_type, ne10); const size_t nbw2 = nbw1*ne11; const size_t nbw3 = nbw2*ne12; - // TOD0: vor si on peu caller ca dans supports_mul_mat + // TODO: move to: supports_mul_mat if ((ith == 0) && (ctx->work_size < ne13*nbw3)) { ctx->work_data.reset(new char[ne13*nbw3]); ctx->work_size = ne13*nbw3; @@ -182,7 +222,7 @@ namespace ggml::backend::tinyblas { #ifdef GGML_USE_OPENMP #pragma omp barrier #else - static_assert(false, "Note implemented: use GGML_USE_OPENMP"); + static_assert(false, "Not implemented: use GGML_USE_OPENMP"); #endif char * wdata = ctx->work_data.get(); @@ -200,7 +240,7 @@ namespace ggml::backend::tinyblas { #ifdef GGML_USE_OPENMP #pragma omp barrier #else - static_assert(false, "Note implemented: use GGML_USE_OPENMP"); + static_assert(false, "Not implemented: use GGML_USE_OPENMP"); #endif // mat-mul bis... for (int64_t i13 = 0; i13 < ne13; i13++) @@ -232,10 +272,6 @@ namespace ggml::backend::tinyblas { delete backend; } - // TODO: voir comment gerer les threads / pool ... pour tous les backends qui en ont besoin... 
- // - voir ggml_graph_compute / ggml_threadpool - // https://github.com/ggerganov/llama.cpp/pull/1999 - // static enum ggml_status graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { context * ctx = (context *)backend->context; @@ -252,7 +288,7 @@ namespace ggml::backend::tinyblas { mul_mat(ctx, node, ith, nth); } #else - static_assert(false, "Note implemented: use GGML_USE_OPENMP"); + static_assert(false, "Not implemented: use GGML_USE_OPENMP"); mul_mat(ctx, node, 0, 1); #endif break; @@ -309,25 +345,10 @@ namespace ggml::backend::tinyblas { return backend != NULL && ggml_guid_matches(backend->guid, guid()); } - // number of threads to use for compute - static void set_pp_threads(ggml_backend_t backend, int n_threads) { - GGML_ASSERT(is_tinyblas(backend)); - context * ctx = (context *)backend->context; - //ctx->pp_threads = n_threads; - } - - static void set_tg_threads(ggml_backend_t backend, int n_threads) { - GGML_ASSERT(is_tinyblas(backend)); - context * ctx = (context *)backend->context; - //ctx->tg_threads = n_threads; - } - static void set_n_threads(ggml_backend_t backend, int n_threads) { GGML_ASSERT(is_tinyblas(backend)); context * ctx = (context *)backend->context; ctx->n_threads = n_threads; - //ctx->tg_threads = n_threads; - //ctx->pp_threads = n_threads; } } @@ -378,9 +399,6 @@ namespace ggml::backend::tinyblas::device { } static bool supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op) { - //const struct ggml_tensor * src0 = op->src[0]; - //const struct ggml_tensor * src1 = op->src[1]; - switch (op->op) { case GGML_OP_NONE: case GGML_OP_RESHAPE: @@ -445,12 +463,6 @@ namespace ggml::backend::tinyblas::reg { if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) { return (void *)ggml::backend::tinyblas::set_n_threads; } - if (std::strcmp(name, "ggml_backend_set_pp_threads") == 0) { - return (void *)ggml::backend::tinyblas::set_pp_threads; - } - if (std::strcmp(name, "ggml_backend_set_tg_threads") == 0) { - return (void *)ggml::backend::tinyblas::set_tg_threads; - } return NULL; } diff --git a/ggml/src/ggml-tinyblas/sgemm.cpp b/ggml/src/ggml-tinyblas/sgemm.cpp index 5c7a3c357ee9f..b82ae3f84be49 100644 --- a/ggml/src/ggml-tinyblas/sgemm.cpp +++ b/ggml/src/ggml-tinyblas/sgemm.cpp @@ -1739,6 +1739,17 @@ namespace ggml::backend::tinyblas { } #endif return false; + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, @@ -1787,6 +1798,17 @@ namespace ggml::backend::tinyblas { } #endif return false; + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const ggml_fp16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, @@ -1835,6 +1857,17 @@ namespace ggml::backend::tinyblas { } #endif return false; + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const ggml_fp16_t *A, int64_t lda, const ggml_fp16_t *B, int64_t ldb, float *C, int64_t 
ldc, @@ -1876,6 +1909,17 @@ namespace ggml::backend::tinyblas { // TODO #endif return false; + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const ggml_bf16_t *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, @@ -1914,6 +1958,17 @@ namespace ggml::backend::tinyblas { } #endif return false; + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const ggml_bf16_t *A, int64_t lda, const ggml_bf16_t *B, int64_t ldb, float *C, int64_t ldc, @@ -1950,6 +2005,17 @@ namespace ggml::backend::tinyblas { #else return false; #endif + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const block_q8_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, @@ -1986,6 +2052,17 @@ namespace ggml::backend::tinyblas { #else return false; #endif + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const block_q4_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, @@ -2016,6 +2093,17 @@ namespace ggml::backend::tinyblas { #else return false; #endif + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const block_q5_0 *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, @@ -2045,6 +2133,17 @@ namespace ggml::backend::tinyblas { #else return false; #endif + GGML_UNUSED(m); + GGML_UNUSED(n); + GGML_UNUSED(k); + GGML_UNUSED(A); + GGML_UNUSED(lda); + GGML_UNUSED(B); + GGML_UNUSED(ldb); + GGML_UNUSED(C); + GGML_UNUSED(ldc); + GGML_UNUSED(ith); + GGML_UNUSED(nth); } template bool gemm(int64_t m, int64_t n, int64_t k, const block_iq4_nl *A, int64_t lda, const block_q8_0 *B, int64_t ldb, float *C, int64_t ldc, diff --git a/ggml/src/ggml-tinyblas/sgemm.h b/ggml/src/ggml-tinyblas/sgemm.h index 88c014e3ebb1e..18bf5230e333d 100644 --- a/ggml/src/ggml-tinyblas/sgemm.h +++ b/ggml/src/ggml-tinyblas/sgemm.h @@ -1,17 +1,56 @@ +// Copyright 2024 Mozilla Foundation +// +// Permission is hereby granted, free of charge, to any person obtaining +// a copy of this software and associated documentation files (the +// "Software"), to deal in the Software without restriction, including +// without limitation the rights to use, copy, modify, merge, publish, +// distribute, sublicense, and/or sell copies of the Software, and to +// permit persons to whom the Software is furnished to do so, subject to +// the following conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +// +// _ _ ___ _ _ ___ +// | |_(_)_ _ _ _| _ ) | /_\ / __| +// | _| | ' \ || | _ \ |__ / _ \\__ \. +// \__|_|_||_\_, |___/____/_/ \_\___/ +// |__/ +// +// BASIC LINEAR ALGEBRA SUBPROGRAMS +// +// +// This file implements multithreaded CPU matrix multiplication for the +// common contiguous use case C = Aᵀ * B. These kernels are designed to +// have excellent performance[1] for matrices that fit in the CPU cache +// without imposing any overhead such as cache filling or malloc calls. +// +// This implementation does not guarantee any upper bound with rounding +// errors, which grow along with k. Our goal's to maximally exploit the +// hardware for performance, and then use whatever resources remain for +// improving numerical accuracy. +// +// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online]. +// Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024]. + #pragma once -//#include #include "ggml.h" -#define GGML_COMMON_DECL_C -//#define GGML_COMMON_DECL_CPP +#define GGML_COMMON_DECL_CPP #include "ggml-common.h" -// appelé que depuis du c++ (le tinyBLAS backend) - namespace ggml::backend::tinyblas { - // on est en C++ - // => on peu avoir autant de fonction que de type. - // calcule C = Aᵀ * B + // compute: C = Aᵀ * B template bool gemm(int64_t m, int64_t n, int64_t k, const float *A, int64_t lda, const float *B, int64_t ldb, float *C, int64_t ldc, From a3822fb59b9649a7dc1b916cd9c05950b5adfba7 Mon Sep 17 00:00:00 2001 From: Djip007 <3705339+Djip007@users.noreply.github.com> Date: Sun, 17 Nov 2024 00:49:52 +0100 Subject: [PATCH 3/3] update Makefile --- Makefile | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/Makefile b/Makefile index fa94c3bf39072..5a35d05617999 100644 --- a/Makefile +++ b/Makefile @@ -569,7 +569,7 @@ endif # GGML_NVPL ifndef GGML_NO_LLAMAFILE MK_CPPFLAGS += -DGGML_USE_TINYBLAS - OBJ_GGML_EXT += ggml/src/ggml-tinyblas/ggml-tinyblas.o ggml/src/ggml-tinyblas/sgemm.o + OBJ_GGML_EXT += ggml/src/ggml-tinyblas/ggml-tinyblas-cpp17.o ggml/src/ggml-tinyblas/sgemm-cpp17.o endif ifndef GGML_NO_AMX @@ -1153,22 +1153,9 @@ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \ ggml/src/ggml-impl.h $(CXX) $(CXXFLAGS) -c $< -o $@ -# TODO: renomer en GGML_NO_TINYBLAS -# needed for c++17 build -ifndef GGML_NO_LLAMAFILE -ggml/src/ggml-tinyblas/ggml-tinyblas.o: \ - ggml/src/ggml-tinyblas/ggml-tinyblas.cpp \ - ggml/include/ggml-tinyblas.h \ - ggml/src/ggml-tinyblas/sgemm.h \ - ggml/include/ggml.h - $(CXX) $(CXXFLAGS) -std=c++17 -c $< -o $@ - -ggml/src/ggml-tinyblas/sgemm.o: \ - ggml/src/ggml-tinyblas/sgemm.cpp \ - ggml/src/ggml-tinyblas/sgemm.h \ - ggml/include/ggml.h - $(CXX) $(CXXFLAGS) -std=c++17 -c $< -o $@ -endif # GGML_NO_LLAMAFILE +# for c++17 build +$(DIR_GGML)/%-cpp17.o: $(DIR_GGML)/%.cpp + $(CXX) $(CXXFLAGS) -MMD -std=c++17 -c $< -o $@ # Rules for building object files $(DIR_GGML)/%.o: $(DIR_GGML)/%.c