Refactor/tinyblas #10343

Draft: wants to merge 3 commits into base `master`.

Changes from all commits
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,8 @@ set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})

# change the default for these ggml options
if (NOT DEFINED GGML_LLAMAFILE)
set(GGML_LLAMAFILE_DEFAULT ON)
if (NOT DEFINED GGML_TINYBLAS)
set(GGML_TINYBLAS ON)
endif()

if (NOT DEFINED GGML_AMX)
Expand Down
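This flips the build default from the llamafile SGEMM path to the new tinyblas backend, and the mechanism changes too: the old code set `GGML_LLAMAFILE_DEFAULT`, which `ggml/CMakeLists.txt` consumed as the `option()` default, whereas the new code sets `GGML_TINYBLAS` directly (the `option()` default further down in this diff stays OFF). Either way, `cmake -B build -DGGML_TINYBLAS=OFF` opts back out.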
8 changes: 6 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -568,8 +568,8 @@ ifdef GGML_NVPL
endif # GGML_NVPL

ifndef GGML_NO_LLAMAFILE
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
OBJ_GGML_EXT += ggml/src/ggml-cpu/llamafile/sgemm.o
MK_CPPFLAGS += -DGGML_USE_TINYBLAS
OBJ_GGML_EXT += ggml/src/ggml-tinyblas/ggml-tinyblas-cpp17.o ggml/src/ggml-tinyblas/sgemm-cpp17.o
endif

ifndef GGML_NO_AMX
Expand Down Expand Up @@ -1153,6 +1153,10 @@ $(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \
ggml/src/ggml-impl.h
$(CXX) $(CXXFLAGS) -c $< -o $@

# for c++17 build
$(DIR_GGML)/%-cpp17.o: $(DIR_GGML)/%.cpp
$(CXX) $(CXXFLAGS) -MMD -std=c++17 -c $< -o $@

# Rules for building object files
$(DIR_GGML)/%.o: $(DIR_GGML)/%.c
$(CC) $(CFLAGS) -MMD -c $< -o $@
Expand Down
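The new `%-cpp17.o` pattern rule compiles the matching `.cpp` with `-std=c++17` appended, so the `ggml-tinyblas-cpp17.o` and `sgemm-cpp17.o` objects in `OBJ_GGML_EXT` above are produced from `ggml/src/ggml-tinyblas/ggml-tinyblas.cpp` and `sgemm.cpp` under C++17 while the rest of the tree keeps the default standard.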
2 changes: 1 addition & 1 deletion docs/android.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ $ cmake \
-DCMAKE_C_FLAGS="-march=armv8.7a" \
-DCMAKE_CXX_FLAGS="-march=armv8.7a" \
-DGGML_OPENMP=OFF \
-DGGML_LLAMAFILE=OFF \
-DGGML_TINYBLAS=OFF \
-B build-android
```

Expand Down
4 changes: 2 additions & 2 deletions docs/build.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ In order to build llama.cpp you have four different options.

**Notes**:

- For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
- For `Q4_0_4_4` quantization type build, add the `-DGGML_TINYBLAS=OFF` cmake option. For example, use `cmake -B build -DGGML_TINYBLAS=OFF`.
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/).
- For debug builds, there are two cases:
Expand Down Expand Up @@ -393,4 +393,4 @@ To read documentation for how to build on Android, [click here](./android.md)

Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.

To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).
To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_TINYBLAS=OFF` (`cmake`).
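Note that the `make` switch keeps its old name: per the Makefile hunk above, the tinyblas objects are still guarded by `ifndef GGML_NO_LLAMAFILE`, so only the cmake option is renamed in this diff.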
8 changes: 4 additions & 4 deletions ggml/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ else()
endif()

# defaults
if (NOT GGML_LLAMAFILE_DEFAULT)
set(GGML_LLAMAFILE_DEFAULT OFF)
if (NOT GGML_TINYBLAS_DEFAULT)
set(GGML_TINYBLAS_DEFAULT OFF)
endif()

if (NOT GGML_CUDA_GRAPHS_DEFAULT)
Expand Down Expand Up @@ -124,8 +124,7 @@ option(GGML_ACCELERATE "ggml: enable Accelerate framework"
option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT})
set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
"ggml: BLAS library vendor")
option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT})

option(GGML_TINYBLAS "ggml: use TINYBLAS" OFF)
option(GGML_CUDA "ggml: use CUDA" OFF)
option(GGML_MUSA "ggml: use MUSA" OFF)
option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
Expand Down Expand Up @@ -231,6 +230,7 @@ set(GGML_PUBLIC_HEADERS
include/ggml-metal.h
include/ggml-rpc.h
include/ggml-sycl.h
include/ggml-tinyblas.h
include/ggml-vulkan.h)

set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
Expand Down
1 change: 0 additions & 1 deletion ggml/include/ggml-cpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,6 @@ extern "C" {
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);

// Internal types and functions exposed for tests and benchmarks

Expand Down
17 changes: 17 additions & 0 deletions ggml/include/ggml-tinyblas.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#pragma once

#include "ggml.h"
#include "ggml-backend.h"


#ifdef __cplusplus
extern "C" {
#endif

// backend register
GGML_API ggml_backend_reg_t ggml_backend_tinyblas_reg(void);


#ifdef __cplusplus
}
#endif
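The header's only entry point is the registry hook, matching the pattern of the other backends registered in `ggml-backend-reg.cpp` below. As a minimal sketch of how a client could observe the result, the loop below enumerates the registry through the existing `ggml-backend.h` API; the loop itself is illustrative and not part of this PR:

```cpp
// Sketch: list every registered backend. In a build configured with
// -DGGML_TINYBLAS=ON, the entry added by ggml_backend_tinyblas_reg()
// should show up alongside CPU, BLAS, etc.
#include <cstdio>
#include "ggml-backend.h"

int main() {
    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
        ggml_backend_reg_t reg = ggml_backend_reg_get(i);
        // each registry entry reports a name and the devices it exposes
        printf("backend: %s (%zu devices)\n",
               ggml_backend_reg_name(reg),
               ggml_backend_reg_dev_count(reg));
    }
    return 0;
}
```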
1 change: 1 addition & 0 deletions ggml/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@ ggml_add_backend(Kompute)
ggml_add_backend(METAL)
ggml_add_backend(RPC)
ggml_add_backend(SYCL)
ggml_add_backend(TINYBLAS)
ggml_add_backend(Vulkan)
ggml_add_backend(MUSA)

Expand Down
15 changes: 12 additions & 3 deletions ggml/src/ggml-backend-reg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@
#include "ggml-blas.h"
#endif

#ifdef GGML_USE_TINYBLAS
#include "ggml-tinyblas.h"
#endif

#ifdef GGML_USE_RPC
#include "ggml-rpc.h"
#endif
Expand Down Expand Up @@ -66,6 +70,9 @@ struct ggml_backend_registry {
#ifdef GGML_USE_BLAS
register_backend(ggml_backend_blas_reg());
#endif
#ifdef GGML_USE_TINYBLAS
register_backend(ggml_backend_tinyblas_reg());
#endif
#ifdef GGML_USE_RPC
register_backend(ggml_backend_rpc_reg());
#endif
Expand All @@ -84,10 +91,12 @@ struct ggml_backend_registry {
return;
}

#ifndef NDEBUG
GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
GGML_LOG_INFO("%s: registered backend %s (%zu devices)\n",
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
#endif
//#ifndef NDEBUG
// GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
// __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
//#endif
backends.push_back(reg);
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
register_device(ggml_backend_reg_dev_get(reg, i));
Expand Down
58 changes: 41 additions & 17 deletions ggml/src/ggml-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,20 @@
typedef uint16_t ggml_half;
typedef uint32_t ggml_half2;

#define GGML_COMMON_AGGR
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S

#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_CPP)
#include <cstdint>

typedef uint16_t ggml_half;
typedef uint32_t ggml_half2;

// std-c++ allow anonymous unions but some compiler warn on it
#define GGML_COMMON_AGGR_U data
// std-c++ do not allow it.
#define GGML_COMMON_AGGR_S data

#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_METAL)
Expand All @@ -15,7 +28,8 @@ typedef uint32_t ggml_half2;
typedef half ggml_half;
typedef half2 ggml_half2;

#define GGML_COMMON_AGGR
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S

#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_CUDA)
Expand All @@ -29,7 +43,8 @@ typedef half2 ggml_half2;
typedef half ggml_half;
typedef half2 ggml_half2;

#define GGML_COMMON_AGGR data
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S data

#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_HIP)
Expand All @@ -39,7 +54,8 @@ typedef half2 ggml_half2;
typedef half ggml_half;
typedef half2 ggml_half2;

#define GGML_COMMON_AGGR data
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S data

#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_SYCL)
Expand All @@ -49,7 +65,8 @@ typedef half2 ggml_half2;
typedef sycl::half ggml_half;
typedef sycl::half2 ggml_half2;

#define GGML_COMMON_AGGR data
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S data

#define GGML_COMMON_DECL
#endif
Expand Down Expand Up @@ -154,9 +171,9 @@ typedef struct {
struct {
ggml_half d; // delta
ggml_half m; // min
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1;
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
Expand All @@ -175,9 +192,9 @@ typedef struct {
struct {
ggml_half d; // delta
ggml_half m; // min
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_1 / 2]; // nibbles / quants
} block_q5_1;
Expand All @@ -196,9 +213,9 @@ typedef struct {
struct {
ggml_half d; // delta
ggml_half s; // d * sum(qs[i])
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 ds;
};
} GGML_COMMON_AGGR_U;
int8_t qs[QK8_1]; // quants
} block_q8_1;
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
Expand Down Expand Up @@ -261,9 +278,9 @@ typedef struct {
struct {
ggml_half d; // super-block scale for quantized scales
ggml_half dmin; // super-block scale for quantized mins
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
} block_q2_K;
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");

Expand All @@ -288,9 +305,9 @@ typedef struct {
struct {
ggml_half d; // super-block scale for quantized scales
ggml_half dmin; // super-block scale for quantized mins
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4--bit quants
} block_q4_K;
Expand All @@ -305,9 +322,9 @@ typedef struct {
struct {
ggml_half d; // super-block scale for quantized scales
ggml_half dmin; // super-block scale for quantized mins
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
Expand Down Expand Up @@ -431,6 +448,13 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
#define GGML_TABLE_END() };

#define GGML_COMMON_IMPL
#elif defined(GGML_COMMON_IMPL_CPP)
#include <cstdint>

#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
#define GGML_TABLE_END() };

#define GGML_COMMON_IMPL
#elif defined(GGML_COMMON_IMPL_METAL)
#include <metal_stdlib>
Expand Down
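The `_U`/`_S` split exists because ISO C++ allows anonymous unions but not anonymous structs, while C11 and the GPU dialects accept both or can name just the struct. A minimal sketch of what the C++ expansion looks like, using a stand-in `block_example` rather than a real block type:

```cpp
// Under GGML_COMMON_DECL_CPP both GGML_COMMON_AGGR_U and GGML_COMMON_AGGR_S
// expand to the member name `data`, so nothing is left anonymous.
#include <cstdint>

typedef uint16_t ggml_half;
typedef uint32_t ggml_half2;

typedef struct {
    union {
        struct {
            ggml_half d; // delta
            ggml_half m; // min
        } data;          // GGML_COMMON_AGGR_S: anonymous structs are non-standard C++
        ggml_half2 dm;
    } data;              // GGML_COMMON_AGGR_U: named to avoid anonymous-union warnings
    uint8_t qs[8];       // stand-in for the quant payload
} block_example;

// C sources keep writing blk.d and blk.dm (both macros expand to nothing there),
// while C++ sources reach the same storage as blk.data.data.d and blk.data.dm.
static_assert(sizeof(block_example) == 2 * sizeof(ggml_half) + 8,
              "wrong example block size/padding");
```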
10 changes: 0 additions & 10 deletions ggml/src/ggml-cpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,16 +44,6 @@ if (GGML_OPENMP)
endif()
endif()

if (GGML_LLAMAFILE)
message(STATUS "Using llamafile")

add_compile_definitions(GGML_USE_LLAMAFILE)

target_sources(ggml-cpu PRIVATE
llamafile/sgemm.cpp
llamafile/sgemm.h)
endif()

if (GGML_CPU_HBM)
find_library(memkind memkind REQUIRED)

Expand Down
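With this block gone, the CPU backend no longer builds `llamafile/sgemm.cpp` itself; as the Makefile hunk earlier in the diff shows, the SGEMM sources now live under `ggml/src/ggml-tinyblas/` and are compiled as C++17 objects for the standalone tinyblas backend that `ggml-backend-reg.cpp` registers.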