refactor add new buffer type for online flow
chaxu01 committed Nov 6, 2024
1 parent 639949f commit e44a529
Showing 14 changed files with 328 additions and 912 deletions.
5 changes: 5 additions & 0 deletions Makefile
@@ -874,6 +874,11 @@ ggml/src/ggml-cuda/%.o: \
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
 endif # GGML_HIPBLAS
 
+ifdef GGML_CPU_AARCH64
+	MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
+	MK_CFLAGS   += -DGGML_USE_CPU_AARCH64
+endif
+
 ifdef GGML_METAL
 	MK_CPPFLAGS += -DGGML_USE_METAL
 	MK_LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit
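With this block in place, a Make build can opt in by passing GGML_CPU_AARCH64=1 on the make command line (standard Make ifdef behavior), which defines GGML_USE_CPU_AARCH64 for both the C and C++ translation units.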
7 changes: 0 additions & 7 deletions common/arg.cpp
@@ -2047,13 +2047,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             common_log_set_timestamps(common_log_main(), true);
         }
     ).set_env("LLAMA_LOG_TIMESTAMPS"));
-    add_opt(common_arg(
-        {"-rtrp", "--runtime-repack"},
-        string_format("Allow runtime requantization and repacking of Q4_0 to enable optimized GEMM and GEMV kernels (default: %d)", params.runtime_repack),
-        [](common_params & params) {
-            params.runtime_repack = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
 
     return ctx_arg;
 }
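With Q4_0 repacking now driven by the new buffer type this commit introduces, the manual -rtrp/--runtime-repack flag is deleted here, and the matching runtime_repack plumbing is stripped from common/common.cpp and common/common.h below.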
3 changes: 1 addition & 2 deletions common/common.cpp
@@ -983,7 +983,7 @@ struct llama_model_params common_model_params_to_llama(const common_params & par
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
-    mparams.use_mmap = params.use_mmap && !params.runtime_repack;
+    mparams.use_mmap = params.use_mmap;
     mparams.use_mlock = params.use_mlock;
     mparams.check_tensors = params.check_tensors;
     if (params.kv_overrides.empty()) {
@@ -1053,7 +1053,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.offload_kqv = !params.no_kv_offload;
     cparams.flash_attn = params.flash_attn;
     cparams.no_perf = params.no_perf;
-    cparams.runtime_repack = params.runtime_repack;
 
     if (params.reranking) {
         cparams.embeddings = true;
2 changes: 0 additions & 2 deletions common/common.h
@@ -271,8 +271,6 @@ struct common_params {
     bool warmup = true; // warmup run
     bool check_tensors = false; // validate tensor data
 
-    bool runtime_repack = false; // runtime repack weight for optimized kernels
-
     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V
 
196 changes: 84 additions & 112 deletions examples/llama-bench/llama-bench.cpp

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions ggml/CMakeLists.txt
@@ -92,6 +92,7 @@ else()
 endif()
 
 option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
+option(GGML_CPU_AARCH64 "ggml: use runtime weight quantization to enable optimized GEMM/GEMV kernels for AARCH64 cpu" OFF)
 
 option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
 option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
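The corresponding CMake toggle is -DGGML_CPU_AARCH64=ON at configure time; the compile definition itself is added in ggml/src/CMakeLists.txt below.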
14 changes: 1 addition & 13 deletions ggml/include/ggml-backend.h
@@ -305,19 +305,7 @@ extern "C" {
     GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
     GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
 
-    //
-    // CPU backend
-    //
-
-    GGML_API ggml_backend_t ggml_backend_cpu_init(void);
-
-    GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
-    GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
-    GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
-    GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
-    GGML_API void ggml_backend_cpu_set_runtime_repack(ggml_backend_t backend_cpu, bool runtime_repack);
-
-    // Create a backend buffer from an existing pointer
+    // CPU buffer types are always available
     GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
     GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
 
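These CPU-specific declarations leave the generic backend header; the CPU backend API presumably lives in the dedicated header ggml/include/ggml-cpu.h (next file), which is also where the new AARCH64 buffer type is declared. ggml_backend_cpu_set_runtime_repack is dropped outright now that repacking is tied to a buffer type rather than a backend setting.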
4 changes: 4 additions & 0 deletions ggml/include/ggml-cpu.h
@@ -145,6 +145,10 @@ extern "C" {
     GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
 #endif
 
+#ifdef GGML_USE_CPU_AARCH64
+    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
+#endif
+
 #ifdef __cplusplus
 }
 #endif
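As a rough illustration of how a host application might consume this new buffer type, a minimal sketch follows; pick_cpu_weight_buft is a hypothetical helper for this example, not part of the ggml API:

#include "ggml-backend.h"
#include "ggml-cpu.h"

// Hypothetical helper: prefer the AARCH64 repacking buffer type for weight
// tensors when the library was built with GGML_CPU_AARCH64; otherwise fall
// back to the plain CPU buffer type. Both declarations appear in the diffs
// above.
static ggml_backend_buffer_type_t pick_cpu_weight_buft(void) {
#ifdef GGML_USE_CPU_AARCH64
    return ggml_backend_cpu_aarch64_buffer_type();
#else
    return ggml_backend_cpu_buffer_type();
#endif
}

Allocating weights in the AARCH64 buffer type is what lets the repack happen "online" at load time instead of via the removed --runtime-repack flag.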
6 changes: 6 additions & 0 deletions ggml/src/CMakeLists.txt
@@ -880,6 +880,12 @@ if (GGML_CPU_HBM)
     target_link_libraries(ggml PUBLIC memkind)
 endif()
 
+if (GGML_CPU_AARCH64)
+    message(STATUS "Using runtime weight quantization to enable optimized GEMM/GEMV kernels for AARCH64 cpu")
+
+    add_compile_definitions(GGML_USE_CPU_AARCH64)
+endif()
+
 if (GGML_CANN)
     if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOME})
         set(CANN_INSTALL_DIR $ENV{ASCEND_TOOLKIT_HOME})
112 changes: 49 additions & 63 deletions ggml/src/ggml-aarch64.c
@@ -3477,101 +3477,87 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     }
 }
 
-static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor *t, int interleave_block, uint8_t **pmem, size_t *psize) {
+#ifdef GGML_USE_CPU_AARCH64
+static void repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
     GGML_ASSERT(t->ne[0] % 8 == 0);
     GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
 
-    // Do in-place transformation. Allocate scratch buffer
-    size_t size = sizeof(block_q4_0x4) * t->ne[0] / QK4_0;
-    if (size > *psize) {
-        uint8_t *new_mem = realloc(*pmem, size);
-        if (!new_mem) {
-            return -1;
-        }
-        *pmem = new_mem;
-        *psize = size;
-    }
-    block_q4_0x4 *dst = (block_q4_0x4*) *pmem;
-    block_q4_0 *src = (block_q4_0*) t->data;
+    block_q4_0x4 *dst = (block_q4_0x4 *)t->data;
+    const block_q4_0 *src = (const block_q4_0 *)data;
     block_q4_0 dst_tmp[4];
-    int n = t->ne[0];
     int nrow = t->ne[1]; // Number of rows
    int nrows_interleaved = 4;
     int nblocks = t->ne[0] / QK4_0;
-    for (int b = 0; b < (nrow * n); b += nrows_interleaved * n) {
-        int cnt = 0;
-        for (int64_t x = 0; x < nblocks; x++) {
-            for (int i = 0; i < nrows_interleaved; i++ ) {
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
+        for (int64_t x = 0; x < nblocks; x++)
+        {
+            for (int i = 0; i < nrows_interleaved; i++) {
                 dst_tmp[i] = src[x + i * nblocks];
             }
-            dst[cnt++] = make_block_q4_0x4(dst_tmp, interleave_block, 0x88);
+            *dst++ = make_block_q4_0x4(dst_tmp, interleave_block, 0x88);
        }
-        memcpy(src, dst, size);
-        src += cnt * 4;
+        src += nrows_interleaved * nblocks;
     }
-    return 0;
+
+    GGML_UNUSED(data_size);
 }
 
-static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block, uint8_t **pmem, size_t *psize) {
+static void repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block, const void * data, size_t data_size) {
     GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
     GGML_ASSERT(t->ne[0] % 8 == 0);
     GGML_ASSERT(interleave_block == 8);
 
-    // Do in-place transformation. Allocate scratch buffer
-    size_t size = sizeof(block_q4_0x8) * t->ne[0] / QK4_0;
-    if (size > *psize) {
-        uint8_t *new_mem = realloc(*pmem, size);
-        if (!new_mem) {
-            return -1;
-        }
-        *pmem = new_mem;
-        *psize = size;
-    }
-    block_q4_0x8 *dst = (block_q4_0x8*) *pmem;
-    block_q4_0 *src = (block_q4_0*) t->data;
+    block_q4_0x8 *dst = (block_q4_0x8*)t->data;
+    const block_q4_0 *src = (const block_q4_0*) data;
     block_q4_0 dst_tmp[8];
-    int n = t->ne[0];
     int nrow = t->ne[1]; // Number of rows
     int nrows_interleaved = 8;
     int nblocks = t->ne[0] / QK4_0;
-    for (int b = 0; b < (nrow * n); b += nrows_interleaved * n) {
-        int cnt = 0;
+
+    GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
+
+    for (int b = 0; b < nrow; b += nrows_interleaved) {
         for (int64_t x = 0; x < nblocks; x++) {
             for (int i = 0; i < nrows_interleaved; i++ ) {
                 dst_tmp[i] = src[x + i * nblocks];
             }
-            dst[cnt++] = make_block_q4_0x8(dst_tmp, interleave_block, 0x88);
+            *dst++ = make_block_q4_0x8(dst_tmp, interleave_block, 0x88);
         }
-        memcpy(src, dst, size);
-        src += cnt * 4;
+        src += nrows_interleaved * nblocks;
     }
-    return 0;
+
+    GGML_UNUSED(data_size);
 }
 
 // Prepare for optimized kernels if applicable
-void ggml_prepare_optimal_kernel(struct ggml_tensor *cur, uint8_t **pmem, size_t *psize) {
-    UNUSED(cur);
-    UNUSED(pmem);
-    UNUSED(psize);
-
+int ggml_prepare_optimal_kernel(struct ggml_tensor * cur, const void * data, size_t data_size) {
+    GGML_ASSERT(cur->type == GGML_TYPE_Q4_0);
+    int ret = -1;
 #if defined(__ARM_ARCH)
-    if (cur->type == GGML_TYPE_Q4_0) {
-        if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
-            if (repack_q4_0_to_q4_0_8_bl(cur, 8, pmem, psize) == 0) {
-                cur->type = GGML_TYPE_Q4_0_8_8;
-            }
-        }
-        else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-            if (repack_q4_0_to_q4_0_4_bl(cur, 8, pmem, psize) == 0) {
-                cur->type = GGML_TYPE_Q4_0_4_8;
-            }
-        }
-        else if (ggml_cpu_has_neon()) {
-            if (repack_q4_0_to_q4_0_4_bl(cur, 4, pmem, psize) == 0) {
-                cur->type = GGML_TYPE_Q4_0_4_4;
-            }
-        }
+    if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
+        repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
+        cur->type = GGML_TYPE_Q4_0_8_8;
+        ret = 0;
+    }
+    else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
+        repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
+        cur->type = GGML_TYPE_Q4_0_4_8;
+        ret = 0;
+    }
+    else if (ggml_cpu_has_neon()) {
+        repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
+        cur->type = GGML_TYPE_Q4_0_4_4;
+        ret = 0;
     }
 #endif
+    return ret;
+
+    GGML_UNUSED(cur);
+    GGML_UNUSED(data);
+    GGML_UNUSED(data_size);
 }
+#endif
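For intuition, here is a small self-contained C sketch of the gather pattern the new repack functions perform (illustrative names, with int stand-ins for the real block_q4_0 structs): rows are processed in groups of nrows_interleaved, and within a group, block x of every row in the group is copied out consecutively before moving on to block x+1.

#include <stdio.h>

#define NROWS_INTERLEAVED 4

// Mirror of the loop structure in repack_q4_0_to_q4_0_4_bl above, minus the
// quantized-block packing: gather block x of rows b..b+3 into adjacent slots.
static void interleave_rows(const int *src, int *dst, int nrow, int nblocks) {
    for (int b = 0; b < nrow; b += NROWS_INTERLEAVED) {
        for (int x = 0; x < nblocks; x++) {
            for (int i = 0; i < NROWS_INTERLEAVED; i++) {
                *dst++ = src[x + i * nblocks];  // block x of row b+i
            }
        }
        src += NROWS_INTERLEAVED * nblocks;  // advance to the next row group
    }
}

int main(void) {
    // 4 rows x 3 blocks; each value encodes row*10 + block for readability.
    int src[12] = {  0,  1,  2,
                    10, 11, 12,
                    20, 21, 22,
                    30, 31, 32 };
    int dst[12];
    interleave_rows(src, dst, 4, 3);
    for (int i = 0; i < 12; i++) printf("%d ", dst[i]);
    printf("\n");  // prints: 0 10 20 30 1 11 21 31 2 12 22 32
    return 0;
}

The same pattern applies with NROWS_INTERLEAVED = 8 for the Q4_0_8_8 path.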
4 changes: 3 additions & 1 deletion ggml/src/ggml-aarch64.h
@@ -33,7 +33,9 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
 void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 
-void ggml_prepare_optimal_kernel(struct ggml_tensor *cur, uint8_t **pmem, size_t *psize);
+#ifdef GGML_USE_CPU_AARCH64
+int ggml_prepare_optimal_kernel(struct ggml_tensor * cur, const void * data, size_t data_size);
+#endif
 
 #ifdef __cplusplus
 }
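Note the revised contract: ggml_prepare_optimal_kernel now takes the source data and its size instead of a caller-managed scratch buffer, repacks directly into t->data, and returns 0 when the tensor was converted to one of the interleaved types (Q4_0_4_4, Q4_0_4_8, Q4_0_8_8) or -1 when no optimized layout applies, e.g. on non-ARM builds.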
(Diffs for the remaining changed files were not loaded.)