diff --git a/.gitignore b/.gitignore index 565866fd4bcdc..e479c61805a6a 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,7 @@ models/* /vdot /Pipfile +build-info.h arm_neon.h compile_commands.json diff --git a/CMakeLists.txt b/CMakeLists.txt index e4d37497ee199..42f9c6a000765 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -73,6 +73,41 @@ option(LLAMA_HIPBLAS "llama: use hipBLAS" option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) +# +# Build info header +# + +# Write header template to binary dir to keep source directory clean +file(WRITE "${CMAKE_BINARY_DIR}/BUILD_INFO.h.in" "\ +#ifndef BUILD_INFO_H\n\ +#define BUILD_INFO_H\n\ +\n\ +#define BUILD_NUMBER @BUILD_NUMBER@\n\ +#define BUILD_COMMIT \"@BUILD_COMMIT@\"\n\ +\n\ +#endif // BUILD_INFO_H\n\ +") + +# Generate initial build-info.h +include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake) + +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git") + # Add a custom target for build-info.h + add_custom_target(BUILD_INFO ALL DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h") + + # Add a custom command to rebuild build-info.h when .git/index changes + add_custom_command( + OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h" + COMMENT "Generating build details from Git" + COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake" + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/.git/index" + VERBATIM + ) +else() + message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.") +endif() + # # Compile flags # @@ -288,9 +323,22 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES # TODO: arm msvc? else() if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") + # Apple M1, M2, etc. + # Raspberry Pi 3, 4, Zero 2 (64-bit) add_compile_options(-mcpu=native) endif() - # TODO: armv6,7,8 version specific flags + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6") + # Raspberry Pi 1, Zero + add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access) + endif() + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") + # Raspberry Pi 2 + add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations) + endif() + if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8") + # Raspberry Pi 3, 4, Zero 2 (32-bit) + add_compile_options(-mfp16-format=ieee -mno-unaligned-access) + endif() endif() elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$") message(STATUS "x86 detected") diff --git a/Makefile b/Makefile index 8c9721d743fcd..9f3113e0be5bb 100644 --- a/Makefile +++ b/Makefile @@ -148,19 +148,21 @@ ifdef LLAMA_PERF CXXFLAGS += -DGGML_PERF endif ifneq ($(filter aarch64%,$(UNAME_M)),) + # Apple M1, M2, etc. 
+ # Raspberry Pi 3, 4, Zero 2 (64-bit) CFLAGS += -mcpu=native CXXFLAGS += -mcpu=native endif ifneq ($(filter armv6%,$(UNAME_M)),) - # Raspberry Pi 1, 2, 3 + # Raspberry Pi 1, Zero CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access endif ifneq ($(filter armv7%,$(UNAME_M)),) - # Raspberry Pi 4 + # Raspberry Pi 2 CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations endif ifneq ($(filter armv8%,$(UNAME_M)),) - # Raspberry Pi 4 + # Raspberry Pi 3, 4, Zero 2 (32-bit) CFLAGS += -mfp16-format=ieee -mno-unaligned-access endif @@ -192,41 +194,56 @@ llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h common.o: examples/common.cpp examples/common.h $(CXX) $(CXXFLAGS) -c $< -o $@ +libllama.so: llama.o ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) + clean: - rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult + rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state build-info.h -main: examples/main/main.cpp ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +# +# Examples +# + +main: examples/main/main.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) @echo @echo '==== Run ./main -h for help. ====' @echo -quantize: examples/quantize/quantize.cpp ggml.o llama.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -libllama.so: llama.o ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) +build-info.h: $(wildcard .git/index) scripts/build-info.sh + @scripts/build-info.sh > $@.tmp + @if ! 
cmp -s $@.tmp $@; then \ + mv $@.tmp $@; \ + else \ + rm $@.tmp; \ + fi # # Tests # -benchmark-matmult: examples/benchmark/benchmark-matmult.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) ./$@ +vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) + .PHONY: tests tests: bash ./tests/run-tests.sh diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt index 05deebcd10c79..0376961945ad7 100644 --- a/examples/benchmark/CMakeLists.txt +++ b/examples/benchmark/CMakeLists.txt @@ -2,3 +2,6 @@ set(TARGET benchmark) add_executable(${TARGET} benchmark-matmult.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index 19cbab1c38825..2cc1a1477762c 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -1,5 +1,6 @@ #include #include "ggml.h" +#include "build-info.h" #include #include #include @@ -90,9 +91,10 @@ int main(int argc, char ** argv) { } } - // create the ggml context + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); printf("Starting Test\n"); + // create the ggml context struct ggml_context * ctx; //const int sizex = 4096; //const int sizey = 11008; diff --git a/examples/common.cpp b/examples/common.cpp index 6c712c713db9b..ad7b0bba32f1f 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -1,13 +1,18 @@ #include "common.h" #include +#include #include #include #include #include #include #include -#include + +#if defined(__APPLE__) && defined(__MACH__) +#include +#include +#endif #if defined (_WIN32) #include @@ -25,19 +30,43 @@ extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int #define CP_UTF8 65001 #endif -bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { - // determine sensible default number of threads. - // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0. 
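// A minimal sketch, assuming it mirrors the fallback in get_num_physical_cores() below
// (illustration only, not part of the patch): when no physical-core count can be read,
// the default becomes hardware_concurrency(), halved once it exceeds 4, or 4 if unknown.
#include <cstdint>
#include <thread>

static int32_t fallback_thread_count() {
    const unsigned int n = std::thread::hardware_concurrency(); // may be 0 when unknown
    // e.g. n = 16 -> 8 threads, n = 4 -> 4 threads, n = 0 -> 4 threads
    return n > 0 ? (n <= 4 ? (int32_t) n : (int32_t) (n / 2)) : 4;
}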
+int32_t get_num_physical_cores() { #ifdef __linux__ std::ifstream cpuinfo("/proc/cpuinfo"); - params.n_threads = std::count(std::istream_iterator(cpuinfo), - std::istream_iterator(), - std::string("processor")); -#endif - if (params.n_threads == 0) { - params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency()); + std::string line; + while (std::getline(cpuinfo, line)) { + std::size_t pos = line.find("cpu cores"); + if (pos != std::string::npos) { + pos = line.find(": ", pos); + if (pos != std::string::npos) { + try { + // Extract the number and return it + return static_cast(std::stoul(line.substr(pos + 2))); + } catch (const std::invalid_argument &) { + // Ignore if we could not parse + } + } + } + } +#elif defined(__APPLE__) && defined(__MACH__) + int32_t num_physical_cores; + size_t len = sizeof(num_physical_cores); + int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0); + if (result == 0) { + return num_physical_cores; + } + result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0); + if (result == 0) { + return num_physical_cores; } +#elif defined(_WIN32) + //TODO: Implement +#endif + unsigned int n_threads = std::thread::hardware_concurrency(); + return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4; +} +bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { bool invalid_param = false; std::string arg; gpt_params default_params; diff --git a/examples/common.h b/examples/common.h index fce1d42a9da70..627696e30a4f6 100644 --- a/examples/common.h +++ b/examples/common.h @@ -13,11 +13,12 @@ // // CLI argument parsing // +int32_t get_num_physical_cores(); struct gpt_params { int32_t seed = -1; // RNG seed - int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); - int32_t n_predict = -1; // new tokens to predict + int32_t n_threads = get_num_physical_cores(); + int32_t n_predict = -1; // new tokens to predict int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions) int32_t n_ctx = 512; // context size int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt index 88c425d4a1fd1..db73b6b44f07f 100644 --- a/examples/embedding/CMakeLists.txt +++ b/examples/embedding/CMakeLists.txt @@ -2,3 +2,6 @@ set(TARGET embedding) add_executable(${TARGET} embedding.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index e10de619c9d5f..b3e001476ea95 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -1,5 +1,6 @@ #include "common.h" #include "llama.h" +#include "build-info.h" #include @@ -18,11 +19,13 @@ int main(int argc, char ** argv) { "expect poor results\n", __func__, params.n_ctx); } + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + if (params.seed <= 0) { params.seed = time(NULL); } - fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); + fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); std::mt19937 rng(params.seed); if (params.random_prompt) { diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt index b2dcc2910f333..c364242fbadb4 100644 --- a/examples/main/CMakeLists.txt +++ 
b/examples/main/CMakeLists.txt @@ -2,3 +2,6 @@ set(TARGET main) add_executable(${TARGET} main.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 990d0fa023c63..7dc1005123775 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -5,6 +5,7 @@ #include "common.h" #include "llama.h" +#include "build-info.h" #include #include @@ -81,11 +82,13 @@ int main(int argc, char ** argv) { "expect poor results\n", __func__, params.n_ctx); } + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + if (params.seed <= 0) { params.seed = time(NULL); } - fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); + fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); std::mt19937 rng(params.seed); if (params.random_prompt) { @@ -161,23 +164,22 @@ int main(int argc, char ** argv) { std::vector session_tokens; if (!path_session.empty()) { - fprintf(stderr, "%s: attempting to load saved session from %s..\n", __func__, path_session.c_str()); + fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); - // REVIEW - fopen to check for existing session + // fopen to check for existing session FILE * fp = std::fopen(path_session.c_str(), "rb"); if (fp != NULL) { std::fclose(fp); session_tokens.resize(params.n_ctx); size_t n_token_count_out = 0; - const size_t n_session_bytes = llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out); + if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { + fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); + return 1; + } session_tokens.resize(n_token_count_out); - if (n_session_bytes > 0) { - fprintf(stderr, "%s: loaded %zu bytes of session data!\n", __func__, n_session_bytes); - } else { - fprintf(stderr, "%s: could not load session file, will recreate\n", __func__); - } + fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size()); } else { fprintf(stderr, "%s: session file does not exist, will create\n", __func__); } @@ -214,7 +216,7 @@ int main(int argc, char ** argv) { } // number of tokens to keep when resetting context - if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) { + if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) { params.n_keep = (int)embd_inp.size(); } @@ -329,7 +331,7 @@ int main(int argc, char ** argv) { // insert n_left/2 tokens at the start of embd from last_n_tokens embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size()); - // REVIEW - stop saving session if we run out of context + // stop saving session if we run out of context path_session = ""; //printf("\n---\n"); @@ -355,6 +357,7 @@ int main(int argc, char ** argv) { n_session_consumed++; if (n_session_consumed >= (int) session_tokens.size()) { + ++i; break; } } diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt index 5836df8b27752..61b17b828dd1b 100644 --- a/examples/perplexity/CMakeLists.txt +++ b/examples/perplexity/CMakeLists.txt @@ -2,3 +2,6 @@ set(TARGET perplexity) add_executable(${TARGET} 
perplexity.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 615157e7b68ec..2ca3388355bd0 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1,5 +1,6 @@ #include "common.h" #include "llama.h" +#include "build-info.h" #include #include @@ -106,11 +107,13 @@ int main(int argc, char ** argv) { "expect poor results\n", __func__, params.n_ctx); } + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + if (params.seed <= 0) { params.seed = time(NULL); } - fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); + fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); std::mt19937 rng(params.seed); if (params.random_prompt) { diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 4e6c2c8314661..9a2aa7c6474fb 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -1,4 +1,5 @@ #include "ggml.h" +#include "build-info.h" #define LLAMA_API_INTERNAL #include "llama.h" @@ -308,6 +309,8 @@ int main(int argc, char ** argv) { return 1; } + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + // load the model fprintf(stderr, "Loading model\n"); diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt index fb27d45171c7a..475fc8be885a6 100644 --- a/examples/quantize/CMakeLists.txt +++ b/examples/quantize/CMakeLists.txt @@ -2,3 +2,6 @@ set(TARGET quantize) add_executable(${TARGET} quantize.cpp) target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index dd175c690232e..198bd5fcb4cf6 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -1,5 +1,6 @@ #include "ggml.h" #include "llama.h" +#include "build-info.h" #include #include @@ -50,6 +51,8 @@ int main(int argc, char ** argv) { ftype = (enum llama_ftype)atoi(argv[3]); } + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + int nthread = argc > 4 ? 
atoi(argv[4]) : 0; const int64_t t_main_start_us = ggml_time_us(); diff --git a/examples/save-load-state/CMakeLists.txt b/examples/save-load-state/CMakeLists.txt index cff79fa1f3e17..08dbe5c2b3edf 100644 --- a/examples/save-load-state/CMakeLists.txt +++ b/examples/save-load-state/CMakeLists.txt @@ -2,3 +2,6 @@ set(TARGET save-load-state) add_executable(${TARGET} save-load-state.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index f1531ba39eb5e..ea0a984d93816 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -1,5 +1,6 @@ #include "common.h" #include "llama.h" +#include "build-info.h" #include #include @@ -17,6 +18,8 @@ int main(int argc, char ** argv) { return 1; } + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + if (params.n_predict < 0) { params.n_predict = 16; } diff --git a/ggml-cuda.cu b/ggml-cuda.cu index b9d60db010077..033c7d5c88ff0 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1,15 +1,44 @@ +#include +#include #include #include +#include + #if defined(GGML_USE_HIPBLAS) +#include "hip/hip_runtime.h" +#include "hipblas/hipblas.h" #include "hip/hip_fp16.h" #else +#include +#include #include #endif -#include -#include "ggml-cuda.h" -typedef uint16_t ggml_fp16_t; -static_assert(sizeof(__half) == sizeof(ggml_fp16_t), "wrong fp16 size"); +#include "ggml-cuda.h" +#include "ggml.h" + +static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); + +#define CUDA_CHECK(err) \ + do { \ + cudaError_t err_ = (err); \ + if (err_ != cudaSuccess) { \ + fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \ + cudaGetErrorString(err_)); \ + exit(1); \ + } \ + } while (0) + +#define CUBLAS_CHECK(err) \ + do { \ + cublasStatus_t err_ = (err); \ + if (err_ != CUBLAS_STATUS_SUCCESS) { \ + fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \ + exit(1); \ + } \ + } while (0) + +typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream); #define QK4_0 32 typedef struct { @@ -28,14 +57,14 @@ static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 b #define QK4_2 16 typedef struct { - __half d; // delta + half d; // delta uint8_t qs[QK4_2 / 2]; // nibbles / quants } block_q4_2; static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding"); #define QK5_0 32 typedef struct { - __half d; // delta + half d; // delta uint8_t qh[4]; // 5-th bit of quants uint8_t qs[QK5_0 / 2]; // nibbles / quants } block_q5_0; @@ -43,9 +72,9 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5 #define QK5_1 32 typedef struct { - __half d; // delta - __half m; // min - uint32_t qh; // 5-th bit of quants + half d; // delta + half m; // min + uint8_t qh[4]; // 5-th bit of quants uint8_t qs[QK5_1 / 2]; // nibbles / quants } block_q5_1; static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); @@ -166,7 +195,8 @@ static __global__ void dequantize_block_q5_1(const void * vx, float * y) { const uint8_t * pp = x[i].qs; - const uint32_t qh = x[i].qh; + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); for (int l = 0; l < QK5_1; l += 2) { const 
uint8_t vi = pp[l/2]; @@ -201,37 +231,50 @@ static __global__ void dequantize_block_q8_0(const void * vx, float * y) { } } -void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) { +static void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) { const int nb = k / QK4_0; dequantize_block_q4_0<<>>(vx, y); } -void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) { +static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) { const int nb = k / QK4_1; dequantize_block_q4_1<<>>(vx, y); } -void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) { +static void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) { const int nb = k / QK4_2; dequantize_block_q4_2<<>>(vx, y); } -void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) { +static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) { const int nb = k / QK5_0; dequantize_block_q5_0<<>>(vx, y); } -void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) { +static void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) { const int nb = k / QK5_1; dequantize_block_q5_1<<>>(vx, y); } -void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) { +static void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) { const int nb = k / QK8_0; dequantize_block_q8_0<<>>(vx, y); } -dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(ggml_type type) { +// TODO: optimize +static __global__ void convert_fp16_to_fp32(const void * vx, float * y) { + const half * x = (const half *) vx; + + const int i = blockIdx.x; + + y[i] = __half2float(x[i]); +} + +static void convert_fp16_to_fp32_cuda(const void * x, float * y, int k, cudaStream_t stream) { + convert_fp16_to_fp32<<>>(x, y); +} + +static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { switch (type) { case GGML_TYPE_Q4_0: return dequantize_row_q4_0_cuda; @@ -245,6 +288,8 @@ dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(ggml_type type) { return dequantize_row_q5_1_cuda; case GGML_TYPE_Q8_0: return dequantize_row_q8_0_cuda; + case GGML_TYPE_F16: + return convert_fp16_to_fp32_cuda; default: return nullptr; } @@ -275,7 +320,7 @@ struct cuda_buffer { static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS]; static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT; -void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) { +static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) { scoped_spin_lock lock(g_cuda_pool_lock); for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { @@ -294,7 +339,7 @@ void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) { return ptr; } -void ggml_cuda_pool_free(void * ptr, size_t size) { +static void ggml_cuda_pool_free(void * ptr, size_t size) { scoped_spin_lock lock(g_cuda_pool_lock); for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { @@ -309,28 +354,55 @@ void ggml_cuda_pool_free(void * ptr, size_t size) { CUDA_CHECK(cudaFree(ptr)); } -cublasHandle_t g_cublasH = nullptr; -cudaStream_t g_cudaStream = nullptr; -cudaStream_t g_cudaStream2 = nullptr; -cudaEvent_t g_cudaEvent = nullptr; +#define GGML_CUDA_MAX_STREAMS 8 +#define GGML_CUDA_MAX_EVENTS 64 +static cublasHandle_t g_cublasH = nullptr; +static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_STREAMS] = { nullptr }; +static cudaStream_t 
g_cudaStreams2[GGML_CUDA_MAX_STREAMS] = { nullptr }; +static cudaEvent_t g_cudaEvents[GGML_CUDA_MAX_EVENTS] = { nullptr }; void ggml_init_cublas() { if (g_cublasH == nullptr) { - // create cublas handle, bind a stream - CUBLAS_CHECK(cublasCreate(&g_cublasH)); - CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream, cudaStreamNonBlocking)); - CUBLAS_CHECK(cublasSetStream(g_cublasH, g_cudaStream)); + // create streams + for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) { + CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[i], cudaStreamNonBlocking)); + CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams2[i], cudaStreamNonBlocking)); + } + // create events + for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) { + CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents[i], cudaEventDisableTiming)); + } - // create additional stream and event for synchronization - CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream2, cudaStreamNonBlocking)); - CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvent, cudaEventDisableTiming)); + // create cublas handle + CUBLAS_CHECK(cublasCreate(&g_cublasH)); + CUBLAS_CHECK(cublasSetMathMode(g_cublasH, CUBLAS_TF32_TENSOR_OP_MATH)); // configure logging to stdout - // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL)); + // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr)); } } -cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream) { +void * ggml_cuda_host_malloc(size_t size) { + if (getenv("GGML_CUDA_NO_PINNED") != nullptr) { + return nullptr; + } + + void * ptr = nullptr; + cudaError_t err = cudaMallocHost((void **) &ptr, size); + if (err != cudaSuccess) { + fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n", + size/1024.0/1024.0, cudaGetErrorString(err)); + return nullptr; + } + + return ptr; +} + +void ggml_cuda_host_free(void * ptr) { + CUDA_CHECK(cudaFreeHost(ptr)); +} + +static cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream) { const uint64_t ne0 = src->ne[0]; const uint64_t ne1 = src->ne[1]; const uint64_t nb0 = src->nb[0]; @@ -358,12 +430,293 @@ cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, } } -void * ggml_cuda_host_malloc(size_t size) { - void * ptr; - CUDA_CHECK(cudaMallocHost((void **) &ptr, size)); - return ptr; +static void ggml_cuda_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const float alpha = 1.0f; + const float beta = 0.0f; + const int x_ne = ne01 * ne00; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + const int n_mm = ne03 * ne02; + + size_t x_size, y_size, d_size; + float * d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size); + float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size); + float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + int i = i03*ne02 + i02; + cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS]; + + float * c_X = d_X + i * x_ne; + float * c_Y = d_Y + i * y_ne; + float * c_D = d_D + i * d_ne; + + // copy 
data to device + CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X, src0, i03, i02, cudaStream)); + CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream)); + + // compute + CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream)); + CUBLAS_CHECK( + cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + &alpha, c_X, ne00, + c_Y, ne10, + &beta, c_D, ne01)); + + // copy dst to host + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream)); + } + } + + CUDA_CHECK(cudaDeviceSynchronize()); + ggml_cuda_pool_free(d_X, x_size); + ggml_cuda_pool_free(d_Y, y_size); + ggml_cuda_pool_free(d_D, d_size); } -void ggml_cuda_host_free(void * ptr) { - CUDA_CHECK(cudaFreeHost(ptr)); +static void ggml_cuda_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) { + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + + const int nb10 = src1->nb[0]; + const int nb11 = src1->nb[1]; + const int nb12 = src1->nb[2]; + const int nb13 = src1->nb[3]; + + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const float alpha = 1.0f; + const float beta = 0.0f; + const int x_ne = ne01 * ne00; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + const int n_mm = ne03 * ne02; + + size_t x_size, y_size, d_size; + half * d_X = (half *) ggml_cuda_pool_malloc(n_mm * sizeof(half) * x_ne, &x_size); + half * d_Y = (half *) ggml_cuda_pool_malloc(n_mm * sizeof(half) * y_ne, &y_size); + float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size); + + bool src1_cont_rows = nb10 == sizeof(float); + bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + int i = i03*ne02 + i02; + cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS]; + + half * c_X = d_X + i * x_ne; + half * c_Y = d_Y + i * y_ne; + float * c_D = d_D + i * d_ne; + + // copy src0 to device + CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X, src0, i03, i02, cudaStream)); + + // convert src1 to fp16 + // TODO: use multiple threads + ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02); + char * src1i = (char *) src1->data + i03*nb13 + i02*nb12; + if (src1_cont_rows) { + if (src1_cont_cols) { + ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11); + } + else { + for (int64_t i01 = 0; i01 < ne11; i01++) { + ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10); + } + } + } + else { + for (int64_t i01 = 0; i01 < ne11; i01++) { + for (int64_t i00 = 0; i00 < ne10; i00++) { + // very slow due to no inlining + tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10)); + } + } + } + + // copy src1 to device + CUDA_CHECK(cudaMemcpyAsync(c_Y, tmp, sizeof(half) * y_ne, cudaMemcpyHostToDevice, cudaStream)); + + // compute + CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream)); + CUBLAS_CHECK( + cublasGemmEx(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + &alpha, c_X, CUDA_R_16F, ne00, + c_Y, CUDA_R_16F, ne10, + &beta, c_D, CUDA_R_32F, ne01, + CUBLAS_COMPUTE_32F_FAST_16F, + CUBLAS_GEMM_DEFAULT)); + + // copy dst to host + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + CUDA_CHECK(cudaMemcpyAsync(d, 
c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream)); + } + } + + CUDA_CHECK(cudaDeviceSynchronize()); + ggml_cuda_pool_free(d_X, x_size); + ggml_cuda_pool_free(d_Y, y_size); + ggml_cuda_pool_free(d_D, d_size); +} + +static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + const ggml_type type = src0->type; + + const float alpha = 1.0f; + const float beta = 0.0f; + const int x_ne = ne01 * ne00; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + const int n_mm = ne03 * ne02; + const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type); + + size_t x_size, y_size, d_size, q_size; + float * d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size); + float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size); + float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size); + char * d_Q = (char *) ggml_cuda_pool_malloc(n_mm * q_sz, &q_size); + + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(type); + GGML_ASSERT(to_fp32_cuda != nullptr); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + int i = i03*ne02 + i02; + cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS]; + cudaStream_t cudaStream2 = g_cudaStreams2[i % GGML_CUDA_MAX_STREAMS]; + cudaEvent_t cudaEvent = g_cudaEvents[i % GGML_CUDA_MAX_EVENTS]; + + float * c_X = d_X + i * x_ne; + float * c_Y = d_Y + i * y_ne; + float * c_D = d_D + i * d_ne; + char * c_Q = d_Q + i * q_sz; + + // copy src0 and convert to fp32 on device + CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Q, src0, i03, i02, cudaStream2)); + to_fp32_cuda(c_Q, c_X, x_ne, cudaStream2); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2)); + + // copy src1 to device + CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream)); + + // wait for conversion + CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0)); + + // compute + CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream)); + CUBLAS_CHECK( + cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + &alpha, c_X, ne00, + c_Y, ne10, + &beta, c_D, ne01)); + + // copy dst to host + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream)); + } + } + + CUDA_CHECK(cudaDeviceSynchronize()); + ggml_cuda_pool_free(d_X, x_size); + ggml_cuda_pool_free(d_Y, y_size); + ggml_cuda_pool_free(d_D, d_size); + ggml_cuda_pool_free(d_Q, q_size); +} + +bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + // TODO: find the optimal values for these + if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && + src1->type == GGML_TYPE_F32 && + dst->type == GGML_TYPE_F32 && + (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { + + return true; + } + + return false; +} + +bool ggml_cuda_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) { + 
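    // Illustrative comparison (hypothetical sizes, not taken from the patch): for a fp16
    // src0 of 4096x4096 (32 MiB) and a fp32 src1 of 4096x512 (8 MiB), the quantized path
    // below would transfer 40 MiB while the fp16 path would transfer 36 MiB, since src1
    // shrinks to 4 MiB once converted to fp16, so the fp16 kernel is chosen despite the
    // CPU-side conversion cost.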
size_t src0_sz = ggml_nbytes(src0); + size_t src1_sz = ggml_nbytes(src1); + + // mul_mat_q: src0 is converted to fp32 on device + size_t mul_mat_q_transfer = src0_sz + src1_sz; + + // mul_mat_f16: src1 is converted to fp16 on cpu + size_t mul_mat_f16_transfer = src0_sz + sizeof(half) * ggml_nelements(src1); + + // choose the smaller one to transfer to the device + // TODO: this is not always the best choice due to the overhead of converting to fp16 + return mul_mat_f16_transfer < mul_mat_q_transfer; +} + +void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) { + GGML_ASSERT(ggml_cuda_can_mul_mat(src0, src1, dst)); + + if (src0->type == GGML_TYPE_F32) { + ggml_cuda_mul_mat_f32(src0, src1, dst); + } + else if (src0->type == GGML_TYPE_F16) { + if (ggml_cuda_mul_mat_use_f16(src0, src1, dst)) { + ggml_cuda_mul_mat_f16(src0, src1, dst, wdata, wsize); + } + else { + ggml_cuda_mul_mat_q_f32(src0, src1, dst); + } + } + else if (ggml_is_quantized(src0->type)) { + ggml_cuda_mul_mat_q_f32(src0, src1, dst); + } + else { + GGML_ASSERT(false); + } +} + +size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + if (ggml_cuda_mul_mat_use_f16(src0, src1, dst)) { + return ggml_nelements(src1) * sizeof(ggml_fp16_t); + } + else { + return 0; + } } diff --git a/ggml-cuda.h b/ggml-cuda.h index 778995d372fbe..0e740e30908bc 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -2,6 +2,7 @@ #include "hipblas/hipblas.h" #include "hip/hip_runtime.h" #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F #define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT #define CUBLAS_OP_N HIPBLAS_OP_N #define CUBLAS_OP_T HIPBLAS_OP_T @@ -49,49 +50,16 @@ extern "C" { #endif -#define CUDA_CHECK(err) \ - do { \ - cudaError_t err_ = (err); \ - if (err_ != cudaSuccess) { \ - fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \ - cudaGetErrorString(err_)); \ - exit(1); \ - } \ - } while (0) - -#define CUBLAS_CHECK(err) \ - do { \ - cublasStatus_t err_ = (err); \ - if (err_ != CUBLAS_STATUS_SUCCESS) { \ - fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \ - exit(1); \ - } \ - } while (0) +void ggml_init_cublas(void); -extern cublasHandle_t g_cublasH; -extern cudaStream_t g_cudaStream; -extern cudaStream_t g_cudaStream2; -extern cudaEvent_t g_cudaEvent; +bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); -void ggml_init_cublas(void); +// TODO: export these with GGML_API void * ggml_cuda_host_malloc(size_t size); void ggml_cuda_host_free(void * ptr); -void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size); -void ggml_cuda_pool_free(void * ptr, size_t size); - -void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream); -void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream); -void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream); -void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream); -void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t 
stream); -void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream); - -cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream); - -typedef void (*dequantize_row_q_cuda_t)(const void * x, float * y, int k, cudaStream_t stream); -dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(enum ggml_type type); - #ifdef __cplusplus } #endif diff --git a/ggml-opencl-dequant.cl b/ggml-opencl-dequant.cl deleted file mode 100644 index a65a79f4d6b58..0000000000000 --- a/ggml-opencl-dequant.cl +++ /dev/null @@ -1,63 +0,0 @@ -#define MULTILINE_QUOTE(...) #__VA_ARGS__ -const char * clblast_dequant = MULTILINE_QUOTE( - -struct block_q4_0 -{ - float d; - uchar qs[16]; -}; - -__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) { - const uint i = get_global_id(0) / 32; - const uint l = get_local_id(0); - - const float d = blocks[i].d; - - const uchar vi = blocks[i].qs[l]; - - const uint index = i*32 + l*2; - result[index + 0] = ((vi & 0xf) - 8)*d; - result[index + 1] = ((vi >> 4) - 8)*d; -} - -struct block_q4_1 -{ - float d; - float m; - uchar qs[16]; -}; - -__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) { - const uint i = get_global_id(0) / 32; - const uint l = get_local_id(0); - - const float d = blocks[i].d; - const float m = blocks[i].m; - - const uchar vi = blocks[i].qs[l]; - - const uint index = i*32 + l*2; - result[index + 0] = (vi & 0xf) * d + m; - result[index + 1] = (vi >> 4) * d + m; -} - -struct block_q4_2 -{ - ushort d; - uchar qs[8]; -}; - -__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) { - const uint i = get_global_id(0) / 16; - const uint l = get_local_id(0); - - const float d = vload_half(0, (__global half*) &blocks[i].d);; - - const uchar vi = blocks[i].qs[l]; - - const uint index = i*16 + l*2; - result[index + 0] = ((vi & 0xf) - 8)*d; - result[index + 1] = ((vi >> 4) - 8)*d; -} - -); diff --git a/ggml-opencl.c b/ggml-opencl.c index b748f86b7a37e..4389eca393466 100644 --- a/ggml-opencl.c +++ b/ggml-opencl.c @@ -3,12 +3,141 @@ #define CL_TARGET_OPENCL_VERSION 110 #include +#include #include #include #include "ggml.h" -#include "ggml-opencl-dequant.cl" +#define MULTILINE_QUOTE(...) 
#__VA_ARGS__ +const char * clblast_dequant = MULTILINE_QUOTE( + +struct block_q4_0 +{ + float d; + uchar qs[16]; +}; + +__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) { + const uint i = get_global_id(0) / 32; + const uint l = get_local_id(0); + + const float d = blocks[i].d; + + const uchar vi = blocks[i].qs[l]; + + const uint index = i*32 + l*2; + result[index + 0] = ((vi & 0xf) - 8)*d; + result[index + 1] = ((vi >> 4) - 8)*d; +} + +struct block_q4_1 +{ + float d; + float m; + uchar qs[16]; +}; + +__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) { + const uint i = get_global_id(0) / 32; + const uint l = get_local_id(0); + + const float d = blocks[i].d; + const float m = blocks[i].m; + + const uchar vi = blocks[i].qs[l]; + + const uint index = i*32 + l*2; + result[index + 0] = (vi & 0xf) * d + m; + result[index + 1] = (vi >> 4) * d + m; +} + +struct block_q4_2 +{ + ushort d; + uchar qs[8]; +}; + +__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) { + const uint i = get_global_id(0) / 16; + const uint l = get_local_id(0); + + const float d = vload_half(0, (__global half*) &blocks[i].d); + + const uchar vi = blocks[i].qs[l]; + + const uint index = i*16 + l*2; + result[index + 0] = ((vi & 0xf) - 8)*d; + result[index + 1] = ((vi >> 4) - 8)*d; +} + + +struct block_q5_0 +{ + float d; + uint qh; + uchar qs[16]; +}; + +__kernel void dequantize_row_q5_0(__global struct block_q5_0* blocks, __global float* result) { + const uint i = get_global_id(0) / 32; + const uint l = get_local_id(0); + + const float d = blocks[i].d; + + const uchar vi = blocks[i].qs[l]; + + const uint l2 = l * 2; + + const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4; + const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4; + + const uint index = i*32 + l2; + result[index + 0] = (((vi & 0xf) | vh0) - 16)*d; + result[index + 1] = (((vi >> 4) | vh1) - 16)*d; +} + +struct block_q5_1 +{ + ushort d; + ushort m; + uint qh; + uchar qs[16]; +}; + +__kernel void dequantize_row_q5_1(__global struct block_q5_1* blocks, __global float* result) { + const uint i = get_global_id(0) / 32; + const uint l = get_local_id(0); + + const float d = vload_half(0, (__global half*) &blocks[i].d); + const float m = vload_half(0, (__global half*) &blocks[i].m); + + const uchar vi = blocks[i].qs[l]; + + const uint l2 = l * 2; + + const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4; + const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4; + + const uint index = i*32 + l2; + result[index + 0] = ((vi & 0xf) | vh0)*d + m; + result[index + 1] = ((vi >> 4) | vh1)*d + m; +} + +struct block_q8_0 +{ + float d; + char qs[32]; +}; + +__kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global float* result) { + const uint i = get_global_id(0) / 32; + const uint l = get_local_id(0); + + result[i*32 + l] = blocks[i].qs[l] * blocks[i].d; +} + +); #define CL_CHECK(err, name) \ do { \ @@ -19,12 +148,26 @@ } \ } while (0) +#define QK5_0 32 +typedef struct { + ggml_fp16_t d; // delta + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_0 / 2]; // nibbles / quants +} block_q5_0; + + +typedef struct { + float d; // delta + uint32_t qh; // 5-th bit of quants + uint8_t qs[QK5_0 / 2]; // nibbles / quants +} cl_block_q5_0; + static cl_platform_id platform; static cl_device_id device; static cl_context context; static cl_command_queue queue; static cl_program 
program; -static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2; +static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q5_0, kernel_q5_1, kernel_q8_0; static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c; static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0; @@ -97,6 +240,12 @@ void ggml_cl_init(void) { CL_CHECK(err, "clCreateKernel"); kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err); CL_CHECK(err, "clCreateKernel"); + kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err); + CL_CHECK(err, "clCreateKernel"); + kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err); + CL_CHECK(err, "clCreateKernel"); + kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err); + CL_CHECK(err, "clCreateKernel"); } static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) { @@ -125,6 +274,7 @@ void ggml_cl_sgemm_wrapper( cl_kernel kernel; size_t global = n * k, local, size_qb; bool dequant; + cl_block_q5_0* cl_host_b; switch (btype) { case GGML_TYPE_F32: @@ -146,7 +296,36 @@ void ggml_cl_sgemm_wrapper( dequant = true; kernel = kernel_q4_2; local = 8; - size_qb = global * (sizeof(short) + local) / 16; + size_qb = global * (sizeof(ggml_fp16_t) + local) / 16; + break; + case GGML_TYPE_Q5_0: + dequant = true; + kernel = kernel_q5_0; + local = 16; + // For some reason OpenCL seems to be incapable of working with structs of size 22. + // 20 and 24 bytes are fine. Workaround to do the fp16 to fp32 step on CPU... + // TODO Find the reason, fix and remove workaround. + const block_q5_0* b = (const block_q5_0*) host_b; + cl_host_b = (cl_block_q5_0*) malloc(sizeof(cl_block_q5_0) * global / 32); + for (size_t i = 0; i < global / 32; i++) { + cl_host_b[i].d = ggml_fp16_to_fp32(b[i].d); + memcpy(&cl_host_b[i].qh, b[i].qh, sizeof(uint32_t)); + memcpy(&cl_host_b[i].qs, b[i].qs, QK5_0 / 2); + } + host_b = (const float*) cl_host_b; + size_qb = global * (sizeof(float) + sizeof(uint32_t) + local) / 32; + break; + case GGML_TYPE_Q5_1: + dequant = true; + kernel = kernel_q5_1; + local = 16; + size_qb = global * (sizeof(ggml_fp16_t) * 2 + sizeof(uint32_t) + local) / 32; + break; + case GGML_TYPE_Q8_0: + dequant = true; + kernel = kernel_q8_0; + local = 32; + size_qb = global * (sizeof(float) + local) / 32; break; default: fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype); @@ -171,12 +350,15 @@ void ggml_cl_sgemm_wrapper( err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b); CL_CHECK(err, "clSetKernelArg"); - clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb); + err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb); + CL_CHECK(err, "clEnqueueWriteBuffer qb"); } else { - clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b); + err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b); + CL_CHECK(err, "clEnqueueWriteBuffer b"); } - clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a); + err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a); + CL_CHECK(err, "clEnqueueWriteBuffer a"); if (dequant) { err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b); CL_CHECK(err, "clEnqueueNDRangeKernel"); @@ -188,15 +370,20 @@ void ggml_cl_sgemm_wrapper( clReleaseEvent(ev_b); cl_event 
ev_sgemm; - CLBlastSgemm((CLBlastLayout)order, - (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b, - m, n, k, - alpha, - cl_buffer_a, 0, lda, - cl_buffer_b, 0, ldb, - beta, - cl_buffer_c, 0, ldc, - &queue, &ev_sgemm); + CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order, + (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b, + m, n, k, + alpha, + cl_buffer_a, 0, lda, + cl_buffer_b, 0, ldb, + beta, + cl_buffer_c, 0, ldc, + &queue, &ev_sgemm); + + if (status != CLBlastSuccess) { + fprintf(stderr, "Error: CLBlast SGEMM %d\n", status); + abort(); + } cl_event ev_c; clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c); @@ -205,4 +392,7 @@ void ggml_cl_sgemm_wrapper( clWaitForEvents(1, &ev_c); clReleaseEvent(ev_sgemm); clReleaseEvent(ev_c); + if (btype == GGML_TYPE_Q5_0) { + free((void*) cl_host_b); + } } diff --git a/ggml.c b/ggml.c index d8882d6446f85..795839abcedcd 100644 --- a/ggml.c +++ b/ggml.c @@ -135,14 +135,6 @@ inline static void* ggml_aligned_malloc(size_t size) { #define UNUSED(x) (void)(x) #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0) -#define GGML_ASSERT(x) \ - do { \ - if (!(x)) { \ - fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ - abort(); \ - } \ - } while (0) - #if defined(GGML_USE_ACCELERATE) #include #elif defined(GGML_USE_OPENBLAS) @@ -330,7 +322,7 @@ static ggml_fp16_t table_exp_f16[1 << 16]; // precomputed f32 table for f16 (256 KB) static float table_f32_f16[1 << 16]; -#if defined(__ARM_NEON) +#if defined(__ARM_NEON) || defined(__wasm_simd128__) #define B1(c,s,n) 0x ## n ## c , 0x ## n ## s #define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s) #define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s) @@ -370,6 +362,32 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) { return GGML_FP32_TO_FP16(x); } +void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n) { + for (size_t i = 0; i < n; i++) { + y[i] = GGML_FP16_TO_FP32(x[i]); + } +} + +void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) { + size_t i = 0; +#if defined(__F16C__) + for (; i + 7 < n; i += 8) { + __m256 x_vec = _mm256_loadu_ps(x + i); + __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); + _mm_storeu_si128((__m128i *)(y + i), y_vec); + } + for(; i + 3 < n; i += 4) { + __m128 x_vec = _mm_loadu_ps(x + i); + __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); + _mm_storel_epi64((__m128i *)(y + i), y_vec); + } +#endif + for (; i < n; i++) { + y[i] = GGML_FP32_TO_FP16(x[i]); + } +} + + // // timing // @@ -1087,7 +1105,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int const v128_t v = wasm_f32x4_mul(srcv[l], wasm_f32x4_splat(id)); const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f)); const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf); - const v128_t vc = wasm_i32x4_min_u(vi, wasm_i32x4_splat(15)); + const v128_t vc = wasm_i32x4_min(vi, wasm_i32x4_splat(15)); y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vc, 0) | (wasm_i32x4_extract_lane(vc, 1) << 4); y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vc, 2) | (wasm_i32x4_extract_lane(vc, 3) << 4); @@ -1911,8 +1929,8 @@ static void dequantize_row_q5_0(const void * restrict vx, float * restrict y, in const uint8_t vi = pp[l/2]; // extract the 5-th bit from qh - const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4; - const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4; + const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4; + const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 
4; const int8_t vi0 = (vi & 0x0F) | vh0; const int8_t vi1 = (vi >> 4) | vh1; @@ -1948,8 +1966,8 @@ static void dequantize_row_q5_1(const void * restrict vx, float * restrict y, in const uint8_t vi = pp[l/2]; // extract the 5-th bit from qh - const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4; - const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4; + const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4; + const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4; const uint8_t vi0 = (vi & 0x0F) | vh0; const uint8_t vi1 = (vi >> 4) | vh1; @@ -3180,6 +3198,72 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * } *s = vaddvq_f32(sumv); +#elif defined(__wasm_simd128__) + v128_t sumv = wasm_f32x4_splat(0.0f); + + uint64_t tmp[4]; + + for (int i = 0; i < nb; ++i) { + const block_q5_0 * restrict x0 = &x[i]; + const block_q8_0 * restrict y0 = &y[i]; + + const v128_t m4b = wasm_i8x16_splat(0x0F); + const v128_t s16b = wasm_i8x16_splat(0x10); + + // extract the 5th bit + uint32_t qh; + memcpy(&qh, x0->qh, sizeof(qh)); + + tmp[0] = table_b2b_u[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_u[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_u[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_u[(qh >> 24) ]; + + const v128_t qhl = wasm_v128_load(tmp + 0); + const v128_t qhh = wasm_v128_load(tmp + 2); + + const v128_t v0 = wasm_v128_load(x0->qs); + + // 4-bit -> 8-bit + const v128_t v0l = wasm_v128_and (v0, m4b); + const v128_t v0h = wasm_u8x16_shr(v0, 4); + + // interleave + const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); + const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); + + // add high bit and sub 16 + const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0lz, qhl), s16b); + const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0hz, qhh), s16b); + + // load y + const v128_t v1l = wasm_v128_load(y0->qs); + const v128_t v1h = wasm_v128_load(y0->qs + 16); + + // int8x16 -> int16x8 + const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); + const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); + const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); + const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); + + const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); + const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); + const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); + const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); + + const float x0d = GGML_FP16_TO_FP32(x0->d); + + // dot product + sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( + wasm_i32x4_add( + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), + wasm_i32x4_dot_i16x8(v0lfh, v1lh)), + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), + wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d))); + } + + *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3); #elif defined(__AVX2__) // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); @@ -3220,8 +3304,8 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * for (int j = 0; j < QK8_0/2; j++) { const uint8_t v0 = x0[j]; - const int x0_0h = ((qh & (1 << (2*j + 0))) >> (2*j + 0)) << 4; - const int x1_0h = ((qh & (1 << (2*j + 1))) >> (2*j + 1)) << 4; + const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4; + const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4; 
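            // Note: the 1u literal keeps the shift well defined; j can reach 15 here, and
            // shifting a signed 1 left by 31 bits overflows int, which is undefined behaviour.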
const int x0_0 = ((v0 & 0x0F) | x0_0h) - 16; const int x1_0 = ((v0 >> 4) | x1_0h) - 16; @@ -3311,6 +3395,77 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * } *s = vaddvq_f32(sumv) + summs; +#elif defined(__wasm_simd128__) + v128_t sumv = wasm_f32x4_splat(0.0f); + + float summs = 0.0f; + + uint64_t tmp[4]; + + for (int i = 0; i < nb; ++i) { + const block_q5_1 * restrict x0 = &x[i]; + const block_q8_1 * restrict y0 = &y[i]; + + summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1); + + const v128_t m4b = wasm_i8x16_splat(0x0F); + + // extract the 5th bit + uint32_t qh; + memcpy(&qh, x0->qh, sizeof(qh)); + + tmp[0] = table_b2b_u[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_u[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_u[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_u[(qh >> 24) ]; + + const v128_t qhl = wasm_v128_load(tmp + 0); + const v128_t qhh = wasm_v128_load(tmp + 2); + + const v128_t v0 = wasm_v128_load(x0->qs); + + // 4-bit -> 8-bit + const v128_t v0l = wasm_v128_and (v0, m4b); + const v128_t v0h = wasm_u8x16_shr(v0, 4); + + static bool x = true; + + // interleave + const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); + const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); + + // add high bit + const v128_t v0lf = wasm_v128_or(v0lz, qhl); + const v128_t v0hf = wasm_v128_or(v0hz, qhh); + + // load y + const v128_t v1l = wasm_v128_load(y0->qs); + const v128_t v1h = wasm_v128_load(y0->qs + 16); + + // int8x16 -> int16x8 + const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); + const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); + const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); + const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); + + const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); + const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); + const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); + const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); + + const float x0d = GGML_FP16_TO_FP32(x0->d); + + // dot product + sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( + wasm_i32x4_add( + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), + wasm_i32x4_dot_i16x8(v0lfh, v1lh)), + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), + wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d))); + } + + *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs; #elif defined(__AVX2__) // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); @@ -3354,8 +3509,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * for (int j = 0; j < QK8_1/2; j++) { const uint8_t v0 = x0[j]; - const int x0_0h = ((qh & (1 << (2*j + 0))) >> (2*j + 0)) << 4; - const int x1_0h = ((qh & (1 << (2*j + 1))) >> (2*j + 1)) << 4; + const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4; + const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4; const int x0_0 = (v0 & 0x0F) | x0_0h; const int x1_0 = (v0 >> 4) | x1_0h; @@ -4057,6 +4212,27 @@ bool ggml_is_quantized(enum ggml_type type) { return GGML_IS_QUANTIZED[type]; } +enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { + enum ggml_type wtype = GGML_TYPE_COUNT; + + switch (ftype) { + case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break; + case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break; + case GGML_FTYPE_MOSTLY_Q4_0: 
wtype = GGML_TYPE_Q4_0; break; + case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break; + case GGML_FTYPE_MOSTLY_Q4_2: wtype = GGML_TYPE_Q4_2; break; + case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break; + case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break; + case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break; + case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; + case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; + } + + GGML_ASSERT(wtype != GGML_TYPE_COUNT); + + return wtype; +} + static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) { return tensor->nb[0] > tensor->nb[1]; } @@ -4167,12 +4343,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); } - // initialize cuBLAS - #if defined(GGML_USE_CUBLAS) +#if defined(GGML_USE_CUBLAS) ggml_init_cublas(); - #elif defined(GGML_USE_CLBLAST) +#elif defined(GGML_USE_CLBLAST) ggml_cl_init(); - #endif +#endif is_first_call = false; } @@ -4253,7 +4428,7 @@ void ggml_free(struct ggml_context * ctx) { } size_t ggml_used_mem(const struct ggml_context * ctx) { - return ctx->objects_end->offs + ctx->objects_end->size; + return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size; } size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) { @@ -7943,7 +8118,7 @@ static void ggml_compute_forward_rms_norm( // ggml_compute_forward_mul_mat -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) // helper function to determine if it is better to use BLAS or not // for large matrices, BLAS is faster static bool ggml_compute_forward_mul_mat_use_blas( @@ -7959,12 +8134,9 @@ static bool ggml_compute_forward_mul_mat_use_blas( const int64_t ne1 = dst->ne[1]; // TODO: find the optimal values for these - if ( -#if !defined(GGML_USE_CUBLAS) - ggml_is_contiguous(src0) && + if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && -#endif - ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) { + (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ return true; @@ -7972,7 +8144,6 @@ static bool ggml_compute_forward_mul_mat_use_blas( return false; } - #endif static void ggml_compute_forward_mul_mat_f32( @@ -7988,7 +8159,7 @@ static void ggml_compute_forward_mul_mat_f32( const int64_t ne02 = src0->ne[2]; const int64_t ne03 = src0->ne[3]; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) const int64_t ne10 = src1->ne[0]; #endif const int64_t ne11 = src1->ne[1]; @@ -8045,7 +8216,16 @@ static void ggml_compute_forward_mul_mat_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_CUBLAS) + if (ggml_cuda_can_mul_mat(src0, src1, dst)) { + if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { + ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize); + } + return; + } +#endif + +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(src0, 
src1, dst)) { if (params->ith != 0) { return; @@ -8059,43 +8239,13 @@ static void ggml_compute_forward_mul_mat_f32( return; } -#if defined(GGML_USE_CUBLAS) - const float alpha = 1.0f; - const float beta = 0.0f; - const int x_ne = ne01 * ne00; - const int y_ne = ne11 * ne10; - const int d_ne = ne11 * ne01; - - size_t x_size, y_size, d_size; - float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size); - float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size); - float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size); -#endif - for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { -#if !defined(GGML_USE_CUBLAS) const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); -#endif float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); -#if defined(GGML_USE_CUBLAS) - // copy data to device - CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream)); - CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream)); - - // compute - CUBLAS_CHECK( - cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N, - ne01, ne11, ne10, - &alpha, d_X, ne00, - d_Y, ne10, - &beta, d_D, ne01)); - - // copy data to host - CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream)); -#elif defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_CLBLAST) // zT = y * xT ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, ne11, ne01, ne10, @@ -8112,12 +8262,6 @@ static void ggml_compute_forward_mul_mat_f32( #endif } } -#if defined(GGML_USE_CUBLAS) - CUDA_CHECK(cudaStreamSynchronize(g_cudaStream)); - ggml_cuda_pool_free(d_X, x_size); - ggml_cuda_pool_free(d_Y, y_size); - ggml_cuda_pool_free(d_D, d_size); -#endif //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); return; @@ -8247,7 +8391,16 @@ static void ggml_compute_forward_mul_mat_f16_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_CUBLAS) + if (ggml_cuda_can_mul_mat(src0, src1, dst)) { + if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { + ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize); + } + return; + } +#endif + +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { GGML_ASSERT(nb10 == sizeof(float)); @@ -8263,37 +8416,8 @@ static void ggml_compute_forward_mul_mat_f16_f32( return; } -#if defined(GGML_USE_CUBLAS) - const float alpha = 1.0f; - const float beta = 0.0f; - const int x_ne = ne01 * ne00; - const int y_ne = ne11 * ne10; - const int d_ne = ne11 * ne01; - - size_t x_size, y_size, d_size; - ggml_fp16_t * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size); - ggml_fp16_t * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size); - float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size); -#endif for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { -#if defined(GGML_USE_CUBLAS) - // copy src0 while converting src1 - CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream)); - - // with cuBlAS, instead of converting src0 to fp32, we convert src1 to fp16 - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + (ne11 * ne10) * (i03 
* ne02 + i02); - { - size_t id = 0; - for (int64_t i01 = 0; i01 < ne11; ++i01) { - for (int64_t i00 = 0; i00 < ne10; ++i00) { - wdata[id++] = GGML_FP32_TO_FP16(*(float *) ((char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10)); - } - } - - assert(id*sizeof(ggml_fp16_t) <= params->wsize); - } -#else float * const wdata = params->wdata; { size_t id = 0; @@ -8305,28 +8429,8 @@ static void ggml_compute_forward_mul_mat_f16_f32( assert(id*sizeof(float) <= params->wsize); } -#endif -#if defined(GGML_USE_CUBLAS) - const ggml_fp16_t * y = (ggml_fp16_t *) wdata; - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - // copy data to device - CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(ggml_fp16_t) * y_ne, cudaMemcpyHostToDevice, g_cudaStream)); - - // compute - CUBLAS_CHECK( - cublasGemmEx(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N, - ne01, ne11, ne10, - &alpha, d_X, CUDA_R_16F, ne00, - d_Y, CUDA_R_16F, ne10, - &beta, d_D, CUDA_R_32F, ne01, - CUBLAS_COMPUTE_32F, - CUBLAS_GEMM_DEFAULT)); - - // copy data to host - CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream)); -#elif defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_CLBLAST) const float * x = wdata; const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); @@ -8355,12 +8459,6 @@ static void ggml_compute_forward_mul_mat_f16_f32( } } -#if defined(GGML_USE_CUBLAS) - CUDA_CHECK(cudaStreamSynchronize(g_cudaStream)); - ggml_cuda_pool_free(d_X, x_size); - ggml_cuda_pool_free(d_Y, y_size); - ggml_cuda_pool_free(d_D, d_size); -#endif /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/ return; @@ -8513,7 +8611,16 @@ static void ggml_compute_forward_mul_mat_q_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_CUBLAS) + if (ggml_cuda_can_mul_mat(src0, src1, dst)) { + if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { + ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize); + } + return; + } +#endif + +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { if (params->ith != 0) { return; @@ -8527,25 +8634,8 @@ static void ggml_compute_forward_mul_mat_q_f32( return; } -#if defined(GGML_USE_CUBLAS) - const float alpha = 1.0f; - const float beta = 0.0f; - const int x_ne = ne01 * ne00; - const int y_ne = ne11 * ne10; - const int d_ne = ne11 * ne01; - - size_t x_size, y_size, d_size, q_size; - float * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size); - float * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size); - float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size); - void * d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size); - - const dequantize_row_q_cuda_t dequantize_row_q_cuda = ggml_get_dequantize_row_q_cuda(type); - GGML_ASSERT(dequantize_row_q_cuda != NULL); -#else float * const wdata = params->wdata; dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; -#endif for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { @@ -8553,14 +8643,7 @@ static void ggml_compute_forward_mul_mat_q_f32( float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); -#if defined(GGML_USE_CUBLAS) - // copy and dequantize on device - 
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, src0, i03, i02, g_cudaStream2)); - - dequantize_row_q_cuda(d_Q, d_X, x_ne, g_cudaStream2); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaEventRecord(g_cudaEvent, g_cudaStream2)); -#elif defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_CLBLAST) const void* x = (char *) src0->data + i03*nb03 + i02*nb02; #else { @@ -8576,24 +8659,7 @@ static void ggml_compute_forward_mul_mat_q_f32( const float * x = wdata; #endif -#if defined(GGML_USE_CUBLAS) - // copy data to device - CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream)); - - // wait for dequantization - CUDA_CHECK(cudaStreamWaitEvent(g_cudaStream, g_cudaEvent, 0)); - - // compute - CUBLAS_CHECK( - cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N, - ne01, ne11, ne10, - &alpha, d_X, ne00, - d_Y, ne10, - &beta, d_D, ne01)); - - // copy data to host - CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream)); -#elif defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_CLBLAST) // zT = y * xT ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, ne11, ne01, ne10, @@ -8611,13 +8677,6 @@ static void ggml_compute_forward_mul_mat_q_f32( } } -#if defined(GGML_USE_CUBLAS) - CUDA_CHECK(cudaStreamSynchronize(g_cudaStream)); - ggml_cuda_pool_free(d_X, x_size); - ggml_cuda_pool_free(d_Y, y_size); - ggml_cuda_pool_free(d_D, d_size); - ggml_cuda_pool_free(d_Q, q_size); -#endif //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); return; @@ -11601,18 +11660,21 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) size_t cur = 0; +#if defined(GGML_USE_CUBLAS) + if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) { + node->n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node); + } + else +#endif if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; // TODO: this actually is doing nothing // the threads are still spinning -#if defined(GGML_USE_CUBLAS) - // with cuBLAS, we need memory for the full 3D / 4D data of src1 - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); -#else // here we need memory just for single 2D matrix from src0 cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); -#endif } else { cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); } @@ -11621,13 +11683,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) #endif } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) { cur = 0; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; } #endif } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if 
defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); @@ -12899,8 +12961,8 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * memcpy(&qh, &y[i].qh, sizeof(qh)); for (int l = 0; l < QK5_0; l += 2) { - const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4; - const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4; + const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4; + const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4; // cast to 16 bins const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2; @@ -12929,8 +12991,8 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * memcpy(&qh, &y[i].qh, sizeof(qh)); for (int l = 0; l < QK5_1; l += 2) { - const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4; - const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4; + const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4; + const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4; // cast to 16 bins const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2; diff --git a/ggml.h b/ggml.h index c1c5495c63f44..ef5a048c3b7e4 100644 --- a/ggml.h +++ b/ggml.h @@ -197,6 +197,14 @@ #define GGML_MAX_OPT 4 #define GGML_DEFAULT_N_THREADS 4 +#define GGML_ASSERT(x) \ + do { \ + if (!(x)) { \ + fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ + abort(); \ + } \ + } while (0) + #ifdef __cplusplus extern "C" { #endif @@ -212,6 +220,9 @@ extern "C" { GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x); GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x); + GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n); + GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n); + struct ggml_object; struct ggml_context; @@ -232,6 +243,20 @@ extern "C" { GGML_TYPE_COUNT, }; + // model file types + enum ggml_ftype { + GGML_FTYPE_UNKNOWN = -1, + GGML_FTYPE_ALL_F32 = 0, + GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 + GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors + GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors + }; + // available tensor operations: enum ggml_op { GGML_OP_NONE = 0, @@ -385,6 +410,9 @@ extern "C" { GGML_API bool ggml_is_quantized(enum ggml_type type); + // TODO: temporary until model loading of ggml examples is refactored + GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype); + // main GGML_API struct ggml_context * ggml_init(struct ggml_init_params params); diff --git a/llama-util.h b/llama-util.h index ca4dd162f59fe..d531588d58019 100644 --- a/llama-util.h +++ b/llama-util.h @@ -243,7 +243,8 @@ struct llama_mmap { #else static constexpr bool SUPPORTED = false; - llama_mmap(struct llama_file *) { + llama_mmap(struct llama_file *, bool prefetch = true) { + (void)prefetch; throw std::string("mmap not supported"); } #endif @@ -382,8 +383,13 @@ struct llama_mlock { #else static constexpr bool SUPPORTED = false; - void raw_lock(const void * addr, size_t size) { + size_t lock_granularity() { + return (size_t) 65536; + } + + bool raw_lock(const 
void * addr, size_t size) {
         fprintf(stderr, "warning: mlock not supported on this system\n");
+        return false;
     }
 
     void raw_unlock(const void * addr, size_t size) {}
@@ -395,6 +401,8 @@ struct llama_buffer {
     uint8_t * addr = NULL;
     size_t size = 0;
 
+    llama_buffer() = default;
+
     void resize(size_t size) {
         delete[] addr;
         addr = new uint8_t[size];
@@ -404,27 +412,59 @@ struct llama_buffer {
     ~llama_buffer() {
         delete[] addr;
     }
+
+    // disable copy and move
+    llama_buffer(const llama_buffer&) = delete;
+    llama_buffer(llama_buffer&&) = delete;
+    llama_buffer& operator=(const llama_buffer&) = delete;
+    llama_buffer& operator=(llama_buffer&&) = delete;
 };
 
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 struct llama_ctx_buffer {
     uint8_t * addr = NULL;
+    bool is_cuda;
     size_t size = 0;
 
+    llama_ctx_buffer() = default;
+
     void resize(size_t size) {
+        free();
+
+        addr = (uint8_t *) ggml_cuda_host_malloc(size);
         if (addr) {
-            ggml_cuda_host_free(addr);
+            is_cuda = true;
+        }
+        else {
+            // fall back to pageable memory
+            addr = new uint8_t[size];
+            is_cuda = false;
         }
-        addr = (uint8_t *) ggml_cuda_host_malloc(size);
         this->size = size;
     }
 
-    ~llama_ctx_buffer() {
+    void free() {
         if (addr) {
-            ggml_cuda_host_free(addr);
+            if (is_cuda) {
+                ggml_cuda_host_free(addr);
+            }
+            else {
+                delete[] addr;
+            }
         }
+        addr = NULL;
     }
+
+    ~llama_ctx_buffer() {
+        free();
+    }
+
+    // disable copy and move
+    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
+    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
 };
 #else
 typedef llama_buffer llama_ctx_buffer;
diff --git a/llama.cpp b/llama.cpp
index f8b4c8e46b521..868a58a8b0b93 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -727,8 +727,7 @@ struct llama_model_loader {
             LLAMA_ASSERT(offset == lt.size);
         } else if (lt.split_type == SPLIT_BY_COLUMNS) {
             // Let's load the data into temporary buffers to ensure the OS performs large loads.
-            std::vector<llama_buffer> tmp_bufs;
-            tmp_bufs.resize(lt.shards.size());
+            std::vector<llama_buffer> tmp_bufs(lt.shards.size());
             for (size_t i = 0; i < lt.shards.size(); i++) {
                 llama_load_tensor_shard & shard = lt.shards.at(i);
                 llama_file & file = file_loaders.at(shard.file_idx)->file;
@@ -2373,7 +2372,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor
     }
 }
 
-int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
     return ctx->model.kv_self.n;
 }
 
@@ -2387,7 +2386,7 @@ void llama_set_rng_seed(struct llama_context * ctx, int seed) {
 }
 
 // Returns the size of the state
-size_t llama_get_state_size(struct llama_context * ctx) {
+size_t llama_get_state_size(const struct llama_context * ctx) {
     // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
     // for reference, std::mt19937(1337) serializes to 6701 bytes.
     const size_t s_rng_size = sizeof(size_t);
@@ -2567,6 +2566,85 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
     return nread;
 }
 
+bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
+    llama_file file(path_session, "rb");
+
+    // sanity checks
+    {
+        const uint32_t magic = file.read_u32();
+        const uint32_t version = file.read_u32();
+
+        if (!(magic == LLAMA_SESSION_MAGIC && version == LLAMA_SESSION_VERSION)) {
+            fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
+            return false;
+        }
+
+        llama_hparams session_hparams;
+        file.read_raw(&session_hparams, sizeof(llama_hparams));
+
+        if (session_hparams != ctx->model.hparams) {
+            fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
+            return false;
+        }
+    }
+
+    // load the prompt
+    {
+        const uint32_t n_token_count = file.read_u32();
+
+        if (n_token_count > n_token_capacity) {
+            fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity);
+            return false;
+        }
+
+        file.read_raw(tokens_out, sizeof(llama_token) * n_token_count);
+        *n_token_count_out = n_token_count;
+    }
+
+    // restore the context state
+    {
+        const size_t n_state_size_cur = file.size - file.tell();
+        const size_t n_state_size_exp = llama_get_state_size(ctx);
+
+        if (n_state_size_cur != n_state_size_exp) {
+            fprintf(stderr, "%s : the state size in session file didn't match! expected %zu, got %zu\n", __func__, n_state_size_exp, n_state_size_cur);
+            return false;
+        }
+
+        std::vector<uint8_t> state_data(n_state_size_cur);
+        file.read_raw(state_data.data(), n_state_size_cur);
+
+        llama_set_state_data(ctx, state_data.data());
+    }
+
+    return true;
+}
+
+bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) {
+    llama_file file(path_session, "wb");
+
+    file.write_u32(LLAMA_SESSION_MAGIC);
+    file.write_u32(LLAMA_SESSION_VERSION);
+
+    file.write_raw(&ctx->model.hparams, sizeof(llama_hparams));
+
+    // save the prompt
+    file.write_u32((uint32_t) n_token_count);
+    file.write_raw(tokens, sizeof(llama_token) * n_token_count);
+
+    // save the context state
+    {
+        const size_t n_state_size = llama_get_state_size(ctx);
+
+        std::vector<uint8_t> state_data(n_state_size);
+        llama_copy_state_data(ctx, state_data.data());
+
+        file.write_raw(state_data.data(), n_state_size);
+    }
+
+    return true;
+}
+
 int llama_eval(
         struct llama_context * ctx,
         const llama_token * tokens,
@@ -2605,15 +2683,15 @@ int llama_tokenize(
     return res.size();
 }
 
-int llama_n_vocab(struct llama_context * ctx) {
+int llama_n_vocab(const struct llama_context * ctx) {
     return ctx->vocab.id_to_token.size();
 }
 
-int llama_n_ctx(struct llama_context * ctx) {
+int llama_n_ctx(const struct llama_context * ctx) {
     return ctx->model.hparams.n_ctx;
 }
 
-int llama_n_embd(struct llama_context * ctx) {
+int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }
 
@@ -2625,7 +2703,7 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
}
 
-const char * llama_token_to_str(struct llama_context * ctx, llama_token token) {
+const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) {
     if (token >= llama_n_vocab(ctx)) {
         return nullptr;
     }
@@ -2694,57 +2772,3 @@ const char * llama_print_system_info(void) {
 std::vector<std::pair<std::string, struct ggml_tensor *>>&
llama_internal_get_tensor_map(struct llama_context * ctx) { return ctx->model.tensors_by_name; } - -size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - // TODO leverage mmap - llama_file file(path_session, "rb"); - const uint32_t magic = file.read_u32(); - const uint32_t version = file.read_u32(); - - if (!(magic == 'ggsn' && version == 0)) { - fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); - return 0; - } - - llama_hparams session_hparams; - file.read_raw(&session_hparams, sizeof(llama_hparams)); - - // REVIEW - if (session_hparams != ctx->model.hparams) { - fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__); - return 0; - } - - const uint32_t n_token_count = file.read_u32(); - LLAMA_ASSERT(n_token_capacity >= n_token_count); - file.read_raw(tokens_out, sizeof(llama_token) * n_token_count); - *n_token_count_out = n_token_count; - - const size_t n_state_size = file.size - file.tell(); - const size_t n_orig_state_size = llama_get_state_size(ctx); - if (n_state_size != n_orig_state_size) { - fprintf(stderr, "%s : failed to validate state size\n", __func__); - } - std::unique_ptr state_data(new uint8_t[n_state_size]); - file.read_raw(state_data.get(), n_state_size); - return llama_set_state_data(ctx, state_data.get()); -} - -size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { - // TODO save temp & swap - llama_file file(path_session, "wb"); - - const size_t n_state_size = llama_get_state_size(ctx); - std::unique_ptr state_data(new uint8_t[n_state_size]); - llama_copy_state_data(ctx, state_data.get()); - - file.write_u32('ggsn'); // magic - file.write_u32(0); // version - file.write_raw(&ctx->model.hparams, sizeof(llama_hparams)); - - file.write_u32((uint32_t) n_token_count); // REVIEW - file.write_raw(tokens, sizeof(llama_token) * n_token_count); - - file.write_raw(state_data.get(), n_state_size); - return n_state_size; // REVIEW -} diff --git a/llama.h b/llama.h index 34a8f5b3ca52c..2f6ce8d831e6c 100644 --- a/llama.h +++ b/llama.h @@ -19,9 +19,11 @@ # define LLAMA_API #endif -#define LLAMA_FILE_VERSION 1 -#define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex -#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files +#define LLAMA_FILE_VERSION 1 +#define LLAMA_FILE_MAGIC 'ggjt' +#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml' +#define LLAMA_SESSION_MAGIC 'ggsn' +#define LLAMA_SESSION_VERSION 0 #ifdef __cplusplus extern "C" { @@ -120,13 +122,13 @@ extern "C" { int n_threads); // Returns the number of tokens in the KV cache - LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx); + LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); // Sets the current rng seed. LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed); // Returns the size in bytes of the state (rng, logits, embedding and kv_cache) - LLAMA_API size_t llama_get_state_size(struct llama_context * ctx); + LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx); // Copies the state to the specified destination address. // Destination needs to have allocated enough memory. 
@@ -138,8 +140,8 @@ extern "C" { LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src); // Save/load session file - LLAMA_API size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); - LLAMA_API size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count); + LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); + LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count); // Run the llama inference to obtain the logits and probabilities for the next token. // tokens + n_tokens is the provided batch of new tokens to process @@ -164,9 +166,9 @@ extern "C" { int n_max_tokens, bool add_bos); - LLAMA_API int llama_n_vocab(struct llama_context * ctx); - LLAMA_API int llama_n_ctx (struct llama_context * ctx); - LLAMA_API int llama_n_embd (struct llama_context * ctx); + LLAMA_API int llama_n_vocab(const struct llama_context * ctx); + LLAMA_API int llama_n_ctx (const struct llama_context * ctx); + LLAMA_API int llama_n_embd (const struct llama_context * ctx); // Token logits obtained from the last call to llama_eval() // The logits for the last token are stored in the last row @@ -180,7 +182,7 @@ extern "C" { LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); // Token Id -> String. Uses the vocabulary in the provided context - LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token); + LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); // Special tokens LLAMA_API llama_token llama_token_bos(); diff --git a/scripts/build-info.cmake b/scripts/build-info.cmake new file mode 100644 index 0000000000000..fb46ed2b5939e --- /dev/null +++ b/scripts/build-info.cmake @@ -0,0 +1,53 @@ +set(TEMPLATE_FILE "${CMAKE_BINARY_DIR}/BUILD_INFO.h.in") +set(HEADER_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h") +set(BUILD_NUMBER 0) +set(BUILD_COMMIT "unknown") + +# Look for git +find_package(Git) +if(NOT Git_FOUND) + execute_process( + COMMAND which git + OUTPUT_VARIABLE GIT_EXECUTABLE + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(NOT GIT_EXECUTABLE STREQUAL "") + set(Git_FOUND TRUE) + message(STATUS "Found Git using 'which': ${GIT_EXECUTABLE}") + else() + message(WARNING "Git not found using 'find_package' or 'which'. Build info will not be accurate. 
Consider installing Git or ensuring it is in the PATH.") + endif() +endif() + +# Get the commit count and hash +if(Git_FOUND) + execute_process( + COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT_VARIABLE HEAD + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE GIT_HEAD_RESULT + ) + execute_process( + COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT_VARIABLE COUNT + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE GIT_COUNT_RESULT + ) + if(GIT_HEAD_RESULT EQUAL 0 AND GIT_COUNT_RESULT EQUAL 0) + set(BUILD_COMMIT ${HEAD}) + set(BUILD_NUMBER ${COUNT}) + endif() +endif() + +# Only write the header if it's changed to prevent unnecessary recompilation +if(EXISTS ${HEADER_FILE}) + file(STRINGS ${HEADER_FILE} CONTENTS REGEX "BUILD_COMMIT \"([^\"]*)\"") + list(GET CONTENTS 0 EXISTING) + if(NOT EXISTING STREQUAL "#define BUILD_COMMIT \"${BUILD_COMMIT}\"") + configure_file(${TEMPLATE_FILE} ${HEADER_FILE}) + endif() +else() + configure_file(${TEMPLATE_FILE} ${HEADER_FILE}) +endif() diff --git a/scripts/build-info.sh b/scripts/build-info.sh new file mode 100755 index 0000000000000..507d7e1537b69 --- /dev/null +++ b/scripts/build-info.sh @@ -0,0 +1,22 @@ +#!/bin/sh + +BUILD_NUMBER="0" +BUILD_COMMIT="unknown" + +REV_LIST=$(git rev-list --count HEAD) +if [ $? -eq 0 ]; then + BUILD_NUMBER=$REV_LIST +fi + +REV_PARSE=$(git rev-parse --short HEAD) +if [ $? -eq 0 ]; then + BUILD_COMMIT=$REV_PARSE +fi + +echo "#ifndef BUILD_INFO_H" +echo "#define BUILD_INFO_H" +echo "" +echo "#define BUILD_NUMBER $BUILD_NUMBER" +echo "#define BUILD_COMMIT \"$BUILD_COMMIT\"" +echo "" +echo "#endif // BUILD_INFO_H"
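
Notes on the changes above:

For reference, scripts/build-info.sh (and the CMake path through scripts/build-info.cmake) emits a header of the shape shown below. The build number and commit hash here are placeholders, and the fprintf line is only a sketch of how a program built against build-info.h might report them; it is not taken from the patch itself.

    /* build-info.h (generated; placeholder values) */
    #ifndef BUILD_INFO_H
    #define BUILD_INFO_H

    #define BUILD_NUMBER 500
    #define BUILD_COMMIT "abc1234"

    #endif // BUILD_INFO_H

    /* typical use from a program that includes build-info.h */
    fprintf(stderr, "build: %d (%s)\n", BUILD_NUMBER, BUILD_COMMIT);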
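The llama_load_session_file / llama_save_session_file functions introduced in llama.h now report success as a bool instead of returning a byte count. A rough usage sketch, assuming an already-initialized llama_context * ctx; the file name and the fixed-size token buffer are illustrative only:

    // try to resume an earlier session; fall back to an empty prompt cache
    llama_token tokens[512];
    size_t n_tokens = 0;

    if (!llama_load_session_file(ctx, "session.bin", tokens, 512, &n_tokens)) {
        fprintf(stderr, "no usable session found, starting fresh\n");
        n_tokens = 0;
    }

    // ... evaluate and sample as usual, appending new tokens to the buffer ...

    if (!llama_save_session_file(ctx, "session.bin", tokens, n_tokens)) {
        fprintf(stderr, "failed to save session\n");
    }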
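Replacing "1 <<" with "1u <<" in the ggml.c bit-extraction expressions above is not purely cosmetic: for the Q5 formats the extracted bit index can reach 31, and shifting a signed 32-bit int left by 31 is undefined behavior in C, so the mask is now built from an unsigned literal. A minimal sketch of the reconstruction those loops perform (the helper name is illustrative and not part of the patch):

    // rebuild one 5-bit quant: the low 4 bits come from the packed nibble,
    // the 5th bit is stored separately in the 32-bit qh field
    // (1u keeps the shift well-defined even when bit == 31)
    static inline uint8_t q5_reconstruct(uint8_t nibble, uint32_t qh, int bit) {
        const uint8_t high = (uint8_t) (((qh & (1u << bit)) >> bit) << 4);
        return (uint8_t) ((nibble & 0x0F) | high); // value in 0..31
    }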