From 0849f322c33fb86a5b472c7f65d672aea7276a34 Mon Sep 17 00:00:00 2001
From: Justine Tunney
Date: Wed, 31 Jul 2024 03:18:22 -0700
Subject: [PATCH] Get CUDA and Metal GPU working in whisperfile

---
 llamafile/metal.c       |  8 ++++++++
 whisper.cpp/main.cpp    | 42 +++++++++++++++++++++++++++++++++++-------
 whisper.cpp/server.cpp  | 20 +++++++++++++++++---
 whisper.cpp/whisper.cpp | 37 +++++++++++++++++++++--------------
 whisper.cpp/whisper.h   |  1 -
 5 files changed, 83 insertions(+), 25 deletions(-)

diff --git a/llamafile/metal.c b/llamafile/metal.c
index 816b3841d7..efa0f07460 100644
--- a/llamafile/metal.c
+++ b/llamafile/metal.c
@@ -74,6 +74,7 @@ static struct Metal {
     typeof(ggml_backend_metal_set_n_cb) *backend_set_n_cb;
     typeof(ggml_backend_metal_log_set_callback) *log_set_callback;
     typeof(ggml_backend_reg_metal_init) *reg_init;
+    typeof(ggml_backend_metal_supports_family) *supports_family;
 } ggml_metal;
 
 static const char *Dlerror(void) {
@@ -217,6 +218,7 @@ static bool LinkMetal(const char *dso) {
     ok &= !!(ggml_metal.backend_set_n_cb = cosmo_dlsym(lib, "ggml_backend_metal_set_n_cb"));
     ok &= !!(ggml_metal.log_set_callback = cosmo_dlsym(lib, "ggml_backend_metal_log_set_callback"));
     ok &= !!(ggml_metal.reg_init = cosmo_dlsym(lib, "ggml_backend_reg_metal_init"));
+    ok &= !!(ggml_metal.supports_family = cosmo_dlsym(lib, "ggml_backend_metal_supports_family"));
     if (!ok) {
         tinylog(Dlerror(), ": not all symbols could be imported\n", NULL);
         return false;
@@ -318,3 +320,9 @@ ggml_backend_t ggml_backend_reg_metal_init(const char *params, void *user_data)
         return 0;
     return ggml_metal.reg_init(params, user_data);
 }
+
+bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
+    if (!llamafile_has_metal())
+        return 0;
+    return ggml_metal.supports_family(backend, family);
+}
diff --git a/whisper.cpp/main.cpp b/whisper.cpp/main.cpp
index 0d18c59304..8ff8bbc36c 100644
--- a/whisper.cpp/main.cpp
+++ b/whisper.cpp/main.cpp
@@ -14,6 +14,9 @@
 #include
 #include
 
+#include "llamafile/llamafile.h"
+#include "llamafile/debug.h"
+
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
@@ -28,11 +31,11 @@ static void replace_all(std::string & s, const std::string & search, const std::
     }
 }
 
-int32_t get_num_physical_cores();
+int cpu_get_num_math();
 
 // command-line parameters
 struct whisper_params {
-    int32_t n_threads = std::min(4, get_num_physical_cores());
+    int32_t n_threads = cpu_get_num_math();
     int32_t n_processors = 1;
     int32_t offset_t_ms = 0;
     int32_t offset_n = 0;
@@ -72,7 +75,6 @@ struct whisper_params {
     bool print_progress = false;
     bool no_timestamps = false;
     bool log_score = false;
-    bool use_gpu = true;
    bool flash_attn = false;
 
     std::string language = "en";
@@ -122,6 +124,33 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
             continue;
         }
 
+        if (arg == "--log-disable") {
+            FLAG_log_disable = true;
+        } else if (arg == "--trap") {
+            FLAG_trap = true;
+            FLAG_unsecure = true; // for better backtraces
+            llamafile_trapping_enabled(+1);
+        } else if (arg == "--unsecure") {
+            FLAG_unsecure = true;
+        } else if (arg == "--nocompile") {
+            FLAG_nocompile = true;
+        } else if (arg == "--recompile") {
+            FLAG_recompile = true;
+        } else if (arg == "--tinyblas") {
+            FLAG_tinyblas = true; // undocumented
+        } else if (arg == "--gpu") {
+            if (++i >= argc) {
+                fprintf(stderr, "error: missing --gpu flag value\n");
+                exit(1);
+            }
+            FLAG_gpu = llamafile_gpu_parse(argv[i]);
+            if (FLAG_gpu == LLAMAFILE_GPU_ERROR) {
+                fprintf(stderr, "error: invalid --gpu flag value: %s\n", argv[i]);
+                exit(1);
+            }
+            return true;
+        } else
+
         if (arg == "-h" || arg == "--help") {
             whisper_print_usage(argc, argv, params);
             exit(0);
@@ -157,7 +186,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
         else if (arg == "-oj" || arg == "--output-json") { params.output_jsn = true; }
         else if (arg == "-ojf" || arg == "--output-json-full"){ params.output_jsn_full = params.output_jsn = true; }
         else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(argv[++i]); }
-        else if (arg == "-np" || arg == "--no-prints") { params.no_prints = true; }
+        else if (arg == "-np" || arg == "--no-prints") { params.no_prints = true; FLAG_log_disable = true; }
         else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
         else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
         else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; }
@@ -170,7 +199,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
         else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
         else if (arg == "-dtw" || arg == "--dtw") { params.dtw = argv[++i]; }
         else if (arg == "-ls" || arg == "--log-score") { params.log_score = true; }
-        else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
+        else if (arg == "-ng" || arg == "--no-gpu") { FLAG_gpu = LLAMAFILE_GPU_DISABLE; }
         else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; }
         else if ( arg == "--suppress-regex") { params.suppress_regex = argv[++i]; }
         else if ( arg == "--grammar") { params.grammar = argv[++i]; }
@@ -236,7 +265,7 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
     fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str());
     fprintf(stderr, " -dtw MODEL --dtw MODEL [%-7s] compute token-level timestamps\n", params.dtw.c_str());
     fprintf(stderr, " -ls, --log-score [%-7s] log best decoder scores of tokens\n", params.log_score?"true":"false");
-    fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
+    fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", FLAG_gpu == LLAMAFILE_GPU_DISABLE ? "false" : "true");
     fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention\n", params.flash_attn ? "true" : "false");
     fprintf(stderr, " --suppress-regex REGEX [%-7s] regular expression matching tokens to suppress\n", params.suppress_regex.c_str());
     fprintf(stderr, " --grammar GRAMMAR [%-7s] GBNF grammar to guide decoding\n", params.grammar.c_str());
@@ -983,7 +1012,6 @@ int main(int argc, char ** argv) {
 
     struct whisper_context_params cparams = whisper_context_default_params();
 
-    cparams.use_gpu = params.use_gpu;
     cparams.flash_attn = params.flash_attn;
 
     if (!params.dtw.empty()) {
diff --git a/whisper.cpp/server.cpp b/whisper.cpp/server.cpp
index 23b27a8bf7..fbd1246e13 100644
--- a/whisper.cpp/server.cpp
+++ b/whisper.cpp/server.cpp
@@ -79,7 +79,6 @@ struct whisper_params {
     bool print_realtime = false;
     bool print_progress = false;
     bool no_timestamps = false;
-    bool use_gpu = true;
     bool flash_attn = false;
 
     std::string language = "en";
@@ -184,7 +183,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
         else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
         else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
         else if (arg == "-dtw" || arg == "--dtw") { params.dtw = argv[++i]; }
-        else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
+        else if (arg == "-ng" || arg == "--no-gpu") { FLAG_gpu = LLAMAFILE_GPU_DISABLE; }
         else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; }
         // server params
         else if ( arg == "--port") { sparams.port = std::stoi(argv[++i]); }
@@ -194,6 +193,22 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
         else if ( arg == "--convert") { sparams.ffmpeg_converter = true; }
         else if ( arg == "--recompile") { FLAG_recompile = true; }
         else if ( arg == "--nocompile") { FLAG_nocompile = true; }
+        else if ( arg == "--tinyblas") { FLAG_tinyblas = true; }
+        else if ( arg == "--unsecure") { FLAG_unsecure = true; }
+
+        else if (arg == "--gpu") {
+            if (++i >= argc) {
+                fprintf(stderr, "error: missing --gpu flag value\n");
+                exit(1);
+            }
+            FLAG_gpu = llamafile_gpu_parse(argv[i]);
+            if (FLAG_gpu == LLAMAFILE_GPU_ERROR) {
+                fprintf(stderr, "error: invalid --gpu flag value: %s\n", argv[i]);
+                exit(1);
+            }
+            return true;
+        }
+
         else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             whisper_print_usage(argc, argv, params, sparams);
@@ -515,7 +530,6 @@ int main(int argc, char ** argv) {
     // whisper init
     struct whisper_context_params cparams = whisper_context_default_params();
 
-    cparams.use_gpu = params.use_gpu;
     cparams.flash_attn = params.flash_attn;
 
     if (!params.dtw.empty()) {
diff --git a/whisper.cpp/whisper.cpp b/whisper.cpp/whisper.cpp
index 2f29be0bf4..f1fb860a7b 100644
--- a/whisper.cpp/whisper.cpp
+++ b/whisper.cpp/whisper.cpp
@@ -2,6 +2,9 @@
 // vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
 #include "whisper.h"
+#define GGML_USE_CUDA
+#define GGML_USE_METAL
+
 
 #ifdef GGML_USE_METAL
 #include "llama.cpp/ggml-metal.h"
 #endif
@@ -15,6 +18,8 @@
 #include "llama.cpp/ggml-alloc.h"
 #include "llama.cpp/ggml-backend.h"
 
+#include "llamafile/llamafile.h"
+
 #include "whisper-mel.hpp"
 
 #include
@@ -208,7 +213,8 @@ static bool ggml_graph_compute_helper(
 // and X_1 and Y_1 are the remaining views. X_1 and Y_1 end up being small matrices that can be processed with more
 // general-purpose kernels
 //
-static struct ggml_tensor * ggml_mul_mat_pad(struct ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * y, int pad = 32) {
+static struct ggml_tensor * ggml_mul_mat_pad(struct ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * y) {
+    int pad = 32;
     // use padding only if dimension 0 is at least 8 times larger than the padding
     // else we won't get much benefit from the optimization
     const int n_pad_req = 8;
@@ -231,7 +237,7 @@ static struct ggml_tensor * ggml_mul_mat_pad(struct ggml_context * ctx, struct g
 // TODO: check if other platforms can benefit from this optimization
 // TODO: CUDA is currently broken - seems ggml_mul_mat does not handle views correctly
 #if defined(GGML_USE_METAL)
-#define ggml_mul_mat ggml_mul_mat_pad
+#define ggml_mul_mat (llamafile_has_metal() ? ggml_mul_mat_pad : ggml_mul_mat)
 #endif
 
 // available whisper models
@@ -1067,18 +1073,18 @@ static void whisper_kv_cache_seq_cp(
 }
 
 static uint32_t whisper_kv_cache_get_padding(const struct whisper_context & wctx) {
-    if (!wctx.params.flash_attn || !wctx.params.use_gpu) {
+    if (!wctx.params.flash_attn) {
         return 1u;
     }
 
 #ifdef GGML_USE_METAL
-    if (wctx.params.use_gpu) {
+    if (llamafile_has_metal()) {
         return 32u;
     }
 #endif
 
 #ifdef GGML_USE_CUDA
-    if (wctx.params.use_gpu) {
+    if (llamafile_has_cuda()) {
         return 256u;
     }
 #endif
@@ -1221,7 +1227,7 @@ static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & pa
     ggml_backend_t result = NULL;
 
 #ifdef GGML_USE_CUDA
-    if (params.use_gpu) {
+    if (llamafile_has_cuda()) {
         WHISPER_LOG_INFO("%s: using CUDA backend\n", __func__);
         result = ggml_backend_cuda_init(params.gpu_device);
         if (!result) {
@@ -1231,7 +1237,7 @@ static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & pa
 #endif
 
 #ifdef GGML_USE_METAL
-    if (params.use_gpu) {
+    if (!result && llamafile_has_metal()) {
         WHISPER_LOG_INFO("%s: using Metal backend\n", __func__);
         ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
         result = ggml_backend_metal_init();
@@ -1299,14 +1305,14 @@ static std::vector whisper_backend_init(const whisper_context_pa
 static ggml_backend_buffer_type_t whisper_default_buffer_type(const whisper_context_params & params) {
     ggml_backend_buffer_type_t result = nullptr;
 
-    params.use_gpu || (result = ggml_backend_cpu_buffer_type());
-
 #ifdef GGML_USE_CUDA
-    result || (result = ggml_backend_cuda_buffer_type(params.gpu_device));
+    if (!result && llamafile_has_cuda())
+        result = ggml_backend_cuda_buffer_type(params.gpu_device);
 #endif
 
 #ifdef GGML_USE_METAL
-    result || (result = ggml_backend_metal_buffer_type());
+    if (!result && llamafile_has_metal())
+        result = ggml_backend_metal_buffer_type();
 #endif
 
 #ifdef GGML_USE_SYCL
@@ -1317,7 +1323,8 @@ static ggml_backend_buffer_type_t whisper_default_buffer_type(const whisper_cont
     result || (result = ggml_backend_vk_buffer_type(params.gpu_device));
 #endif
 
-    result || (result = ggml_backend_cpu_buffer_type());
+    if (!result)
+        result = ggml_backend_cpu_buffer_type();
 
     return result;
 }
@@ -3585,7 +3592,6 @@ int whisper_ctx_init_openvino_encoder(
 
 struct whisper_context_params whisper_context_default_params() {
     struct whisper_context_params result = {
-        /*.use_gpu =*/ true,
         /*.flash_attn =*/ false,
         /*.gpu_device =*/ 0,
 
@@ -3690,7 +3696,8 @@ struct whisper_context * whisper_init_with_params_no_state(struct whisper_model_
         params.dtw_token_timestamps = false;
     }
 
-    WHISPER_LOG_INFO("%s: use gpu = %d\n", __func__, params.use_gpu);
+    WHISPER_LOG_INFO("%s: cuda gpu = %d\n", __func__, llamafile_has_cuda());
+    WHISPER_LOG_INFO("%s: metal gpu = %d\n", __func__, llamafile_has_metal());
     WHISPER_LOG_INFO("%s: flash attn = %d\n", __func__, params.flash_attn);
     WHISPER_LOG_INFO("%s: gpu_device = %d\n", __func__, params.gpu_device);
     WHISPER_LOG_INFO("%s: dtw = %d\n", __func__, params.dtw_token_timestamps);
@@ -7444,6 +7451,8 @@ static void whisper_log_internal(ggml_log_level level, const char * format, ...)
 static void whisper_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
     (void) level;
     (void) user_data;
+    if (FLAG_log_disable)
+        return;
     fputs(text, stderr);
     fflush(stderr);
 }
diff --git a/whisper.cpp/whisper.h b/whisper.cpp/whisper.h
index 9608f02585..b5091c6c61 100644
--- a/whisper.cpp/whisper.h
+++ b/whisper.cpp/whisper.h
@@ -114,7 +114,6 @@ extern "C" {
     } whisper_aheads;
 
     struct whisper_context_params {
-        bool use_gpu;
         bool flash_attn;
         int gpu_device; // CUDA device