From 0dc95d59d5fadb3cf2547acc30c0b17c4f10c46d Mon Sep 17 00:00:00 2001 From: Ryan Metcalfe <107415876+RyanMetcalfeInt8@users.noreply.github.com> Date: Tue, 4 Jul 2023 08:56:11 -0400 Subject: [PATCH] whisper : add OpenVINO support (#1037) * openvino: use OpenVINO encoder inference * openvino: add python script for OpenVINO model generation * whisper: Fix 'unused' warnings when OpenVINO isn't enabled in build * Apply suggestions from code review Co-authored-by: Georgi Gerganov * whisper: Fix compilation error * whisper: revert whisper_get_openvino_path_encoder & whisper_get_openvino_path_cache to non-const func signatures * cmake: Add openvino-encoder as separate object target * whisper : minor style fixes * minor : indentation fixes --------- Co-authored-by: Georgi Gerganov --- CMakeLists.txt | 28 +++++ examples/main/main.cpp | 7 ++ models/convert-whisper-to-openvino.py | 53 +++++++++ models/openvino-conversion-requirements.txt | 2 + openvino/whisper-openvino-encoder.cpp | 108 +++++++++++++++++ openvino/whisper-openvino-encoder.h | 31 +++++ whisper.cpp | 123 +++++++++++++++++++- whisper.h | 18 +++ 8 files changed, 367 insertions(+), 3 deletions(-) create mode 100644 models/convert-whisper-to-openvino.py create mode 100644 models/openvino-conversion-requirements.txt create mode 100644 openvino/whisper-openvino-encoder.cpp create mode 100644 openvino/whisper-openvino-encoder.h diff --git a/CMakeLists.txt b/CMakeLists.txt index db5d3478b26..88021e01470 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,6 +54,8 @@ option(WHISPER_NO_AVX2 "whisper: disable AVX2" OFF) option(WHISPER_NO_FMA "whisper: disable FMA" OFF) option(WHISPER_NO_F16C "whisper: disable F16c" OFF) +option(WHISPER_OPENVINO "whisper: support for OpenVINO" OFF) + if (APPLE) option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF) option(WHISPER_COREML "whisper: enable Core ML framework" OFF) @@ -192,6 +194,10 @@ if (WHISPER_CLBLAST) endif() endif() +if( WHISPER_OPENVINO ) + 
find_package(OpenVINO REQUIRED COMPONENTS Runtime) +endif() + # compiler flags if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) @@ -297,6 +303,24 @@ if (WHISPER_COREML) ) endif() +if (WHISPER_OPENVINO) + set(TARGET whisper.openvino) + + add_library(${TARGET} OBJECT + openvino/whisper-openvino-encoder.h + openvino/whisper-openvino-encoder.cpp + ) + + target_include_directories(${TARGET} PUBLIC + . + ) + + set_property(TARGET ${TARGET} PROPERTY POSITION_INDEPENDENT_CODE ON) + set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_OPENVINO) + + target_link_libraries(${TARGET} PRIVATE openvino::runtime) +endif() + # # whisper - this is the main library of the project # @@ -322,6 +346,10 @@ if (WHISPER_COREML) target_link_libraries(${TARGET} PRIVATE whisper.coreml) endif() +if (WHISPER_OPENVINO) + target_link_libraries(${TARGET} PRIVATE whisper.openvino) +endif() + if (MSVC) target_link_libraries(${TARGET} PRIVATE ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 344b6877882..9a68367186d 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -95,6 +95,8 @@ struct whisper_params { // [TDRZ] speaker turn string std::string tdrz_speaker_turn = " [SPEAKER_TURN]"; // TODO: set from command line + std::string openvino_encode_device = "CPU"; + std::vector fname_inp = {}; std::vector fname_out = {}; }; @@ -155,6 +157,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { else if ( arg == "--prompt") { params.prompt = argv[++i]; } else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); } + else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); whisper_print_usage(argc, argv, params); @@ -207,6 +210,7 @@ void whisper_print_usage(int /*argc*/, char 
** argv, const whisper_params & para fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", params.prompt.c_str()); fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str()); fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", ""); + fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str()); fprintf(stderr, "\n"); } @@ -809,6 +813,9 @@ int main(int argc, char ** argv) { return 3; } + // initialize openvino encoder. This has no effect on whisper.cpp builds that don't have OpenVINO configured. + whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr); + for (int f = 0; f < (int) params.fname_inp.size(); ++f) { const auto fname_inp = params.fname_inp[f]; const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f]; diff --git a/models/convert-whisper-to-openvino.py b/models/convert-whisper-to-openvino.py new file mode 100644 index 00000000000..31cf29abdf5 --- /dev/null +++ b/models/convert-whisper-to-openvino.py @@ -0,0 +1,53 @@ +import argparse +import torch +from whisper import load_model +import os +from openvino.tools import mo +from openvino.runtime import serialize +import shutil + +def convert_encoder(hparams, encoder, mname): + encoder.eval() + + mel = torch.zeros((1, 80, 3000)) + + onnx_folder=os.path.join(os.path.dirname(__file__),"onnx_encoder") + + #create a directory to store the onnx model, and other collateral that is saved during onnx export procedure + if not os.path.isdir(onnx_folder): + os.makedirs(onnx_folder) + + onnx_path = os.path.join(onnx_folder, "whisper_encoder.onnx") + + torch.onnx.export( + encoder, + mel, + onnx_path, + input_names=["mel"], + output_names=["output_features"] + ) + + # use model optimizer to convert onnx to OpenVINO IR format + encoder_model = mo.convert_model(onnx_path, 
compress_to_fp16=True) + serialize(encoder_model, xml_path='ggml-' + mname + '-encoder-openvino.xml') + + #cleanup + if os.path.isdir(onnx_folder): + shutil.rmtree(onnx_folder) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1)", required=True) + args = parser.parse_args() + + if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1"]: + raise ValueError("Invalid model name") + + whisper = load_model(args.model).cpu() + hparams = whisper.dims + + encoder = whisper.encoder + + # Convert encoder to onnx + convert_encoder(hparams, encoder, args.model) diff --git a/models/openvino-conversion-requirements.txt b/models/openvino-conversion-requirements.txt new file mode 100644 index 00000000000..5bfd95db88e --- /dev/null +++ b/models/openvino-conversion-requirements.txt @@ -0,0 +1,2 @@ +openvino-dev[pytorch,onnx] +openai-whisper \ No newline at end of file diff --git a/openvino/whisper-openvino-encoder.cpp b/openvino/whisper-openvino-encoder.cpp new file mode 100644 index 00000000000..11aef39dd43 --- /dev/null +++ b/openvino/whisper-openvino-encoder.cpp @@ -0,0 +1,108 @@ +#include "openvino/whisper-openvino-encoder.h" +#include "ggml.h" +#include +#include + +struct whisper_openvino_context { + ov::InferRequest inferRequest; +}; + +struct whisper_openvino_context * whisper_openvino_init(const char* path_model, + const char* device, + const char* cache_dir) +{ + if (!path_model || !device) { + fprintf(stderr, "%s: path_model and/or device is null\n", __func__); + return nullptr; + } + + fprintf(stderr, "%s: path_model = %s, device = %s, cache_dir = %s\n", + __func__, path_model, device, cache_dir ? 
cache_dir : "(not set)"); + + whisper_openvino_context *context = new whisper_openvino_context; + try { + ov::Core core; + + if (cache_dir) { + // enables caching of device-specific 'blobs' during core.compile_model + // routine. This speeds up calls to compile_model for successive runs. + core.set_property(ov::cache_dir(cache_dir)); + } + + //Read the OpenVINO encoder IR (.xml/.bin) from disk, producing an ov::Model object. + std::shared_ptr model = core.read_model(path_model); + + // Produce a compiled-model object, given the device ("CPU", "GPU", etc.) + auto compiledModel = core.compile_model(model, device); + + // From the compiled model object, create an infer request. This is the thing that we + // we will use later on to trigger inference execution. + context->inferRequest = compiledModel.create_infer_request(); + } + catch (const std::exception& error) { + std::cout << "in openvino encoder compile routine: exception: " << error.what() << std::endl; + delete context; + context = nullptr; + } + + return context; +} + +void whisper_openvino_free(struct whisper_openvino_context * ctx) { + if( ctx ) { + delete ctx; + } +} + +int whisper_openvino_encode( + whisper_openvino_context* ctx, + ggml_tensor* mel, + ggml_tensor* out) { + + if (!ctx || !mel || !out) { + fprintf(stderr, "%s: Error! ctx / mel / out is null\n", __func__); + return 0; + } + + if (mel->n_dims != 2) { + fprintf(stderr, "%s: Error! mel ggml_tensor expected to have n_dims=2, but it has n_dims=%d\n", + __func__, mel->n_dims); + return 0; + } + + if (out->n_dims != 2) { + fprintf(stderr, "%s: Error! 
out ggml_tensor expected to have n_dims=2, but it has n_dims=%d\n", + __func__, out->n_dims); + return 0; + } + + try { + + //wrap the passed-in mel ggml_tensor as an OpenVINO Tensor object, and set as input tensor to infer request + { + // note, we populate shape & stride dimensions in opposite order from how they are listed in ne / nb arrays + ov::Shape input_shape = { 1, (unsigned long long)mel->ne[1], (unsigned long long)mel->ne[0] }; + ov::Strides input_strides = { mel->nb[2], mel->nb[1], mel->nb[0] }; + ov::Tensor input_tensor(ov::element::f32, input_shape, mel->data, input_strides); + ctx->inferRequest.set_input_tensor(input_tensor); + } + + //wrap the passed-in out ggml_tensor as an OpenVINO Tensor object, and set as output tensor to infer request + { + // note, we populate shape & stride dimensions in opposite order from how they are listed in ne / nb arrays + ov::Shape output_shape = { 1, (unsigned long long)out->ne[1], (unsigned long long)out->ne[0] }; + ov::Strides output_strides = { out->nb[2], out->nb[1], out->nb[0] }; + ov::Tensor out_tensor(ov::element::f32, output_shape, out->data, output_strides); + ctx->inferRequest.set_output_tensor(out_tensor); + } + + //run inference + ctx->inferRequest.infer(); + } + catch (const std::exception& error) { + std::cout << "in openvino encode inference execution routine: exception: " << error.what() << std::endl; + return 0; + } + + return 1; +} \ No newline at end of file diff --git a/openvino/whisper-openvino-encoder.h b/openvino/whisper-openvino-encoder.h new file mode 100644 index 00000000000..7c2f6dfc2e0 --- /dev/null +++ b/openvino/whisper-openvino-encoder.h @@ -0,0 +1,31 @@ +// Wrapper of the OpenVINO Whisper Encoder model +// + +#if __cplusplus +extern "C" { +#endif + +struct whisper_openvino_context; + +// initialize openvino encoder, given path to model xml, device ("CPU", "GPU", etc.), and +// path to cache_dir. Returns null upon failure. 
+struct whisper_openvino_context * whisper_openvino_init(const char * path_model, + const char * device, + const char * cache_dir); + +// clean up a ctx previously returned from whisper_openvino_init() +void whisper_openvino_free(struct whisper_openvino_context * ctx); + +struct ggml_tensor; + +// Perform encode using OpenVINO. +// Returns 1 on success +// Returns 0 on failure +int whisper_openvino_encode( + whisper_openvino_context* ctx, + ggml_tensor* mel, + ggml_tensor* out); + +#if __cplusplus +} +#endif diff --git a/whisper.cpp b/whisper.cpp index fb489b38453..19b43b5193f 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -3,6 +3,10 @@ #include "coreml/whisper-encoder.h" #endif +#if WHISPER_USE_OPENVINO +#include "openvino/whisper-openvino-encoder.h" +#endif + #include "ggml.h" #include @@ -660,6 +664,10 @@ struct whisper_state { whisper_coreml_context * ctx_coreml = nullptr; #endif +#ifdef WHISPER_USE_OPENVINO + whisper_openvino_context * ctx_openvino = nullptr; +#endif + // [EXPERIMENTAL] token-level timestamps data int64_t t_beg = 0; int64_t t_last = 0; @@ -1478,7 +1486,13 @@ static bool whisper_encode_internal( const bool use_coreml = wstate.ctx_coreml != nullptr; #endif - if (!use_coreml) { +#ifndef WHISPER_USE_OPENVINO + const bool use_openvino = false; +#else + const bool use_openvino = wstate.ctx_openvino != nullptr; +#endif + + if (!use_coreml && !use_openvino) { // convolution + gelu { wstate.use_buf(ctx0, 1); @@ -1777,8 +1791,7 @@ static bool whisper_encode_internal( } } #ifdef WHISPER_USE_COREML - else - { + else if (use_coreml) { wstate.use_buf(ctx0, -1); cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx); @@ -1786,6 +1799,17 @@ static bool whisper_encode_internal( whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data); } #endif +#ifdef WHISPER_USE_OPENVINO + else if (use_openvino) { + wstate.use_buf(ctx0, -1); + + cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx); + + if 
#ifdef WHISPER_USE_OPENVINO
// Return path_bin without its file extension. Only a dot that appears after
// the last path separator counts as the start of an extension, so a dotted
// directory name (e.g. "/models.v1/ggml-base") is left intact.
static std::string whisper_openvino_path_stem(const std::string & path_bin) {
    const auto sep = path_bin.find_last_of("/\\");
    const auto dot = path_bin.rfind('.');

    const bool has_extension =
        dot != std::string::npos && (sep == std::string::npos || dot > sep);

    return has_extension ? path_bin.substr(0, dot) : path_bin;
}

// replace .bin with -encoder-openvino.xml
static std::string whisper_get_openvino_path_encoder(std::string path_bin) {
    return whisper_openvino_path_stem(path_bin) + "-encoder-openvino.xml";
}

// replace .bin with -encoder-openvino-cache
static std::string whisper_get_openvino_path_cache(std::string path_bin) {
    return whisper_openvino_path_stem(path_bin) + "-encoder-openvino-cache";
}
#endif

// Enable OpenVINO encode inference for the state owned by `ctx`.
//
//   openvino_model_path: encoder IR (.xml) path; when null it is derived from
//                        the ggml model path stored in ctx
//   openvino_device:     OpenVINO device name ("CPU", "GPU", ...)
//   openvino_cache_dir:  cache directory; when null it is derived from the
//                        ggml model path stored in ctx (caching is skipped if
//                        that path is empty)
//
// Returns 1 on success and 0 on failure. Always returns 0 when the build has
// no OpenVINO support.
int whisper_ctx_init_openvino_encoder(struct whisper_context* ctx,
                                      const char* openvino_model_path,
                                      const char* openvino_device,
                                      const char* openvino_cache_dir)
{
#ifndef WHISPER_USE_OPENVINO
    (void)(ctx);
    (void)(openvino_model_path);
    (void)(openvino_device);
    (void)(openvino_cache_dir);
    return 0;
#else
    // The encoder handle lives in ctx->state, so both must exist.
    if (!ctx || !ctx->state) {
        fprintf(stderr, "%s: ctx is nullptr or has no state to attach the OpenVINO encoder to.\n", __func__);
        return 0;
    }

    if (!openvino_model_path && ctx->path_model.empty())
    {
        fprintf(stderr, "%s: openvino_model_path is nullptr, and ctx has no model_path set.\n", __func__);
        return 0;
    }

    // if openvino_model_path is not set, attempt to find it in the same directory as ggml-<model>.bin model
    const std::string path_openvino = openvino_model_path
        ? std::string(openvino_model_path)
        : whisper_get_openvino_path_encoder(ctx->path_model);

    // if openvino_cache_dir is not set, set it as a dir residing next to ggml-<model>.bin.
    // Without a ggml model path there is nothing to derive it from; leave it
    // empty instead of creating "-encoder-openvino-cache" in the working directory.
    std::string path_openvino_cache_dir;
    if (openvino_cache_dir) {
        path_openvino_cache_dir = openvino_cache_dir;
    }
    else if (!ctx->path_model.empty()) {
        path_openvino_cache_dir = whisper_get_openvino_path_cache(ctx->path_model);
    }

    fprintf(stderr, "%s: loading OpenVINO model from '%s'\n", __func__, path_openvino.c_str());
    fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__);

    // Release an encoder left over from an earlier call so it is not leaked.
    if (ctx->state->ctx_openvino) {
        whisper_openvino_free(ctx->state->ctx_openvino);
        ctx->state->ctx_openvino = nullptr;
    }

    ctx->state->ctx_openvino = whisper_openvino_init(
        path_openvino.c_str(),
        openvino_device,
        path_openvino_cache_dir.empty() ? nullptr : path_openvino_cache_dir.c_str());

    if (!ctx->state->ctx_openvino) {
        fprintf(stderr, "%s: failed to init OpenVINO encoder from '%s'\n", __func__, path_openvino.c_str());
        return 0;
    }

    fprintf(stderr, "%s: OpenVINO model loaded\n", __func__);

    return 1;
#endif
}

// Report whether this build was compiled with OpenVINO support (1) or not (0).
static int whisper_has_openvino(void) {
#ifdef WHISPER_USE_OPENVINO
    return 1;
#else
    return 0;
#endif
}
+ // model_path: Optional path to OpenVINO encoder IR model. If set to nullptr, + // the path will be generated from the ggml model path that was passed + // in to whisper_init_from_file. For example, if 'path_model' was + // "/path/to/ggml-base.en.bin", then OpenVINO IR model path will be + // assumed to be "/path/to/ggml-base.en-encoder-openvino.xml". + // device: OpenVINO device to run inference on ("CPU", "GPU", etc.) + // cache_dir: Optional cache directory that can speed up init time, especially for + // GPU, by caching compiled 'blobs' there. + // If set to nullptr, a directory next to the ggml model is used (e.g. "/path/to/ggml-base.en-encoder-openvino-cache"). + // Returns 1 on success and 0 on failure. If OpenVINO is not enabled in build, this + // simply returns 0. + WHISPER_API int whisper_ctx_init_openvino_encoder( + struct whisper_context * ctx, + const char * model_path, + const char * device, + const char * cache_dir); + // Frees all allocated memory WHISPER_API void whisper_free (struct whisper_context * ctx); WHISPER_API void whisper_free_state(struct whisper_state * state);