-
Notifications
You must be signed in to change notification settings - Fork 3.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
OpenVINO support #1037
OpenVINO support #1037
Changes from all commits
c352893
93b8be4
58eae32
4bc1ebc
6bfa371
df77368
76c4186
bc5746e
0ed471c
df98287
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
import argparse | ||
import torch | ||
from whisper import load_model | ||
import os | ||
from openvino.tools import mo | ||
from openvino.runtime import serialize | ||
import shutil | ||
|
||
def convert_encoder(hparams, encoder, mname):
    """Export the Whisper encoder to ONNX, then convert it to OpenVINO IR.

    Args:
        hparams: Model dimensions object. Only ``n_mels`` is read; when the
            attribute is missing the previous hard-coded value of 80 is used.
        encoder: The encoder module to export (switched to eval mode here).
        mname: Model name, embedded in the output file name.

    Side effects:
        Serializes the converted model to ``ggml-<mname>-encoder-openvino.xml``
        (path relative to the current working directory). A scratch
        ``onnx_encoder`` directory is created next to this script and removed
        again, even when export or conversion fails.
    """
    encoder.eval()

    # Dummy input used to trace the encoder. The mel-bin count comes from the
    # model itself so that checkpoints with a different n_mels still export
    # with the right input shape.
    # NOTE(review): 3000 is presumably the number of mel frames per window - confirm.
    n_mels = getattr(hparams, "n_mels", 80)
    mel = torch.zeros((1, n_mels, 3000))

    # Directory holding the ONNX model and any collateral written during export.
    onnx_folder = os.path.join(os.path.dirname(__file__), "onnx_encoder")
    os.makedirs(onnx_folder, exist_ok=True)

    try:
        onnx_path = os.path.join(onnx_folder, "whisper_encoder.onnx")

        torch.onnx.export(
            encoder,
            mel,
            onnx_path,
            input_names=["mel"],
            output_names=["output_features"]
        )

        # Use model optimizer to convert ONNX to OpenVINO IR format.
        encoder_model = mo.convert_model(onnx_path, compress_to_fp16=True)
        serialize(encoder_model, xml_path='ggml-' + mname + '-encoder-openvino.xml')
    finally:
        # Best-effort cleanup: never let a failed removal mask the real error.
        shutil.rmtree(onnx_folder, ignore_errors=True)
|
||
|
||
if __name__ == "__main__":
    # Model names this script accepts for --model.
    supported_models = [
        "tiny", "tiny.en",
        "base", "base.en",
        "small", "small.en",
        "medium", "medium.en",
        "large", "large-v1",
    ]

    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--model",
        type=str,
        help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1)",
        required=True,
    )
    options = cli.parse_args()

    # Reject unknown names before attempting to load anything.
    if not (options.model in supported_models):
        raise ValueError("Invalid model name")

    # Load the requested model on the CPU and hand its encoder to the converter.
    model = load_model(options.model).cpu()
    convert_encoder(model.dims, model.encoder, options.model)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
openvino-dev[pytorch,onnx] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we can use |
||
openai-whisper |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
#include "openvino/whisper-openvino-encoder.h" | ||
#include "ggml.h" | ||
#include <openvino/openvino.hpp> | ||
#include <iostream> | ||
|
||
struct whisper_openvino_context { | ||
ov::InferRequest inferRequest; | ||
}; | ||
|
||
struct whisper_openvino_context * whisper_openvino_init(const char* path_model, | ||
const char* device, | ||
const char* cache_dir) | ||
{ | ||
if (!path_model || !device) { | ||
fprintf(stderr, "%s: path_model and/or device is null\n", __func__); | ||
return nullptr; | ||
} | ||
|
||
fprintf(stderr, "%s: path_model = %s, device = %s, cache_dir = %s\n", | ||
__func__, path_model, device, cache_dir ? cache_dir : "(not set)"); | ||
|
||
whisper_openvino_context *context = new whisper_openvino_context; | ||
try { | ||
ov::Core core; | ||
|
||
if (cache_dir) { | ||
// enables caching of device-specific 'blobs' during core.compile_model | ||
// routine. This speeds up calls to compile_model for successive runs. | ||
core.set_property(ov::cache_dir(cache_dir)); | ||
} | ||
|
||
//Read the OpenVINO encoder IR (.xml/.bin) from disk, producing an ov::Model object. | ||
std::shared_ptr<ov::Model> model = core.read_model(path_model); | ||
|
||
// Produce a compiled-model object, given the device ("CPU", "GPU", etc.) | ||
auto compiledModel = core.compile_model(model, device); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you can pass There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Any practical speedup from this change? I'm on OpenVINO 2022.3.1 for device which is EOL'ed. I can compile master and run it with cache: whisper_openvino_init: path_model = models/ggml-base.en-encoder-openvino.xml, device = MYRIAD, cache_dir = models/ggml-base.en-encoder-openvino-cache The speed is on par with CPU/GPU OpenVINO. And it helps RPi to inference on base model. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Probably some yes, but the speedup will be during initialization (i.e. the time it takes to pull the model / cached blob from disk and prep the execution device). |
||
|
||
// From the compiled model object, create an infer request. This is the thing that we | ||
// we will use later on to trigger inference execution. | ||
context->inferRequest = compiledModel.create_infer_request(); | ||
} | ||
catch (const std::exception& error) { | ||
std::cout << "in openvino encoder compile routine: exception: " << error.what() << std::endl; | ||
delete context; | ||
context = nullptr; | ||
} | ||
|
||
return context; | ||
} | ||
|
||
// Release a context obtained from whisper_openvino_init(). Passing null is a no-op.
void whisper_openvino_free(struct whisper_openvino_context * ctx) {
    // delete on a null pointer does nothing, so no explicit guard is required.
    delete ctx;
}
|
||
// Run the OpenVINO encoder.
//
//   ctx - context returned by whisper_openvino_init()
//   mel - 2-D input tensor; its buffer is wrapped (not copied) as the f32 model input
//   out - 2-D output tensor; its buffer is wrapped (not copied) as the f32 model output
//
// Returns 1 on success, 0 on failure (null argument, unexpected n_dims, or an
// exception raised by OpenVINO).
int whisper_openvino_encode(
    whisper_openvino_context* ctx,
    ggml_tensor* mel,
    ggml_tensor* out) {

    if (!ctx || !mel || !out) {
        fprintf(stderr, "%s: Error! ctx / mel / out is null\n", __func__);
        return 0;
    }

    if (mel->n_dims != 2) {
        fprintf(stderr, "%s: Error! mel ggml_tensor expected to have n_dims=2, but it has n_dims=%d\n",
            __func__, mel->n_dims);
        return 0;
    }

    if (out->n_dims != 2) {
        fprintf(stderr, "%s: Error! out ggml_tensor expected to have n_dims=2, but it has n_dims=%d\n",
            __func__, out->n_dims);
        return 0;
    }

    try {

        // Wrap the passed-in mel ggml_tensor as an OpenVINO Tensor object, and set it
        // as the input tensor of the infer request.
        {
            // Note: shape & stride dimensions are populated in the opposite order from
            // how they are listed in the ne / nb arrays, with a leading dimension of 1.
            ov::Shape input_shape = { 1, static_cast<size_t>(mel->ne[1]), static_cast<size_t>(mel->ne[0]) };
            ov::Strides input_strides = { mel->nb[2], mel->nb[1], mel->nb[0] };
            ov::Tensor input_tensor(ov::element::f32, input_shape, mel->data, input_strides);
            ctx->inferRequest.set_input_tensor(input_tensor);
        }

        // Wrap the passed-in out ggml_tensor as an OpenVINO Tensor object, and set it
        // as the output tensor of the infer request.
        {
            // Same reversed ordering as for the input tensor.
            ov::Shape output_shape = { 1, static_cast<size_t>(out->ne[1]), static_cast<size_t>(out->ne[0]) };
            ov::Strides output_strides = { out->nb[2], out->nb[1], out->nb[0] };
            ov::Tensor out_tensor(ov::element::f32, output_shape, out->data, output_strides);
            ctx->inferRequest.set_output_tensor(out_tensor);
        }

        // Run inference.
        ctx->inferRequest.infer();
    }
    catch (const std::exception& error) {
        // Reported on stderr, like every other diagnostic in this function.
        fprintf(stderr, "in openvino encode inference execution routine: exception: %s\n", error.what());
        return 0;
    }

    return 1;
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
// Wrapper of the OpenVINO Whisper Encoder model
//
// The declarations below use explicit `struct` tags so that the header is
// valid in both C and C++ translation units.

#ifndef WHISPER_OPENVINO_ENCODER_H
#define WHISPER_OPENVINO_ENCODER_H

#ifdef __cplusplus
extern "C" {
#endif

// Opaque encoder context; defined in the implementation file.
struct whisper_openvino_context;

// Tensor type, forward-declared here so this header does not need to include its definition.
struct ggml_tensor;

// initialize openvino encoder, given path to model xml, device ("CPU", "GPU", etc.), and
// path to cache_dir. Returns null upon failure.
struct whisper_openvino_context * whisper_openvino_init(const char * path_model,
                                                        const char * device,
                                                        const char * cache_dir);

// clean up a ctx previously returned from whisper_openvino_init()
void whisper_openvino_free(struct whisper_openvino_context * ctx);

// Perform encode using OpenVINO.
// Returns 1 on success
// Returns 0 on failure
int whisper_openvino_encode(
    struct whisper_openvino_context * ctx,
    struct ggml_tensor * mel,
    struct ggml_tensor * out);

#ifdef __cplusplus
}
#endif

#endif // WHISPER_OPENVINO_ENCODER_H
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's not required to export to ONNX before using the model in OpenVINO.
You can call convert_model directly on the in-memory PyTorch object: https://docs.openvino.ai/2023.1/openvino_docs_OV_Converter_UG_prepare_model_convert_model_Convert_Model_From_PyTorch.html