From b0ac915265ce60409fef79fbf1d8862fa3929d7d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 5 Mar 2023 11:11:51 +0200 Subject: [PATCH] coreml : use Core ML encoder inference --- .gitignore | 2 + CMakeLists.txt | 68 +++++++++-- Makefile | 44 ++++--- coreml/whisper-encoder-impl.h | 142 +++++++++++++++++++++++ coreml/whisper-encoder-impl.m | 197 ++++++++++++++++++++++++++++++++ coreml/whisper-encoder.h | 22 ++++ coreml/whisper-encoder.mm | 61 ++++++++++ models/download-coreml-model.sh | 82 +++++++++++++ whisper.cpp | 49 +++++++- 9 files changed, 643 insertions(+), 24 deletions(-) create mode 100644 coreml/whisper-encoder-impl.h create mode 100644 coreml/whisper-encoder-impl.m create mode 100644 coreml/whisper-encoder.h create mode 100644 coreml/whisper-encoder.mm create mode 100755 models/download-coreml-model.sh diff --git a/.gitignore b/.gitignore index 2dae32884e6..402ca15264e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ *.o *.a +*.mlmodel +*.mlmodelc .cache/ .vs/ .vscode/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 476ceb8b6ab..422d7f036a1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,6 +54,8 @@ if (APPLE) option(WHISPER_NO_AVX "whisper: disable AVX" OFF) option(WHISPER_NO_AVX2 "whisper: disable AVX2" OFF) option(WHISPER_NO_FMA "whisper: disable FMA" OFF) + + option(WHISPER_COREML "whisper: enable Core ML framework" OFF) else() option(WHISPER_SUPPORT_OPENBLAS "whisper: support for OpenBLAS" OFF) endif() @@ -86,16 +88,33 @@ endif() find_package(Threads REQUIRED) -# on APPLE - include Accelerate framework -if (APPLE AND NOT WHISPER_NO_ACCELERATE) - find_library(ACCELERATE_FRAMEWORK Accelerate) - if (ACCELERATE_FRAMEWORK) - message(STATUS "Accelerate framework found") +# on APPLE +if (APPLE) + # include Accelerate framework + if (NOT WHISPER_NO_ACCELERATE) + find_library(ACCELERATE_FRAMEWORK Accelerate) + + if (ACCELERATE_FRAMEWORK) + message(STATUS "Accelerate framework found") - set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} 
${ACCELERATE_FRAMEWORK}) - set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE) - else() - message(WARNING "Accelerate framework not found") + set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK}) + set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE) + else() + message(WARNING "Accelerate framework not found") + endif() + endif() + + if (WHISPER_COREML) + find_library(FOUNDATION_FRAMEWORK Foundation) + find_library(COREML_FRAMEWORK CoreML) + + if (COREML_FRAMEWORK) + message(STATUS "CoreML framework found") + + set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_COREML) + else() + message(WARNING "CoreML framework not found") + endif() endif() endif() @@ -181,6 +200,33 @@ if (WHISPER_PERF) set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF) endif() +# +# whisper.coreml - Core ML support +# + +if (WHISPER_COREML) + set(TARGET whisper.coreml) + + add_library(${TARGET} + coreml/whisper-encoder.h + coreml/whisper-encoder.mm + coreml/whisper-encoder-impl.h + coreml/whisper-encoder-impl.m + ) + + include(DefaultTargetOptions) + + target_include_directories(${TARGET} PUBLIC + . + ) + + target_link_libraries(${TARGET} PRIVATE ${FOUNDATION_FRAMEWORK} ${COREML_FRAMEWORK}) + + set_target_properties(${TARGET} PROPERTIES + COMPILE_FLAGS "-fobjc-arc" + ) +endif() + # # whisper - this is the main library of the project # @@ -200,6 +246,10 @@ target_include_directories(${TARGET} PUBLIC . 
) +if (WHISPER_COREML) + target_link_libraries(${TARGET} PRIVATE whisper.coreml) +endif() + if (MSVC) target_link_libraries(${TARGET} PRIVATE ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT}) diff --git a/Makefile b/Makefile index 15094a4bd4e..441bfcf58b3 100644 --- a/Makefile +++ b/Makefile @@ -132,6 +132,10 @@ ifndef WHISPER_NO_ACCELERATE LDFLAGS += -framework Accelerate endif endif +ifdef WHISPER_COREML + CXXFLAGS += -DWHISPER_USE_COREML + LDFLAGS += -framework Foundation -framework CoreML +endif ifdef WHISPER_OPENBLAS CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas LDFLAGS += -lopenblas @@ -184,11 +188,23 @@ ggml.o: ggml.c ggml.h whisper.o: whisper.cpp whisper.h $(CXX) $(CXXFLAGS) -c whisper.cpp -o whisper.o -libwhisper.a: ggml.o whisper.o - $(AR) rcs libwhisper.a ggml.o whisper.o +ifndef WHISPER_COREML +WHISPER_OBJ = whisper.o +else +whisper-encoder.o: coreml/whisper-encoder.mm coreml/whisper-encoder.h + $(CXX) -O3 -I . -fobjc-arc -c coreml/whisper-encoder.mm -o whisper-encoder.o + +whisper-encoder-impl.o: coreml/whisper-encoder-impl.m coreml/whisper-encoder-impl.h + $(CXX) -O3 -I .
-fobjc-arc -c coreml/whisper-encoder-impl.m -o whisper-encoder-impl.o + +WHISPER_OBJ = whisper.o whisper-encoder.o whisper-encoder-impl.o +endif + +libwhisper.a: ggml.o $(WHISPER_OBJ) + $(AR) rcs libwhisper.a ggml.o $(WHISPER_OBJ) -libwhisper.so: ggml.o whisper.o - $(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o whisper.o $(LDFLAGS) +libwhisper.so: ggml.o $(WHISPER_OBJ) + $(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS) clean: rm -f *.o main stream command talk bench libwhisper.a libwhisper.so @@ -202,21 +218,21 @@ CC_SDL=`sdl2-config --cflags --libs` SRC_COMMON = examples/common.cpp SRC_COMMON_SDL = examples/common-sdl.cpp -main: examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o - $(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o -o main $(LDFLAGS) +main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) + $(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) -o main $(LDFLAGS) ./main -h -stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o - $(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS) +stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) + $(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS) -command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o - $(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS) +command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) + $(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o command $(CC_SDL) $(LDFLAGS) -talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o - $(CXX) 
$(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o talk $(CC_SDL) $(LDFLAGS) +talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) + $(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o talk $(CC_SDL) $(LDFLAGS) -bench: examples/bench/bench.cpp ggml.o whisper.o - $(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS) +bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) + $(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS) # # Audio samples diff --git a/coreml/whisper-encoder-impl.h b/coreml/whisper-encoder-impl.h new file mode 100644 index 00000000000..9395acb250f --- /dev/null +++ b/coreml/whisper-encoder-impl.h @@ -0,0 +1,142 @@ +// +// CoremlEncoder.h +// +// This file was automatically generated and should not be edited. +// + +#import +#import +#include +#include + +NS_ASSUME_NONNULL_BEGIN + + +/// Model Prediction Input Type +API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden"))) +@interface CoremlEncoderInput : NSObject + +/// melSegment as 1 × 80 × 3000 3-dimensional array of floats +@property (readwrite, nonatomic, strong) MLMultiArray * melSegment; +- (instancetype)init NS_UNAVAILABLE; +- (instancetype)initWithMelSegment:(MLMultiArray *)melSegment NS_DESIGNATED_INITIALIZER; + +@end + + +/// Model Prediction Output Type +API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden"))) +@interface CoremlEncoderOutput : NSObject + +/// output as multidimensional array of floats +@property (readwrite, nonatomic, strong) MLMultiArray * output; +- (instancetype)init NS_UNAVAILABLE; +- (instancetype)initWithOutput:(MLMultiArray *)output NS_DESIGNATED_INITIALIZER; + +@end + + +/// Class for model loading and prediction 
+API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden"))) +@interface CoremlEncoder : NSObject +@property (readonly, nonatomic, nullable) MLModel * model; + +/** + URL of the underlying .mlmodelc directory. +*/ ++ (nullable NSURL *)URLOfModelInThisBundle; + +/** + Initialize CoremlEncoder instance from an existing MLModel object. + + Usually the application does not use this initializer unless it makes a subclass of CoremlEncoder. + Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in. +*/ +- (instancetype)initWithMLModel:(MLModel *)model NS_DESIGNATED_INITIALIZER; + +/** + Initialize CoremlEncoder instance with the model in this bundle. +*/ +- (nullable instancetype)init; + +/** + Initialize CoremlEncoder instance with the model in this bundle. + + @param configuration The model configuration object + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. +*/ +- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Initialize CoremlEncoder instance from the model URL. + + @param modelURL URL to the .mlmodelc directory for CoremlEncoder. + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. +*/ +- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Initialize CoremlEncoder instance from the model URL. + + @param modelURL URL to the .mlmodelc directory for CoremlEncoder. + @param configuration The model configuration object + @param error If an error occurs, upon return contains an NSError object that describes the problem. 
If you are not interested in possible errors, pass in NULL. +*/ +- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Construct CoremlEncoder instance asynchronously with configuration. + Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread. + + @param configuration The model configuration + @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object. +*/ ++ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler API_AVAILABLE(macos(11.0), ios(14.0), watchos(7.0), tvos(14.0)) __attribute__((visibility("hidden"))); + +/** + Construct CoremlEncoder instance asynchronously with URL of .mlmodelc directory and optional configuration. + + Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread. + + @param modelURL The model URL. + @param configuration The model configuration + @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object. 
+*/ ++ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler API_AVAILABLE(macos(11.0), ios(14.0), watchos(7.0), tvos(14.0)) __attribute__((visibility("hidden"))); + +/** + Make a prediction using the standard interface + @param input an instance of CoremlEncoderInput to predict from + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. + @return the prediction as CoremlEncoderOutput +*/ +- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Make a prediction using the standard interface + @param input an instance of CoremlEncoderInput to predict from + @param options prediction options + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. + @return the prediction as CoremlEncoderOutput +*/ +- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Make a prediction using the convenience interface + @param melSegment as 1 × 80 × 3000 3-dimensional array of floats: + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. 
+ @return the prediction as CoremlEncoderOutput +*/ +- (nullable CoremlEncoderOutput *)predictionFromMelSegment:(MLMultiArray *)melSegment error:(NSError * _Nullable __autoreleasing * _Nullable)error; + +/** + Batch prediction + @param inputArray array of CoremlEncoderInput instances to obtain predictions from + @param options prediction options + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. + @return the predictions as NSArray +*/ +- (nullable NSArray *)predictionsFromInputs:(NSArray *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error; +@end + +NS_ASSUME_NONNULL_END diff --git a/coreml/whisper-encoder-impl.m b/coreml/whisper-encoder-impl.m new file mode 100644 index 00000000000..9d3a08b8d0b --- /dev/null +++ b/coreml/whisper-encoder-impl.m @@ -0,0 +1,197 @@ +// +// CoremlEncoder.m +// +// This file was automatically generated and should not be edited. 
+// + +#if !__has_feature(objc_arc) +#error This file must be compiled with automatic reference counting enabled (-fobjc-arc) +#endif + +#import "whisper-encoder-impl.h" + +@implementation CoremlEncoderInput + +- (instancetype)initWithMelSegment:(MLMultiArray *)melSegment { + self = [super init]; + if (self) { + _melSegment = melSegment; + } + return self; +} + +- (NSSet *)featureNames { + return [NSSet setWithArray:@[@"melSegment"]]; +} + +- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName { + if ([featureName isEqualToString:@"melSegment"]) { + return [MLFeatureValue featureValueWithMultiArray:self.melSegment]; + } + return nil; +} + +@end + +@implementation CoremlEncoderOutput + +- (instancetype)initWithOutput:(MLMultiArray *)output { + self = [super init]; + if (self) { + _output = output; + } + return self; +} + +- (NSSet *)featureNames { + return [NSSet setWithArray:@[@"output"]]; +} + +- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName { + if ([featureName isEqualToString:@"output"]) { + return [MLFeatureValue featureValueWithMultiArray:self.output]; + } + return nil; +} + +@end + +@implementation CoremlEncoder + + +/** + URL of the underlying .mlmodelc directory. +*/ ++ (nullable NSURL *)URLOfModelInThisBundle { + NSString *assetPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"CoremlEncoder" ofType:@"mlmodelc"]; + if (nil == assetPath) { os_log_error(OS_LOG_DEFAULT, "Could not load CoremlEncoder.mlmodelc in the bundle resource"); return nil; } + return [NSURL fileURLWithPath:assetPath]; +} + + +/** + Initialize CoremlEncoder instance from an existing MLModel object. + + Usually the application does not use this initializer unless it makes a subclass of CoremlEncoder. + Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in. 
+*/ +- (instancetype)initWithMLModel:(MLModel *)model { + self = [super init]; + if (!self) { return nil; } + _model = model; + if (_model == nil) { return nil; } + return self; +} + + +/** + Initialize CoremlEncoder instance with the model in this bundle. +*/ +- (nullable instancetype)init { + return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle error:nil]; +} + + +/** + Initialize CoremlEncoder instance with the model in this bundle. + + @param configuration The model configuration object + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. +*/ +- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error { + return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle configuration:configuration error:error]; +} + + +/** + Initialize CoremlEncoder instance from the model URL. + + @param modelURL URL to the .mlmodelc directory for CoremlEncoder. + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. +*/ +- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error { + MLModel *model = [MLModel modelWithContentsOfURL:modelURL error:error]; + if (model == nil) { return nil; } + return [self initWithMLModel:model]; +} + + +/** + Initialize CoremlEncoder instance from the model URL. + + @param modelURL URL to the .mlmodelc directory for CoremlEncoder. + @param configuration The model configuration object + @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL. 
+*/ +- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error { + MLModel *model = [MLModel modelWithContentsOfURL:modelURL configuration:configuration error:error]; + if (model == nil) { return nil; } + return [self initWithMLModel:model]; +} + + +/** + Construct CoremlEncoder instance asynchronously with configuration. + Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread. + + @param configuration The model configuration + @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object. +*/ ++ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler { + [self loadContentsOfURL:(NSURL * _Nonnull)[self URLOfModelInThisBundle] + configuration:configuration + completionHandler:handler]; +} + + +/** + Construct CoremlEncoder instance asynchronously with URL of .mlmodelc directory and optional configuration. + + Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread. + + @param modelURL The model URL. + @param configuration The model configuration + @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object. 
+*/ ++ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler { + [MLModel loadContentsOfURL:modelURL + configuration:configuration + completionHandler:^(MLModel *model, NSError *error) { + if (model != nil) { + CoremlEncoder *typedModel = [[CoremlEncoder alloc] initWithMLModel:model]; + handler(typedModel, nil); + } else { + handler(nil, error); + } + }]; +} + +- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error { + return [self predictionFromFeatures:input options:[[MLPredictionOptions alloc] init] error:error]; +} + +- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error { + id outFeatures = [self.model predictionFromFeatures:input options:options error:error]; + if (!outFeatures) { return nil; } + return [[CoremlEncoderOutput alloc] initWithOutput:(MLMultiArray *)[outFeatures featureValueForName:@"output"].multiArrayValue]; +} + +- (nullable CoremlEncoderOutput *)predictionFromMelSegment:(MLMultiArray *)melSegment error:(NSError * _Nullable __autoreleasing * _Nullable)error { + CoremlEncoderInput *input_ = [[CoremlEncoderInput alloc] initWithMelSegment:melSegment]; + return [self predictionFromFeatures:input_ error:error]; +} + +- (nullable NSArray *)predictionsFromInputs:(NSArray *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error { + id inBatch = [[MLArrayBatchProvider alloc] initWithFeatureProviderArray:inputArray]; + id outBatch = [self.model predictionsFromBatch:inBatch options:options error:error]; + if (!outBatch) { return nil; } + NSMutableArray *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count]; + for (NSInteger i = 0; 
i < outBatch.count; i++) { + id resultProvider = [outBatch featuresAtIndex:i]; + CoremlEncoderOutput * result = [[CoremlEncoderOutput alloc] initWithOutput:(MLMultiArray *)[resultProvider featureValueForName:@"output"].multiArrayValue]; + [results addObject:result]; + } + return results; +} + +@end
diff --git a/coreml/whisper-encoder.h b/coreml/whisper-encoder.h
new file mode 100644
index 00000000000..84bbe416505
--- /dev/null
+++ b/coreml/whisper-encoder.h
@@ -0,0 +1,26 @@
+// Wrapper of the Core ML Whisper Encoder model
+//
+// Code is derived from the work of Github user @wangchou
+// ref: https://github.com/wangchou/callCoreMLFromCpp
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct whisper_coreml_context;
+
+// Load the compiled Core ML model at path_model; returns NULL on failure.
+struct whisper_coreml_context * whisper_coreml_init(const char * path_model);
+
+// Release a context created by whisper_coreml_init(); a NULL ctx is ignored.
+void whisper_coreml_free(struct whisper_coreml_context * ctx);
+
+// Run the encoder on mel (1 x 80 x 3000 floats) and copy the result into out.
+void whisper_coreml_encode(
+        const struct whisper_coreml_context * ctx,
+                                    float * mel,
+                                    float * out);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/coreml/whisper-encoder.mm b/coreml/whisper-encoder.mm
new file mode 100644
index 00000000000..09091c2003c
--- /dev/null
+++ b/coreml/whisper-encoder.mm
@@ -0,0 +1,92 @@
+#import "coreml/whisper-encoder.h"
+#import "coreml/whisper-encoder-impl.h"
+
+#import <CoreML/CoreML.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// opaque handle: data is a retained (+1) reference to the CoremlEncoder instance
+struct whisper_coreml_context {
+    const void * data;
+};
+
+// Load the compiled Core ML model found at path_model.
+// Returns NULL if path_model is NULL / not valid UTF-8 or the model fails to load.
+struct whisper_coreml_context * whisper_coreml_init(const char * path_model) {
+    if (path_model == NULL) {
+        return NULL;
+    }
+
+    NSString * path_model_str = [[NSString alloc] initWithUTF8String:path_model];
+    if (path_model_str == nil) {
+        return NULL;
+    }
+
+    NSURL * url_model = [NSURL fileURLWithPath: path_model_str];
+
+    // CFBridgingRetain(nil) is NULL, so a failed load is detected right below
+    const void * data = CFBridgingRetain([[CoremlEncoder alloc] initWithContentsOfURL:url_model error:nil]);
+
+    if (data == NULL) {
+        return NULL;
+    }
+
+    whisper_coreml_context * ctx = new whisper_coreml_context;
+
+    ctx->data = data;
+
+    return ctx;
+}
+
+// Release the model reference and the context. A NULL ctx is ignored.
+void whisper_coreml_free(struct whisper_coreml_context * ctx) {
+    if (ctx == NULL) {
+        return;
+    }
+
+    CFRelease(ctx->data);
+    delete ctx;
+}
+
+// Run the encoder on mel, presented to Core ML as a 1 x 80 x 3000 float32 array,
+// and copy the model output into out. If inference fails, out is left untouched.
+// NOTE(review): the copy assumes the model output is float32 - confirm against the model
+void whisper_coreml_encode(
+        const struct whisper_coreml_context * ctx,
+                                    float * mel,
+                                    float * out) {
+    NSError * error = nil;
+
+    MLMultiArray * inMultiArray = [
+        [MLMultiArray alloc] initWithDataPointer: mel
+                                           shape: @[@1, @80, @3000]
+                                        dataType: MLMultiArrayDataTypeFloat32
+                                         strides: @[@(240000), @(3000), @1]
+                                     deallocator: nil
+                                           error: &error
+    ];
+
+    CoremlEncoderOutput * outCoreML = nil;
+    if (inMultiArray != nil) {
+        outCoreML = [(__bridge id) ctx->data predictionFromMelSegment:inMultiArray error:&error];
+    }
+
+    MLMultiArray * outMA = outCoreML.output;
+    if (outMA == nil) {
+        fprintf(stderr, "%s: Core ML encoder inference failed: %s\n", __func__,
+                error ? error.localizedDescription.UTF8String : "unknown error");
+        return;
+    }
+
+    memcpy(out, outMA.dataPointer, outMA.count * sizeof(float));
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/models/download-coreml-model.sh b/models/download-coreml-model.sh new file mode 100755 index 00000000000..d46789d7c06 --- /dev/null +++ b/models/download-coreml-model.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +# This script downloads Whisper model files that have already been converted to Core ML format. +# This way you don't have to convert them yourself. + +src="https://huggingface.co/datasets/ggerganov/whisper.cpp-coreml" +pfx="resolve/main/ggml" + +# get the path of this script +function get_script_path() { + if [ -x "$(command -v realpath)" ]; then + echo "$(dirname $(realpath $0))" + else + local ret="$(cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P)" + echo "$ret" + fi +} + +models_path="$(get_script_path)" + +# Whisper models +models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" ) + +# list available models +function list_models { + printf "\n" + printf " Available models:" + for model in "${models[@]}"; do + printf " $model" + done + printf "\n\n" +} + +if [ "$#" -ne 1 ]; then + printf "Usage: $0 \n" + list_models + + exit 1 +fi + +model=$1 + +if [[ !
" ${models[@]} " =~ " ${model} " ]]; then + printf "Invalid model: $model\n" + list_models + + exit 1 +fi + +# download Core ML model + +printf "Downloading Core ML model $model from '$src' ...\n" + +cd $models_path + +if [ -f "ggml-$model.mlmodel" ]; then + printf "Model $model already exists. Skipping download.\n" + exit 0 +fi + +if [ -x "$(command -v wget)" ]; then + wget --quiet --show-progress -O ggml-$model.mlmodel $src/$pfx-$model.mlmodel +elif [ -x "$(command -v curl)" ]; then + curl -L --output ggml-$model.mlmodel $src/$pfx-$model.mlmodel +else + printf "Either wget or curl is required to download models.\n" + exit 1 +fi + + +if [ $? -ne 0 ]; then + printf "Failed to download Core ML model $model \n" + printf "Please try again later or download the original Whisper model files and convert them yourself.\n" + exit 1 +fi + +printf "Done! Model '$model' saved in 'models/ggml-$model.mlmodel'\n" +printf "Run the following command to compile it:\n\n" +printf " $ xcrun coremlc compile ./models/ggml-$model.mlmodel ./models\n\n" +printf "You can now use it like this:\n\n" +printf " $ ./main -m models/ggml-$model.bin -f samples/jfk.wav\n" +printf "\n"
diff --git a/whisper.cpp b/whisper.cpp
index 3a21581c682..ab242d70eeb 100644
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -1,5 +1,8 @@
 #define WHISPER_BUILD
 #include "whisper.h"
+#ifdef WHISPER_USE_COREML
+#include "coreml/whisper-encoder.h"
+#endif
 
 #include "ggml.h"
 
@@ -594,6 +597,11 @@ struct whisper_context {
 
     int lang_id = 0; // english by default
 
+    std::string path_model; // populated by whisper_init_from_file()
+#ifdef WHISPER_USE_COREML
+    whisper_coreml_context * ctx_coreml = nullptr; // stays null unless whisper_init_from_file() loads a Core ML model
+#endif
+
     // [EXPERIMENTAL] token-level timestamps data
     int64_t t_beg = 0;
     int64_t t_last = 0;
@@ -1696,6 +1704,12 @@ static bool whisper_encode(
     wctx.use_buf(ctx0, -1);
 
     // run the computation
+#ifdef WHISPER_USE_COREML
+    // use Core ML when a model was loaded, otherwise fall back to the ggml graph
+    if (wctx.ctx_coreml) {
+        whisper_coreml_encode(wctx.ctx_coreml, (float *) mel->data, (float *) cur->data);
+    } else
+#endif
     {
         struct ggml_cgraph gf = {};
         gf.n_threads = n_threads;
@@ -2507,6 +2521,22 @@ static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, cons
 // interface implementation
 //
 
+#ifdef WHISPER_USE_COREML
+// derive the Core ML model path from the ggml model path by replacing the
+// extension of the file name (e.g. ".bin") with ".mlmodelc"; dots that belong
+// to a directory component are left untouched
+static std::string whisper_get_coreml_path(std::string path_bin) {
+    const auto pos = path_bin.rfind('.');
+    if (pos != std::string::npos && path_bin.find_first_of("/\\", pos) == std::string::npos) {
+        path_bin = path_bin.substr(0, pos);
+    }
+
+    path_bin += ".mlmodelc";
+
+    return path_bin;
+}
+#endif
+
 struct whisper_context * whisper_init_from_file(const char * path_model) {
     whisper_model_loader loader = {};
 
@@ -2519,6 +2549,7 @@ struct whisper_context * whisper_init_from_file(const char * path_model) {
     }
 
     loader.context = &fin;
+
     loader.read = [](void * ctx, void * output, size_t read_size) {
         std::ifstream * fin = (std::ifstream*)ctx;
         fin->read((char *)output, read_size);
@@ -2535,7 +2566,24 @@ struct whisper_context * whisper_init_from_file(const char * path_model) {
         fin->close();
     };
 
-    return whisper_init(&loader);
+    auto ctx = whisper_init(&loader);
+
+    if (ctx) {
+        ctx->path_model = path_model;
+#ifdef WHISPER_USE_COREML
+        const auto path_coreml = whisper_get_coreml_path(ctx->path_model);
+        fprintf(stderr, "%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
+
+        ctx->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
+        if (!ctx->ctx_coreml) {
+            fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
+            whisper_free(ctx); // do not leak the already-initialized ggml context
+            return nullptr;
+        }
+#endif
+    }
+
+    return ctx;
 }
 
 struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size) {
@@ -2607,6 +2655,12 @@ void whisper_free(struct whisper_context * ctx) {
                 ggml_free(ctx->decoders[i].kv_self.ctx);
             }
         }
+#ifdef WHISPER_USE_COREML
+        if (ctx->ctx_coreml) {
+            whisper_coreml_free(ctx->ctx_coreml);
+            ctx->ctx_coreml = nullptr;
+        }
+#endif
         delete ctx;
     }
 }