Main perf issue solved. Transcoding now a sliding window, with max 15…

…s. No longer an exponential increase in transcode time as the recording gets longer! Still need to: (1) figure out how to do long-running transcription — the transcoder only outputs the last 15s, so we lose old content; (2) move the float16 audio buffer into a never-ending circular buffer.
scosman · Dec 15, 2023 · 339f575 · 339f575
1 parent f65c8ec
commit 339f575
Showing 1 changed file with 23 additions and 8 deletions.
diff --git a/ios/voicebox/Whisper/VBAudioListener.m b/ios/voicebox/Whisper/VBAudioListener.m
@@ -15,7 +15,8 @@
 #define NUM_BYTES_PER_BUFFER 16 * 1024
 
 #define NUM_BUFFERS 3
-#define MAX_AUDIO_SEC 60
+#define MAX_AUDIO_SEC 120
+#define MAX_TRANSCRIBE_AUDIO_SEC 15
 #define SAMPLE_RATE WHISPER_SAMPLE_RATE
 
 struct whisper_context;
@@ -181,7 +182,7 @@ - (void)startListening
         stateInp.audioBufferI16 = malloc(MAX_AUDIO_SEC * SAMPLE_RATE * sizeof(int16_t));
     }
     if (!stateInp.audioBufferF32) {
-        stateInp.audioBufferF32 = malloc(MAX_AUDIO_SEC * SAMPLE_RATE * sizeof(float));
+        stateInp.audioBufferF32 = malloc(MAX_TRANSCRIBE_AUDIO_SEC * SAMPLE_RATE * sizeof(float));
     }
 
     stateInp.isTranscribing = false;
@@ -262,12 +263,24 @@ - (IBAction)onTranscribe
     stateInp.isTranscribing = true;
 
     // dispatch the model to a background thread
-    dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_HIGH, 0), ^{
+    // TODO: really default? High was helping
+    dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
+        // Transcribe window: last 15s (max) gets slid to latest.
+        float totalTime = self->stateInp.n_samples / SAMPLE_RATE;
+        float offsetStartTime = MAX(totalTime - 15.0, 0);
+        int offsetStartMs = offsetStartTime*1000;
+        int sampleWindowSize = MIN(MAX_TRANSCRIBE_AUDIO_SEC * SAMPLE_RATE, self->stateInp.n_samples);
+        int sampleStartOffset = self->stateInp.n_samples - sampleWindowSize;
+        NSLog(@"Transcribe offset start time: %f, %d\nTranscribe total samples: %d\nTranscribe sample window size: %d\nTranscribe sample offset: %d", offsetStartTime, offsetStartMs, self->stateInp.n_samples, sampleWindowSize, sampleStartOffset);
+
         // process captured audio
         // convert I16 to F32
-        NSLog(@"Transcribing: %d", self->stateInp.n_samples);
-        for (int i = 0; i < self->stateInp.n_samples; i++) {
+        //NSLog(@"Transcribing: %d samples available", self->stateInp.n_samples);
+        /*for (int i = 0; i < self->stateInp.n_samples; i++) {
             self->stateInp.audioBufferF32[i] = (float)self->stateInp.audioBufferI16[i] / 32768.0f;
+        }*/
+        for (int i = 0; i < sampleWindowSize; i++) {
+            self->stateInp.audioBufferF32[i] = (float)self->stateInp.audioBufferI16[sampleStartOffset+i] / 32768.0f;
         }
 
         // run the model
@@ -276,6 +289,7 @@ - (IBAction)onTranscribe
         // get maximum number of threads on this device (max 8)
         const int max_threads = MIN(8, (int)[[NSProcessInfo processInfo] processorCount]);
 
+
         params.print_realtime = true;
         params.print_progress = false;
         params.print_timestamps = true;
@@ -285,16 +299,17 @@ - (IBAction)onTranscribe
         params.suppress_non_speech_tokens = true;
         params.suppress_blank = true;
         params.n_threads = max_threads;
-        // TODO: think we're processing whole thing each time?
         params.offset_ms = 0;
         params.no_context = true;
-        params.single_segment = false;
+        //params.single_segment = false;
+        params.single_segment   = true; // true for realtime
+        params.no_timestamps    = params.single_segment;
 
         CFTimeInterval startTime = CACurrentMediaTime();
 
         whisper_reset_timings(self->stateInp.ctx);
 
-        int whisperStatus = whisper_full(self->stateInp.ctx, params, self->stateInp.audioBufferF32, self->stateInp.n_samples);
+        int whisperStatus = whisper_full(self->stateInp.ctx, params, self->stateInp.audioBufferF32, sampleWindowSize);
         if (whisperStatus != 0) {
             NSLog(@"Failed to run the model");
             [self distributeStateUpdate:false segments:nil];