ggerganov · artshcherbina · Dec 16, 2023 · Dec 16, 2023 · Dec 16, 2023 · Dec 16, 2023
diff --git a/examples/common-sdl.cpp b/examples/common-sdl.cpp
@@ -169,7 +169,7 @@ void audio_async::callback(uint8_t * stream, int len) {
     }
 }
 
-void audio_async::get(int ms, std::vector<float> & result) {
+void audio_async::get(int ms, int step, std::vector<float> & result) {
     if (!m_dev_id_in) {
         fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
         return;
@@ -196,7 +196,7 @@ void audio_async::get(int ms, std::vector<float> & result) {
 
         result.resize(n_samples);
 
-        int s0 = m_audio_pos - n_samples;
+        int s0 = (step > 0 ? m_audio_pos / step * step : m_audio_pos) - n_samples; // window end snapped down to a multiple of step; step <= 0 disables snapping (and avoids a division by zero)
         if (s0 < 0) {
             s0 += m_audio.size();
         }

diff --git a/examples/common-sdl.h b/examples/common-sdl.h
@@ -29,7 +29,11 @@ class audio_async {
     void callback(uint8_t * stream, int len);
 
     // get audio data from the circular buffer
-    void get(int ms, std::vector<float> & audio);
+    // the end of the returned window is the current position rounded down to a multiple of `step` samples
+    void get(int ms, int step, std::vector<float> & audio);
+
+    // overload kept for callers that still use the two-argument form: step = 1, i.e. no rounding
+    void get(int ms, std::vector<float> & audio) { get(ms, 1, audio); }
 
 private:
     SDL_AudioDeviceID m_dev_id_in = 0;

diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp
@@ -32,7 +32,9 @@ std::string to_timestamp(int64_t t) {
 struct whisper_params {
     int32_t n_threads  = std::min(4, (int32_t) std::thread::hardware_concurrency());
     int32_t step_ms    = 3000;
-    int32_t length_ms  = 10000;
+    int32_t pressure_t = 1000; // --pressure-t: factor applied to |sample| when the loudness level is computed
+    int32_t silence_t  = 1000; // --silence-t: silence time in ms
+    int32_t length_ms  = 30000;
     int32_t keep_ms    = 200;
     int32_t capture_id = -1;
     int32_t max_tokens = 32;
@@ -68,6 +70,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
         }
         else if (arg == "-t"    || arg == "--threads")       { params.n_threads     = std::stoi(argv[++i]); }
         else if (                  arg == "--step")          { params.step_ms       = std::stoi(argv[++i]); }
+        else if (                  arg == "--pressure-t")    { params.pressure_t    = std::stoi(argv[++i]); }
+        else if (                  arg == "--silence-t")     { params.silence_t     = std::stoi(argv[++i]); }
         else if (                  arg == "--length")        { params.length_ms     = std::stoi(argv[++i]); }
         else if (                  arg == "--keep")          { params.keep_ms       = std::stoi(argv[++i]); }
         else if (arg == "-c"    || arg == "--capture")       { params.capture_id    = std::stoi(argv[++i]); }
@@ -105,6 +109,8 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, "  -h,       --help          [default] show this help message and exit\n");
     fprintf(stderr, "  -t N,     --threads N     [%-7d] number of threads to use during computation\n",    params.n_threads);
     fprintf(stderr, "            --step N        [%-7d] audio step size in milliseconds\n",                params.step_ms);
+    fprintf(stderr, "            --pressure-t N  [%-7d] pressure threshold\n",                             params.pressure_t);
+    fprintf(stderr, "            --silence-t N   [%-7d] silence time, ms\n",                               params.silence_t);
     fprintf(stderr, "            --length N      [%-7d] audio length in milliseconds\n",                   params.length_ms);
     fprintf(stderr, "            --keep N        [%-7d] audio to keep from previous step in ms\n",         params.keep_ms);
     fprintf(stderr, "  -c ID,    --capture ID    [%-7d] capture device ID\n",                              params.capture_id);
@@ -250,41 +256,55 @@ int main(int argc, char ** argv) {
         }
 
         // process new audio
-
         if (!use_vad) {
-            while (true) {
-                audio.get(params.step_ms, pcmf32_new);
+            const int STEP = 2000; // chunk size in samples; also the alignment passed to audio.get()
+            audio.get(params.length_ms, STEP, pcmf32_new);
+
+            // loudness per STEP-sized chunk; chunk 0 holds the newest samples (the buffer is scanned backwards)
+            int start = -1; // offset from the end of the buffer of the newest loud chunk (-1: none found)
+            int end = 0;    // offset from the end of the buffer of the oldest loud chunk seen before the scan stopped
+            std::vector<float> averages(pcmf32_new.size() / STEP, 0);
+            for (int i = 0; i < int(averages.size() * STEP); i++)
+            {
+                averages[i / STEP] += fabs(pcmf32_new[pcmf32_new.size() - 1 - i]) * params.pressure_t;
+            }
 
-                if ((int) pcmf32_new.size() > 2*n_samples_step) {
-                    fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__);
-                    audio.clear();
-                    continue;
+            const int SOUND_FREQUENCY = WHISPER_SAMPLE_RATE; // same sample-rate constant the VAD branch uses
+            const int silenceDuration = SOUND_FREQUENCY * params.silence_t / 1000; // silence_t (ms) expressed in samples
+            for (int i = 0; i < (int) averages.size(); i++)
+            {
+                const int level = std::min(9, int(averages[i] / STEP)); // capped at 9 so it prints as one character
+                if (level >= 2) { printf("%d", level); } else { printf("-"); } // level meter, literal format strings only
+                if (level >= 2)
+                {
+                    if (start < 0)
+                    {
+                        start = i * STEP;
+                    }
+                    end = i * STEP;
                 }
-
-                if ((int) pcmf32_new.size() >= n_samples_step) {
-                    audio.clear();
+                else if (i * STEP >= end + silenceDuration) // quiet for silence_t past the last loud chunk: stop scanning
+                {
                     break;
                 }
-
-                std::this_thread::sleep_for(std::chrono::milliseconds(1));
             }
 
-            const int n_samples_new = pcmf32_new.size();
-
-            // take up to params.length_ms audio from previous iteration
-            const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_keep + n_samples_len - n_samples_new));
-
-            //printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
-
-            pcmf32.resize(n_samples_new + n_samples_take);
-
-            for (int i = 0; i < n_samples_take; i++) {
-                pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i];
+            printf("\n");
+            if (start >= silenceDuration) // the newest sound is at least silence_t old, so the utterance has ended
+            {
+                start = std::max(start - SOUND_FREQUENCY / 2, 0); // half a second of margin on each side
+                end = std::min(end + SOUND_FREQUENCY / 2, (int)pcmf32_new.size() - 1);
+                pcmf32.resize(end - start);
+                for (int i = start; i < end; i++) {
+                    pcmf32[pcmf32.size() - 1 - (i - start)] = pcmf32_new[pcmf32_new.size() - 1 - i];
+                }
+                if ((int) pcmf32.size() < 3 * SOUND_FREQUENCY)
+                {
+                    pcmf32.resize(3 * SOUND_FREQUENCY, 0.f); // pad with zeros up to 3 s; a push_back loop bounded by the growing size stops half-way
+                }
             }
-
-            memcpy(pcmf32.data() + n_samples_take, pcmf32_new.data(), n_samples_new*sizeof(float));
-
-            pcmf32_old = pcmf32;
+            else
+                pcmf32.clear(); // no finished utterance: leave nothing to transcribe
         } else {
             const auto t_now  = std::chrono::high_resolution_clock::now();
             const auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count();
@@ -295,10 +315,10 @@ int main(int argc, char ** argv) {
                 continue;
             }
 
-            audio.get(2000, pcmf32_new);
+            audio.get(2000, 1, pcmf32_new); // step = 1: the end of the window is not aligned
 
             if (::vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
-                audio.get(params.length_ms, pcmf32);
+                audio.get(params.length_ms, 1, pcmf32); // step = 1: the end of the window is not aligned
             } else {
                 std::this_thread::sleep_for(std::chrono::milliseconds(100));
 
@@ -309,6 +329,7 @@ int main(int argc, char ** argv) {
         }
 
         // run the inference
+        if (pcmf32.size()) // skip inference when the capture step left pcmf32 empty
         {
             whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
 
@@ -334,6 +355,8 @@ int main(int argc, char ** argv) {
             wparams.prompt_tokens    = params.no_context ? nullptr : prompt_tokens.data();
             wparams.prompt_n_tokens  = params.no_context ? 0       : prompt_tokens.size();
 
+            auto start = std::chrono::system_clock::now(); // taken right before whisper_full(); used to print the elapsed time with each segment
+
             if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
                 fprintf(stderr, "%s: failed to process audio\n", argv[0]);
                 return 6;
@@ -359,10 +382,20 @@ int main(int argc, char ** argv) {
 
                 const int n_segments = whisper_full_n_segments(ctx);
                 for (int i = 0; i < n_segments; ++i) {
-                    const char * text = whisper_full_get_segment_text(ctx, i);
+                    std::string text = whisper_full_get_segment_text(ctx, i);
 
                     if (params.no_timestamps) {
-                        printf("%s", text);
+                        if (!text.empty() && text[0] == ' ') { // Remove initial space
+                            text.erase(0, 1);
+                        }
+                        if (!text.empty()) { text[0] = (char) std::toupper((unsigned char) text[0]); } // Make the first character uppercase (the result has to be stored back)
+                        if (!text.empty() && text.back() == '.') { // Remove trailing dot
+                            text.pop_back();
+                        }
+                        printf("[%3.1fs] <%s>\n", std::chrono::duration<float>(std::chrono::system_clock::now() - start).count(), text.c_str()); // seconds, independent of the clock's tick period
+                        std::string quoted = "'"; // single-quote the text so the shell never interprets quotes, $(...) or backticks in it
+                        for (const char c : text) { if (c == '\'') { quoted += "'\\''"; } else { quoted += c; } }
+                        if (system(("printf %s " + quoted + "' | xclip -selection clipboard").c_str()) != 0) { fprintf(stderr, "%s: failed to copy the text to the clipboard\n", argv[0]); }
                         fflush(stdout);
 
                         if (params.fname_out.length() > 0) {
@@ -420,8 +453,11 @@ int main(int argc, char ** argv) {
                     }
                 }
             }
+
             fflush(stdout);
         }
+        else
+            std::this_thread::sleep_for(std::chrono::milliseconds(100)); // pause 100 ms before the next loop iteration instead of spinning
     }
 
     audio.pause();

diff --git a/whisper.cpp b/whisper.cpp
@@ -5006,7 +5006,7 @@ int whisper_full_with_state(
         state->lang_id = lang_id;
         params.language = whisper_lang_str(lang_id);
 
-        WHISPER_LOG_INFO("%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
+        // NOTE(review): log disabled by this patch, which silences it for every caller of this function - confirm this is intended: WHISPER_LOG_INFO("%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
         if (params.detect_language) {
             return 0;
         }