silero: fix incomplete speech_buffer on END_OF_SPEECH (#898)

Co-authored-by: Long Chen <longch1024@gmail.com>
livekit · Oct 14, 2024 · 3eeb5dc
1 parent 3125a34
commit 3eeb5dc
Show file tree

Hide file tree

Showing 3 changed files with 12 additions and 7 deletions.
diff --git a/.changeset/tricky-parrots-notice.md b/.changeset/tricky-parrots-notice.md
@@ -0,0 +1,5 @@
+---
+"livekit-plugins-silero": patch
+---
+
+silero: fix speech_buffer for END_OF_SPEECH
diff --git a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py
@@ -15,7 +15,6 @@
 from __future__ import annotations, print_function
 
 import asyncio
-import math
 import time
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
@@ -54,8 +53,8 @@ def load(
         cls,
         *,
         min_speech_duration: float = 0.05,
-        min_silence_duration: float = 0.25,
-        prefix_padding_duration: float = 0.1,
+        min_silence_duration: float = 0.55,
+        prefix_padding_duration: float = 0.5,
         max_buffered_speech: float = 60.0,
         activation_threshold: float = 0.5,
         sample_rate: Literal[8000, 16000] = 16000,
@@ -199,13 +198,13 @@ async def _main_task(self):
                 pub_sample_rate = input_frame.sample_rate
 
                 # alloc the buffers now that we know the input sample rate
-                pub_prefix_padding_samples = math.ceil(
+                pub_prefix_padding_samples = int(
                     self._opts.prefix_padding_duration * pub_sample_rate
                 )
 
                 speech_buffer = np.empty(
                     int(self._opts.max_buffered_speech * pub_sample_rate)
-                    + int(self._opts.prefix_padding_duration * pub_sample_rate),
+                    + pub_prefix_padding_samples,
                     dtype=np.int16,
                 )
 
@@ -273,7 +272,7 @@ async def _main_task(self):
 
                 # copy the inference window to the speech buffer
                 available_space = len(speech_buffer) - speech_buffer_index
-                to_copy_buffer = min(self._model.window_size_samples, available_space)
+                to_copy_buffer = min(to_copy_int, available_space)
                 if to_copy_buffer > 0:
                     speech_buffer[
                         speech_buffer_index : speech_buffer_index + to_copy_buffer

diff --git a/tests/test_vad.py b/tests/test_vad.py
@@ -4,7 +4,8 @@
 from . import utils
 
 VAD = silero.VAD.load(
-    min_speech_duration=0.5, min_silence_duration=0.5, padding_duration=1.0
+    min_speech_duration=0.5,
+    min_silence_duration=0.6,
 )