From 3eeb5dc0a4454731e7b0754872fafa33b1b3e83b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o=20Monnom?= <theo.8bits@gmail.com>
Date: Mon, 14 Oct 2024 12:02:32 -0700
Subject: [PATCH] silero: fix incomplete speech_buffer on END_OF_SPEECH (#898)

Co-authored-by: Long Chen <longch1024@gmail.com>
---
 .changeset/tricky-parrots-notice.md                   |  5 +++++
 .../livekit/plugins/silero/vad.py                     | 11 +++++------
 tests/test_vad.py                                     |  3 ++-
 3 files changed, 12 insertions(+), 7 deletions(-)
 create mode 100644 .changeset/tricky-parrots-notice.md

diff --git a/.changeset/tricky-parrots-notice.md b/.changeset/tricky-parrots-notice.md
new file mode 100644
index 000000000..6ca19c766
--- /dev/null
+++ b/.changeset/tricky-parrots-notice.md
@@ -0,0 +1,5 @@
+---
+"livekit-plugins-silero": patch
+---
+
+silero: fix incomplete speech_buffer on END_OF_SPEECH
diff --git a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py
index 1c1995e94..13e869caf 100644
--- a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py
+++ b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py
@@ -15,7 +15,6 @@
 from __future__ import annotations, print_function
 
 import asyncio
-import math
 import time
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
@@ -54,8 +53,8 @@ def load(
         cls,
         *,
         min_speech_duration: float = 0.05,
-        min_silence_duration: float = 0.25,
-        prefix_padding_duration: float = 0.1,
+        min_silence_duration: float = 0.55,
+        prefix_padding_duration: float = 0.5,
         max_buffered_speech: float = 60.0,
         activation_threshold: float = 0.5,
         sample_rate: Literal[8000, 16000] = 16000,
@@ -199,13 +198,13 @@ async def _main_task(self):
                 pub_sample_rate = input_frame.sample_rate
 
                 # alloc the buffers now that we know the input sample rate
-                pub_prefix_padding_samples = math.ceil(
+                pub_prefix_padding_samples = int(
                     self._opts.prefix_padding_duration * pub_sample_rate
                 )
 
                 speech_buffer = np.empty(
                     int(self._opts.max_buffered_speech * pub_sample_rate)
-                    + int(self._opts.prefix_padding_duration * pub_sample_rate),
+                    + pub_prefix_padding_samples,
                     dtype=np.int16,
                 )
 
@@ -273,7 +272,7 @@ async def _main_task(self):
 
                 # copy the inference window to the speech buffer
                 available_space = len(speech_buffer) - speech_buffer_index
-                to_copy_buffer = min(self._model.window_size_samples, available_space)
+                to_copy_buffer = min(to_copy_int, available_space)
                 if to_copy_buffer > 0:
                     speech_buffer[
                         speech_buffer_index : speech_buffer_index + to_copy_buffer
diff --git a/tests/test_vad.py b/tests/test_vad.py
index 940d67a06..a655c836b 100644
--- a/tests/test_vad.py
+++ b/tests/test_vad.py
@@ -4,7 +4,8 @@
 from . import utils
 
 VAD = silero.VAD.load(
-    min_speech_duration=0.5, min_silence_duration=0.5, padding_duration=1.0
+    min_speech_duration=0.5,
+    min_silence_duration=0.6,
 )