From 3eeb5dc0a4454731e7b0754872fafa33b1b3e83b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Monnom?= Date: Mon, 14 Oct 2024 12:02:32 -0700 Subject: [PATCH] silero: fix incomplete speech_buffer on END_OF_SPEECH (#898) Co-authored-by: Long Chen --- .changeset/tricky-parrots-notice.md | 5 +++++ .../livekit/plugins/silero/vad.py | 11 +++++------ tests/test_vad.py | 3 ++- 3 files changed, 12 insertions(+), 7 deletions(-) create mode 100644 .changeset/tricky-parrots-notice.md diff --git a/.changeset/tricky-parrots-notice.md b/.changeset/tricky-parrots-notice.md new file mode 100644 index 000000000..6ca19c766 --- /dev/null +++ b/.changeset/tricky-parrots-notice.md @@ -0,0 +1,5 @@ +--- +"livekit-plugins-silero": patch +--- + +silero: fix speech_buffer for END_OF_SPEECH diff --git a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py index 1c1995e94..13e869caf 100644 --- a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py +++ b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py @@ -15,7 +15,6 @@ from __future__ import annotations, print_function import asyncio -import math import time from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass @@ -54,8 +53,8 @@ def load( cls, *, min_speech_duration: float = 0.05, - min_silence_duration: float = 0.25, - prefix_padding_duration: float = 0.1, + min_silence_duration: float = 0.55, + prefix_padding_duration: float = 0.5, max_buffered_speech: float = 60.0, activation_threshold: float = 0.5, sample_rate: Literal[8000, 16000] = 16000, @@ -199,13 +198,13 @@ async def _main_task(self): pub_sample_rate = input_frame.sample_rate # alloc the buffers now that we know the input sample rate - pub_prefix_padding_samples = math.ceil( + pub_prefix_padding_samples = int( self._opts.prefix_padding_duration * pub_sample_rate ) speech_buffer = np.empty( int(self._opts.max_buffered_speech * pub_sample_rate) - + int(self._opts.prefix_padding_duration * pub_sample_rate), + + pub_prefix_padding_samples, dtype=np.int16, ) @@ -273,7 +272,7 @@ async def _main_task(self): # copy the inference window to the speech buffer available_space = len(speech_buffer) - speech_buffer_index - to_copy_buffer = min(self._model.window_size_samples, available_space) + to_copy_buffer = min(to_copy_int, available_space) if to_copy_buffer > 0: speech_buffer[ speech_buffer_index : speech_buffer_index + to_copy_buffer diff --git a/tests/test_vad.py b/tests/test_vad.py index 940d67a06..a655c836b 100644 --- a/tests/test_vad.py +++ b/tests/test_vad.py @@ -4,7 +4,8 @@ from . import utils VAD = silero.VAD.load( - min_speech_duration=0.5, min_silence_duration=0.5, padding_duration=1.0 + min_speech_duration=0.5, + min_silence_duration=0.6, )