From fe3ff6db9492106fc64c5ce2a250b18d535bc6d4 Mon Sep 17 00:00:00 2001 From: Long Chen Date: Fri, 11 Oct 2024 18:32:04 +0800 Subject: [PATCH 1/9] fix: fix speech_buffer missing data in VADStream --- .../livekit-plugins-silero/livekit/plugins/silero/vad.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py index 1c1995e94..3d2ac934d 100644 --- a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py +++ b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py @@ -273,7 +273,7 @@ async def _main_task(self): # copy the inference window to the speech buffer available_space = len(speech_buffer) - speech_buffer_index - to_copy_buffer = min(self._model.window_size_samples, available_space) + to_copy_buffer = min(len(input_frame.data), available_space) if to_copy_buffer > 0: speech_buffer[ speech_buffer_index : speech_buffer_index + to_copy_buffer From 7216997e96ed987d25963e30f668fc223f4fed6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?The=CC=81o=20Monnom?= Date: Fri, 11 Oct 2024 16:27:10 -0700 Subject: [PATCH 2/9] use to_copy_int instead --- .../livekit-plugins-silero/livekit/plugins/silero/vad.py | 8 +++----- tests/test_vad.py | 3 ++- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py index 3d2ac934d..4fe435605 100644 --- a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py +++ b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py @@ -15,7 +15,6 @@ from __future__ import annotations, print_function import asyncio -import math import time from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass @@ -55,7 +54,6 @@ def load( *, min_speech_duration: float = 0.05, min_silence_duration: float = 0.25, - prefix_padding_duration: float = 0.1, max_buffered_speech: float = 60.0, activation_threshold: float = 0.5, sample_rate: Literal[8000, 16000] = 16000, @@ -199,13 +197,13 @@ async def _main_task(self): pub_sample_rate = input_frame.sample_rate # alloc the buffers now that we know the input sample rate - pub_prefix_padding_samples = math.ceil( + pub_prefix_padding_samples = int( self._opts.prefix_padding_duration * pub_sample_rate ) speech_buffer = np.empty( int(self._opts.max_buffered_speech * pub_sample_rate) - + int(self._opts.prefix_padding_duration * pub_sample_rate), + + pub_prefix_padding_samples, dtype=np.int16, ) @@ -273,7 +271,7 @@ async def _main_task(self): # copy the inference window to the speech buffer available_space = len(speech_buffer) - speech_buffer_index - to_copy_buffer = min(len(input_frame.data), available_space) + to_copy_buffer = min(to_copy_int, available_space) if to_copy_buffer > 0: speech_buffer[ speech_buffer_index : speech_buffer_index + to_copy_buffer diff --git a/tests/test_vad.py b/tests/test_vad.py index 940d67a06..d14fef993 100644 --- a/tests/test_vad.py +++ b/tests/test_vad.py @@ -4,7 +4,8 @@ from . import utils VAD = silero.VAD.load( - min_speech_duration=0.5, min_silence_duration=0.5, padding_duration=1.0 + min_speech_duration=0.5, + min_silence_duration=0.5, ) From 6264e2534f21cf322a2f6ad9746b33e9eb102abb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Monnom?= Date: Fri, 11 Oct 2024 16:29:57 -0700 Subject: [PATCH 3/9] Create tricky-parrots-notice.md --- .changeset/tricky-parrots-notice.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changeset/tricky-parrots-notice.md diff --git a/.changeset/tricky-parrots-notice.md b/.changeset/tricky-parrots-notice.md new file mode 100644 index 000000000..6ca19c766 --- /dev/null +++ b/.changeset/tricky-parrots-notice.md @@ -0,0 +1,5 @@ +--- +"livekit-plugins-silero": patch +--- + +silero: fix speech_buffer for END_OF_SPEECH From 8cdba888fccac41cf51c8750c6d94490dba16d28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?The=CC=81o=20Monnom?= Date: Fri, 11 Oct 2024 16:36:14 -0700 Subject: [PATCH 4/9] better defaults --- .../livekit-plugins-silero/livekit/plugins/silero/vad.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py index 4fe435605..13e869caf 100644 --- a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py +++ b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py @@ -53,7 +53,8 @@ def load( cls, *, min_speech_duration: float = 0.05, - min_silence_duration: float = 0.25, + min_silence_duration: float = 0.55, + prefix_padding_duration: float = 0.5, max_buffered_speech: float = 60.0, activation_threshold: float = 0.5, sample_rate: Literal[8000, 16000] = 16000, From fae263bcf31b0a35096af7bc7574817039457abb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?The=CC=81o=20Monnom?= Date: Fri, 11 Oct 2024 16:37:18 -0700 Subject: [PATCH 5/9] Update test_vad.py --- tests/test_vad.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_vad.py b/tests/test_vad.py index d14fef993..a655c836b 100644 --- a/tests/test_vad.py +++ b/tests/test_vad.py @@ -5,7 +5,7 @@ VAD = silero.VAD.load( min_speech_duration=0.5, - min_silence_duration=0.5, + min_silence_duration=0.6, ) From 341e83d7e0d8b8f6d69f3627e4a92529063743f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?The=CC=81o=20Monnom?= Date: Fri, 11 Oct 2024 18:17:29 -0700 Subject: [PATCH 6/9] Update vad.py --- .../livekit/plugins/silero/vad.py | 153 +++++++++++++----- 1 file changed, 115 insertions(+), 38 deletions(-) diff --git a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py index 13e869caf..14c772160 100644 --- a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py +++ b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py @@ -134,6 +134,8 @@ def __init__( self._onnx_session = session self._opts = opts + self._streams: list[VADStream] = [] + def stream(self) -> "VADStream": """ Create a new VADStream for processing audio data. @@ -141,12 +143,52 @@ def stream(self) -> "VADStream": Returns: VADStream: A stream object for processing audio input and detecting speech. """ - return VADStream( + stream = VADStream( self._opts, onnx_model.OnnxModel( onnx_session=self._onnx_session, sample_rate=self._opts.sample_rate ), ) + self._streams.append(stream) + return stream + + def update_options( + self, + *, + min_speech_duration: float, + min_silence_duration: float, + prefix_padding_duration: float, + max_buffered_speech: float, + activation_threshold: float, + ) -> None: + """ + Update the VAD options. + + This method allows you to update the VAD options after the VAD object has been created. + + Args: + min_speech_duration (float): Minimum duration of speech to start a new speech chunk. + min_silence_duration (float): At the end of each speech, wait this duration before ending the speech. + prefix_padding_duration (float): Duration of padding to add to the beginning of each speech chunk. + max_buffered_speech (float): Maximum duration of speech to keep in the buffer (in seconds). + activation_threshold (float): Threshold to consider a frame as speech. + """ + self._opts = _VADOptions( + min_speech_duration=min_speech_duration, + min_silence_duration=min_silence_duration, + prefix_padding_duration=prefix_padding_duration, + max_buffered_speech=max_buffered_speech, + activation_threshold=activation_threshold, + sample_rate=self._opts.sample_rate, + ) + for stream in self._streams: + stream.update_options( + min_speech_duration=min_speech_duration, + min_silence_duration=min_silence_duration, + prefix_padding_duration=prefix_padding_duration, + max_buffered_speech=max_buffered_speech, + activation_threshold=activation_threshold, + ) class VADStream(agents.vad.VADStream): @@ -159,15 +201,49 @@ def __init__(self, opts: _VADOptions, model: onnx_model.OnnxModel) -> None: self._task.add_done_callback(lambda _: self._executor.shutdown(wait=False)) self._exp_filter = utils.ExpFilter(alpha=0.35) - self._extra_inference_time = 0.0 + self._input_sample_rate = 0 + self._speech_buffer: np.ndarray | None = None + self._speech_buffer_max_reached = False + self._prefix_padding_samples = 0 # (input_sample_rate) + + def update_options( + self, + *, + min_speech_duration: float, + min_silence_duration: float, + prefix_padding_duration: float, + max_buffered_speech: float, + activation_threshold: float, + ) -> None: + old_max_buffered_speech = self._opts.max_buffered_speech + + self._opts = _VADOptions( + min_speech_duration=min_speech_duration, + min_silence_duration=min_silence_duration, + prefix_padding_duration=prefix_padding_duration, + max_buffered_speech=max_buffered_speech, + activation_threshold=activation_threshold, + sample_rate=self._opts.sample_rate, + ) + + if self._input_sample_rate: + assert self._speech_buffer is not None + + self._prefix_padding_samples = int( + self._opts.prefix_padding_duration * self._input_sample_rate + ) + + self._speech_buffer.resize( + int(self._opts.max_buffered_speech * self._input_sample_rate) + + self._prefix_padding_samples + ) + + if self._opts.max_buffered_speech > old_max_buffered_speech: + self._speech_buffer_max_reached = False @agents.utils.log_exceptions(logger=logger) async def _main_task(self): inference_f32_data = np.empty(self._model.window_size_samples, dtype=np.float32) - - # a copy is exposed to the user in END_OF_SPEECH - speech_buffer: np.ndarray | None = None - speech_buffer_max_reached = False speech_buffer_index: int = 0 # "pub_" means public, these values are exposed to the users through events @@ -177,9 +253,6 @@ async def _main_task(self): pub_current_sample = 0 pub_timestamp = 0.0 - pub_sample_rate = 0 - pub_prefix_padding_samples = 0 # size in samples of padding data - speech_threshold_duration = 0.0 silence_threshold_duration = 0.0 @@ -190,37 +263,41 @@ async def _main_task(self): # used to avoid drift when the sample_rate ratio is not an integer input_copy_remaining_fract = 0.0 + extra_inference_time = 0.0 + async for input_frame in self._input_ch: if not isinstance(input_frame, rtc.AudioFrame): continue # ignore flush sentinel for now - if not pub_sample_rate or speech_buffer is None: - pub_sample_rate = input_frame.sample_rate + if not self._input_sample_rate: + self._input_sample_rate = input_frame.sample_rate # alloc the buffers now that we know the input sample rate - pub_prefix_padding_samples = int( - self._opts.prefix_padding_duration * pub_sample_rate + self._prefix_padding_samples = int( + self._opts.prefix_padding_duration * self._input_sample_rate ) - speech_buffer = np.empty( - int(self._opts.max_buffered_speech * pub_sample_rate) - + pub_prefix_padding_samples, + self._speech_buffer = np.empty( + int(self._opts.max_buffered_speech * self._input_sample_rate) + + self._prefix_padding_samples, dtype=np.int16, ) - if pub_sample_rate != self._opts.sample_rate: + if self._input_sample_rate != self._opts.sample_rate: # resampling needed: the input sample rate isn't the same as the model's # sample rate used for inference resampler = rtc.AudioResampler( - input_rate=pub_sample_rate, + input_rate=self._input_sample_rate, output_rate=self._opts.sample_rate, quality=rtc.AudioResamplerQuality.QUICK, # VAD doesn't need high quality ) - elif pub_sample_rate != input_frame.sample_rate: + elif self._input_sample_rate != input_frame.sample_rate: logger.error("a frame with another sample rate was already pushed") continue + assert self._speech_buffer is not None + input_frames.append(input_frame) if resampler is not None: # the resampler may have a bit of latency, but it is OK to ignore since it should be @@ -262,7 +339,7 @@ async def _main_task(self): pub_current_sample += self._model.window_size_samples pub_timestamp += window_duration - resampling_ratio = pub_sample_rate / self._model.sample_rate + resampling_ratio = self._input_sample_rate / self._model.sample_rate to_copy = ( self._model.window_size_samples * resampling_ratio + input_copy_remaining_fract @@ -271,14 +348,14 @@ async def _main_task(self): input_copy_remaining_fract = to_copy - to_copy_int # copy the inference window to the speech buffer - available_space = len(speech_buffer) - speech_buffer_index + available_space = len(self._speech_buffer) - speech_buffer_index to_copy_buffer = min(to_copy_int, available_space) if to_copy_buffer > 0: - speech_buffer[ + self._speech_buffer[ speech_buffer_index : speech_buffer_index + to_copy_buffer ] = input_frame.data[:to_copy_buffer] speech_buffer_index += to_copy_buffer - elif not speech_buffer_max_reached: + elif not self._speech_buffer_max_reached: # reached self._opts.max_buffered_speech (padding is included) speech_buffer_max_reached = True logger.warning( @@ -286,39 +363,39 @@ async def _main_task(self): ) inference_duration = time.perf_counter() - start_time - self._extra_inference_time = max( + extra_inference_time = max( 0.0, - self._extra_inference_time + inference_duration - window_duration, + extra_inference_time + inference_duration - window_duration, ) if inference_duration > SLOW_INFERENCE_THRESHOLD: logger.warning( "inference is slower than realtime", - extra={"delay": self._extra_inference_time}, + extra={"delay": extra_inference_time}, ) def _reset_write_cursor(): nonlocal speech_buffer_index, speech_buffer_max_reached - assert speech_buffer is not None + assert self._speech_buffer is not None - if speech_buffer_index <= pub_prefix_padding_samples: + if speech_buffer_index <= self._prefix_padding_samples: return - padding_data = speech_buffer[ + padding_data = self._speech_buffer[ speech_buffer_index - - pub_prefix_padding_samples : speech_buffer_index + - self._prefix_padding_samples : speech_buffer_index ] - speech_buffer[:pub_prefix_padding_samples] = padding_data - speech_buffer_index = pub_prefix_padding_samples - speech_buffer_max_reached = False + self._speech_buffer_max_reached = False + self._speech_buffer[: self._prefix_padding_samples] = padding_data + speech_buffer_index = self._prefix_padding_samples def _copy_speech_buffer() -> rtc.AudioFrame: # copy the data from speech_buffer - assert speech_buffer is not None - speech_data = speech_buffer[:speech_buffer_index].tobytes() + assert self._speech_buffer is not None + speech_data = self._speech_buffer[:speech_buffer_index].tobytes() return rtc.AudioFrame( - sample_rate=pub_sample_rate, + sample_rate=self._input_sample_rate, num_channels=1, samples_per_channel=speech_buffer_index, data=speech_data, @@ -341,7 +418,7 @@ def _copy_speech_buffer() -> rtc.AudioFrame: frames=[ rtc.AudioFrame( data=input_frame.data[:to_copy_int].tobytes(), - sample_rate=pub_sample_rate, + sample_rate=self._input_sample_rate, num_channels=1, samples_per_channel=to_copy_int, ) @@ -412,7 +489,7 @@ def _copy_speech_buffer() -> rtc.AudioFrame: input_frames.append( rtc.AudioFrame( data=data, - sample_rate=pub_sample_rate, + sample_rate=self._input_sample_rate, num_channels=1, samples_per_channel=len(data) // 2, ) From 54700788b59c5fa63cffe7242376c7bf00cf32d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?The=CC=81o=20Monnom?= Date: Fri, 11 Oct 2024 18:19:01 -0700 Subject: [PATCH 7/9] docstrings --- .../livekit/plugins/silero/vad.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py index 14c772160..31a324377 100644 --- a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py +++ b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py @@ -215,6 +215,18 @@ def update_options( max_buffered_speech: float, activation_threshold: float, ) -> None: + """ + Update the VAD options. + + This method allows you to update the VAD options after the VAD object has been created. + + Args: + min_speech_duration (float): Minimum duration of speech to start a new speech chunk. + min_silence_duration (float): At the end of each speech, wait this duration before ending the speech. + prefix_padding_duration (float): Duration of padding to add to the beginning of each speech chunk. + max_buffered_speech (float): Maximum duration of speech to keep in the buffer (in seconds). + activation_threshold (float): Threshold to consider a frame as speech. + """ old_max_buffered_speech = self._opts.max_buffered_speech self._opts = _VADOptions( From 565ed491459832aac5392fe6e039a13556955149 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Monnom?= Date: Fri, 11 Oct 2024 18:19:15 -0700 Subject: [PATCH 8/9] Create shy-ghosts-greet.md --- .changeset/shy-ghosts-greet.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changeset/shy-ghosts-greet.md diff --git a/.changeset/shy-ghosts-greet.md b/.changeset/shy-ghosts-greet.md new file mode 100644 index 000000000..1da35e858 --- /dev/null +++ b/.changeset/shy-ghosts-greet.md @@ -0,0 +1,5 @@ +--- +"livekit-plugins-silero": patch +--- + +silero: add update_options From 563cdb72665763dc2517aecd885cf7d3eb7f1d39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?The=CC=81o=20Monnom?= Date: Fri, 11 Oct 2024 18:21:09 -0700 Subject: [PATCH 9/9] optional args --- .../livekit/plugins/silero/vad.py | 46 +++++++++++-------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py index 31a324377..0659ca45d 100644 --- a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py +++ b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py @@ -155,11 +155,11 @@ def stream(self) -> "VADStream": def update_options( self, *, - min_speech_duration: float, - min_silence_duration: float, - prefix_padding_duration: float, - max_buffered_speech: float, - activation_threshold: float, + min_speech_duration: float | None = None, + min_silence_duration: float | None = None, + prefix_padding_duration: float | None = None, + max_buffered_speech: float | None = None, + activation_threshold: float | None = None, ) -> None: """ Update the VAD options. @@ -174,11 +174,14 @@ def update_options( activation_threshold (float): Threshold to consider a frame as speech. """ self._opts = _VADOptions( - min_speech_duration=min_speech_duration, - min_silence_duration=min_silence_duration, - prefix_padding_duration=prefix_padding_duration, - max_buffered_speech=max_buffered_speech, - activation_threshold=activation_threshold, + min_speech_duration=min_speech_duration or self._opts.min_speech_duration, + min_silence_duration=min_silence_duration + or self._opts.min_silence_duration, + prefix_padding_duration=prefix_padding_duration + or self._opts.prefix_padding_duration, + max_buffered_speech=max_buffered_speech or self._opts.max_buffered_speech, + activation_threshold=activation_threshold + or self._opts.activation_threshold, sample_rate=self._opts.sample_rate, ) for stream in self._streams: @@ -209,11 +212,11 @@ def __init__(self, opts: _VADOptions, model: onnx_model.OnnxModel) -> None: def update_options( self, *, - min_speech_duration: float, - min_silence_duration: float, - prefix_padding_duration: float, - max_buffered_speech: float, - activation_threshold: float, + min_speech_duration: float | None = None, + min_silence_duration: float | None = None, + prefix_padding_duration: float | None = None, + max_buffered_speech: float | None = None, + activation_threshold: float | None = None, ) -> None: """ Update the VAD options. @@ -230,11 +233,14 @@ def update_options( old_max_buffered_speech = self._opts.max_buffered_speech self._opts = _VADOptions( - min_speech_duration=min_speech_duration, - min_silence_duration=min_silence_duration, - prefix_padding_duration=prefix_padding_duration, - max_buffered_speech=max_buffered_speech, - activation_threshold=activation_threshold, + min_speech_duration=min_speech_duration or self._opts.min_speech_duration, + min_silence_duration=min_silence_duration + or self._opts.min_silence_duration, + prefix_padding_duration=prefix_padding_duration + or self._opts.prefix_padding_duration, + max_buffered_speech=max_buffered_speech or self._opts.max_buffered_speech, + activation_threshold=activation_threshold + or self._opts.activation_threshold, sample_rate=self._opts.sample_rate, )