From fe3ff6db9492106fc64c5ce2a250b18d535bc6d4 Mon Sep 17 00:00:00 2001
From: Long Chen <longch1024@gmail.com>
Date: Fri, 11 Oct 2024 18:32:04 +0800
Subject: [PATCH 1/9] fix: fix speech_buffer missing data in VADStream

---
 .../livekit-plugins-silero/livekit/plugins/silero/vad.py        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py
index 1c1995e94..3d2ac934d 100644
--- a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py
+++ b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py
@@ -273,7 +273,7 @@ async def _main_task(self):
 
                 # copy the inference window to the speech buffer
                 available_space = len(speech_buffer) - speech_buffer_index
-                to_copy_buffer = min(self._model.window_size_samples, available_space)
+                to_copy_buffer = min(len(input_frame.data), available_space)
                 if to_copy_buffer > 0:
                     speech_buffer[
                         speech_buffer_index : speech_buffer_index + to_copy_buffer

From 7216997e96ed987d25963e30f668fc223f4fed6e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?The=CC=81o=20Monnom?= <theo.monnom@outlook.com>
Date: Fri, 11 Oct 2024 16:27:10 -0700
Subject: [PATCH 2/9] use to_copy_int instead

---
 .../livekit-plugins-silero/livekit/plugins/silero/vad.py  | 8 +++-----
 tests/test_vad.py                                         | 3 ++-
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py
index 3d2ac934d..4fe435605 100644
--- a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py
+++ b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py
@@ -15,7 +15,6 @@
 from __future__ import annotations, print_function
 
 import asyncio
-import math
 import time
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
@@ -55,7 +54,6 @@ def load(
         *,
         min_speech_duration: float = 0.05,
         min_silence_duration: float = 0.25,
-        prefix_padding_duration: float = 0.1,
         max_buffered_speech: float = 60.0,
         activation_threshold: float = 0.5,
         sample_rate: Literal[8000, 16000] = 16000,
@@ -199,13 +197,13 @@ async def _main_task(self):
                 pub_sample_rate = input_frame.sample_rate
 
                 # alloc the buffers now that we know the input sample rate
-                pub_prefix_padding_samples = math.ceil(
+                pub_prefix_padding_samples = int(
                     self._opts.prefix_padding_duration * pub_sample_rate
                 )
 
                 speech_buffer = np.empty(
                     int(self._opts.max_buffered_speech * pub_sample_rate)
-                    + int(self._opts.prefix_padding_duration * pub_sample_rate),
+                    + pub_prefix_padding_samples,
                     dtype=np.int16,
                 )
 
@@ -273,7 +271,7 @@ async def _main_task(self):
 
                 # copy the inference window to the speech buffer
                 available_space = len(speech_buffer) - speech_buffer_index
-                to_copy_buffer = min(len(input_frame.data), available_space)
+                to_copy_buffer = min(to_copy_int, available_space)
                 if to_copy_buffer > 0:
                     speech_buffer[
                         speech_buffer_index : speech_buffer_index + to_copy_buffer
diff --git a/tests/test_vad.py b/tests/test_vad.py
index 940d67a06..d14fef993 100644
--- a/tests/test_vad.py
+++ b/tests/test_vad.py
@@ -4,7 +4,8 @@
 from . import utils
 
 VAD = silero.VAD.load(
-    min_speech_duration=0.5, min_silence_duration=0.5, padding_duration=1.0
+    min_speech_duration=0.5,
+    min_silence_duration=0.5,
 )
 
 

From 6264e2534f21cf322a2f6ad9746b33e9eb102abb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o=20Monnom?= <theo.8bits@gmail.com>
Date: Fri, 11 Oct 2024 16:29:57 -0700
Subject: [PATCH 3/9] Create tricky-parrots-notice.md

---
 .changeset/tricky-parrots-notice.md | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 .changeset/tricky-parrots-notice.md

diff --git a/.changeset/tricky-parrots-notice.md b/.changeset/tricky-parrots-notice.md
new file mode 100644
index 000000000..6ca19c766
--- /dev/null
+++ b/.changeset/tricky-parrots-notice.md
@@ -0,0 +1,5 @@
+---
+"livekit-plugins-silero": patch
+---
+
+silero: fix speech_buffer for END_OF_SPEECH

From 8cdba888fccac41cf51c8750c6d94490dba16d28 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?The=CC=81o=20Monnom?= <theo.monnom@outlook.com>
Date: Fri, 11 Oct 2024 16:36:14 -0700
Subject: [PATCH 4/9] better defaults

---
 .../livekit-plugins-silero/livekit/plugins/silero/vad.py       | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py
index 4fe435605..13e869caf 100644
--- a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py
+++ b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py
@@ -53,7 +53,8 @@ def load(
         cls,
         *,
         min_speech_duration: float = 0.05,
-        min_silence_duration: float = 0.25,
+        min_silence_duration: float = 0.55,
+        prefix_padding_duration: float = 0.5,
         max_buffered_speech: float = 60.0,
         activation_threshold: float = 0.5,
         sample_rate: Literal[8000, 16000] = 16000,

From fae263bcf31b0a35096af7bc7574817039457abb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?The=CC=81o=20Monnom?= <theo.monnom@outlook.com>
Date: Fri, 11 Oct 2024 16:37:18 -0700
Subject: [PATCH 5/9] Update test_vad.py

---
 tests/test_vad.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_vad.py b/tests/test_vad.py
index d14fef993..a655c836b 100644
--- a/tests/test_vad.py
+++ b/tests/test_vad.py
@@ -5,7 +5,7 @@
 
 VAD = silero.VAD.load(
     min_speech_duration=0.5,
-    min_silence_duration=0.5,
+    min_silence_duration=0.6,
 )
 
 

From 341e83d7e0d8b8f6d69f3627e4a92529063743f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?The=CC=81o=20Monnom?= <theo.monnom@outlook.com>
Date: Fri, 11 Oct 2024 18:17:29 -0700
Subject: [PATCH 6/9] Update vad.py

---
 .../livekit/plugins/silero/vad.py             | 153 +++++++++++++-----
 1 file changed, 115 insertions(+), 38 deletions(-)

diff --git a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py
index 13e869caf..14c772160 100644
--- a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py
+++ b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py
@@ -134,6 +134,8 @@ def __init__(
         self._onnx_session = session
         self._opts = opts
 
+        self._streams: list[VADStream] = []
+
     def stream(self) -> "VADStream":
         """
         Create a new VADStream for processing audio data.
@@ -141,12 +143,52 @@ def stream(self) -> "VADStream":
         Returns:
             VADStream: A stream object for processing audio input and detecting speech.
         """
-        return VADStream(
+        stream = VADStream(
             self._opts,
             onnx_model.OnnxModel(
                 onnx_session=self._onnx_session, sample_rate=self._opts.sample_rate
             ),
         )
+        self._streams.append(stream)
+        return stream
+
+    def update_options(
+        self,
+        *,
+        min_speech_duration: float,
+        min_silence_duration: float,
+        prefix_padding_duration: float,
+        max_buffered_speech: float,
+        activation_threshold: float,
+    ) -> None:
+        """
+        Update the VAD options.
+
+        This method allows you to update the VAD options after the VAD object has been created.
+
+        Args:
+            min_speech_duration (float): Minimum duration of speech to start a new speech chunk.
+            min_silence_duration (float): At the end of each speech, wait this duration before ending the speech.
+            prefix_padding_duration (float): Duration of padding to add to the beginning of each speech chunk.
+            max_buffered_speech (float): Maximum duration of speech to keep in the buffer (in seconds).
+            activation_threshold (float): Threshold to consider a frame as speech.
+        """
+        self._opts = _VADOptions(
+            min_speech_duration=min_speech_duration,
+            min_silence_duration=min_silence_duration,
+            prefix_padding_duration=prefix_padding_duration,
+            max_buffered_speech=max_buffered_speech,
+            activation_threshold=activation_threshold,
+            sample_rate=self._opts.sample_rate,
+        )
+        for stream in self._streams:
+            stream.update_options(
+                min_speech_duration=min_speech_duration,
+                min_silence_duration=min_silence_duration,
+                prefix_padding_duration=prefix_padding_duration,
+                max_buffered_speech=max_buffered_speech,
+                activation_threshold=activation_threshold,
+            )
 
 
 class VADStream(agents.vad.VADStream):
@@ -159,15 +201,49 @@ def __init__(self, opts: _VADOptions, model: onnx_model.OnnxModel) -> None:
         self._task.add_done_callback(lambda _: self._executor.shutdown(wait=False))
         self._exp_filter = utils.ExpFilter(alpha=0.35)
 
-        self._extra_inference_time = 0.0
+        self._input_sample_rate = 0
+        self._speech_buffer: np.ndarray | None = None
+        self._speech_buffer_max_reached = False
+        self._prefix_padding_samples = 0  # (input_sample_rate)
+
+    def update_options(
+        self,
+        *,
+        min_speech_duration: float,
+        min_silence_duration: float,
+        prefix_padding_duration: float,
+        max_buffered_speech: float,
+        activation_threshold: float,
+    ) -> None:
+        old_max_buffered_speech = self._opts.max_buffered_speech
+
+        self._opts = _VADOptions(
+            min_speech_duration=min_speech_duration,
+            min_silence_duration=min_silence_duration,
+            prefix_padding_duration=prefix_padding_duration,
+            max_buffered_speech=max_buffered_speech,
+            activation_threshold=activation_threshold,
+            sample_rate=self._opts.sample_rate,
+        )
+
+        if self._input_sample_rate:
+            assert self._speech_buffer is not None
+
+            self._prefix_padding_samples = int(
+                self._opts.prefix_padding_duration * self._input_sample_rate
+            )
+
+            self._speech_buffer.resize(
+                int(self._opts.max_buffered_speech * self._input_sample_rate)
+                + self._prefix_padding_samples
+            )
+
+            if self._opts.max_buffered_speech > old_max_buffered_speech:
+                self._speech_buffer_max_reached = False
 
     @agents.utils.log_exceptions(logger=logger)
     async def _main_task(self):
         inference_f32_data = np.empty(self._model.window_size_samples, dtype=np.float32)
-
-        # a copy is exposed to the user in END_OF_SPEECH
-        speech_buffer: np.ndarray | None = None
-        speech_buffer_max_reached = False
         speech_buffer_index: int = 0
 
         # "pub_" means public, these values are exposed to the users through events
@@ -177,9 +253,6 @@ async def _main_task(self):
         pub_current_sample = 0
         pub_timestamp = 0.0
 
-        pub_sample_rate = 0
-        pub_prefix_padding_samples = 0  # size in samples of padding data
-
         speech_threshold_duration = 0.0
         silence_threshold_duration = 0.0
 
@@ -190,37 +263,41 @@ async def _main_task(self):
         # used to avoid drift when the sample_rate ratio is not an integer
         input_copy_remaining_fract = 0.0
 
+        extra_inference_time = 0.0
+
         async for input_frame in self._input_ch:
             if not isinstance(input_frame, rtc.AudioFrame):
                 continue  # ignore flush sentinel for now
 
-            if not pub_sample_rate or speech_buffer is None:
-                pub_sample_rate = input_frame.sample_rate
+            if not self._input_sample_rate:
+                self._input_sample_rate = input_frame.sample_rate
 
                 # alloc the buffers now that we know the input sample rate
-                pub_prefix_padding_samples = int(
-                    self._opts.prefix_padding_duration * pub_sample_rate
+                self._prefix_padding_samples = int(
+                    self._opts.prefix_padding_duration * self._input_sample_rate
                 )
 
-                speech_buffer = np.empty(
-                    int(self._opts.max_buffered_speech * pub_sample_rate)
-                    + pub_prefix_padding_samples,
+                self._speech_buffer = np.empty(
+                    int(self._opts.max_buffered_speech * self._input_sample_rate)
+                    + self._prefix_padding_samples,
                     dtype=np.int16,
                 )
 
-                if pub_sample_rate != self._opts.sample_rate:
+                if self._input_sample_rate != self._opts.sample_rate:
                     # resampling needed: the input sample rate isn't the same as the model's
                     # sample rate used for inference
                     resampler = rtc.AudioResampler(
-                        input_rate=pub_sample_rate,
+                        input_rate=self._input_sample_rate,
                         output_rate=self._opts.sample_rate,
                         quality=rtc.AudioResamplerQuality.QUICK,  # VAD doesn't need high quality
                     )
 
-            elif pub_sample_rate != input_frame.sample_rate:
+            elif self._input_sample_rate != input_frame.sample_rate:
                 logger.error("a frame with another sample rate was already pushed")
                 continue
 
+            assert self._speech_buffer is not None
+
             input_frames.append(input_frame)
             if resampler is not None:
                 # the resampler may have a bit of latency, but it is OK to ignore since it should be
@@ -262,7 +339,7 @@ async def _main_task(self):
                 pub_current_sample += self._model.window_size_samples
                 pub_timestamp += window_duration
 
-                resampling_ratio = pub_sample_rate / self._model.sample_rate
+                resampling_ratio = self._input_sample_rate / self._model.sample_rate
                 to_copy = (
                     self._model.window_size_samples * resampling_ratio
                     + input_copy_remaining_fract
@@ -271,14 +348,14 @@ async def _main_task(self):
                 input_copy_remaining_fract = to_copy - to_copy_int
 
                 # copy the inference window to the speech buffer
-                available_space = len(speech_buffer) - speech_buffer_index
+                available_space = len(self._speech_buffer) - speech_buffer_index
                 to_copy_buffer = min(to_copy_int, available_space)
                 if to_copy_buffer > 0:
-                    speech_buffer[
+                    self._speech_buffer[
                         speech_buffer_index : speech_buffer_index + to_copy_buffer
                     ] = input_frame.data[:to_copy_buffer]
                     speech_buffer_index += to_copy_buffer
-                elif not speech_buffer_max_reached:
+                elif not self._speech_buffer_max_reached:
                     # reached self._opts.max_buffered_speech (padding is included)
-                    speech_buffer_max_reached = True
+                    self._speech_buffer_max_reached = True
                     logger.warning(
@@ -286,39 +363,39 @@ async def _main_task(self):
                     )
 
                 inference_duration = time.perf_counter() - start_time
-                self._extra_inference_time = max(
+                extra_inference_time = max(
                     0.0,
-                    self._extra_inference_time + inference_duration - window_duration,
+                    extra_inference_time + inference_duration - window_duration,
                 )
                 if inference_duration > SLOW_INFERENCE_THRESHOLD:
                     logger.warning(
                         "inference is slower than realtime",
-                        extra={"delay": self._extra_inference_time},
+                        extra={"delay": extra_inference_time},
                     )
 
                 def _reset_write_cursor():
-                    nonlocal speech_buffer_index, speech_buffer_max_reached
-                    assert speech_buffer is not None
+                    nonlocal speech_buffer_index
+                    assert self._speech_buffer is not None
 
-                    if speech_buffer_index <= pub_prefix_padding_samples:
+                    if speech_buffer_index <= self._prefix_padding_samples:
                         return
 
-                    padding_data = speech_buffer[
+                    padding_data = self._speech_buffer[
                         speech_buffer_index
-                        - pub_prefix_padding_samples : speech_buffer_index
+                        - self._prefix_padding_samples : speech_buffer_index
                     ]
 
-                    speech_buffer[:pub_prefix_padding_samples] = padding_data
-                    speech_buffer_index = pub_prefix_padding_samples
-                    speech_buffer_max_reached = False
+                    self._speech_buffer_max_reached = False
+                    self._speech_buffer[: self._prefix_padding_samples] = padding_data
+                    speech_buffer_index = self._prefix_padding_samples
 
                 def _copy_speech_buffer() -> rtc.AudioFrame:
                     # copy the data from speech_buffer
-                    assert speech_buffer is not None
-                    speech_data = speech_buffer[:speech_buffer_index].tobytes()
+                    assert self._speech_buffer is not None
+                    speech_data = self._speech_buffer[:speech_buffer_index].tobytes()
 
                     return rtc.AudioFrame(
-                        sample_rate=pub_sample_rate,
+                        sample_rate=self._input_sample_rate,
                         num_channels=1,
                         samples_per_channel=speech_buffer_index,
                         data=speech_data,
@@ -341,7 +418,7 @@ def _copy_speech_buffer() -> rtc.AudioFrame:
                         frames=[
                             rtc.AudioFrame(
                                 data=input_frame.data[:to_copy_int].tobytes(),
-                                sample_rate=pub_sample_rate,
+                                sample_rate=self._input_sample_rate,
                                 num_channels=1,
                                 samples_per_channel=to_copy_int,
                             )
@@ -412,7 +489,7 @@ def _copy_speech_buffer() -> rtc.AudioFrame:
                     input_frames.append(
                         rtc.AudioFrame(
                             data=data,
-                            sample_rate=pub_sample_rate,
+                            sample_rate=self._input_sample_rate,
                             num_channels=1,
                             samples_per_channel=len(data) // 2,
                         )

From 54700788b59c5fa63cffe7242376c7bf00cf32d7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?The=CC=81o=20Monnom?= <theo.monnom@outlook.com>
Date: Fri, 11 Oct 2024 18:19:01 -0700
Subject: [PATCH 7/9] docstrings

---
 .../livekit/plugins/silero/vad.py                    | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py
index 14c772160..31a324377 100644
--- a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py
+++ b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py
@@ -215,6 +215,18 @@ def update_options(
         max_buffered_speech: float,
         activation_threshold: float,
     ) -> None:
+        """
+        Update the VAD options.
+
+        This method allows you to update the VAD options after the VAD object has been created.
+
+        Args:
+            min_speech_duration (float): Minimum duration of speech to start a new speech chunk.
+            min_silence_duration (float): At the end of each speech, wait this duration before ending the speech.
+            prefix_padding_duration (float): Duration of padding to add to the beginning of each speech chunk.
+            max_buffered_speech (float): Maximum duration of speech to keep in the buffer (in seconds).
+            activation_threshold (float): Threshold to consider a frame as speech.
+        """
         old_max_buffered_speech = self._opts.max_buffered_speech
 
         self._opts = _VADOptions(

From 565ed491459832aac5392fe6e039a13556955149 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9o=20Monnom?= <theo.8bits@gmail.com>
Date: Fri, 11 Oct 2024 18:19:15 -0700
Subject: [PATCH 8/9] Create shy-ghosts-greet.md

---
 .changeset/shy-ghosts-greet.md | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 .changeset/shy-ghosts-greet.md

diff --git a/.changeset/shy-ghosts-greet.md b/.changeset/shy-ghosts-greet.md
new file mode 100644
index 000000000..1da35e858
--- /dev/null
+++ b/.changeset/shy-ghosts-greet.md
@@ -0,0 +1,5 @@
+---
+"livekit-plugins-silero": patch
+---
+
+silero: add update_options

From 563cdb72665763dc2517aecd885cf7d3eb7f1d39 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?The=CC=81o=20Monnom?= <theo.monnom@outlook.com>
Date: Fri, 11 Oct 2024 18:21:09 -0700
Subject: [PATCH 9/9] optional args

---
 .../livekit/plugins/silero/vad.py             | 46 +++++++++++--------
 1 file changed, 26 insertions(+), 20 deletions(-)

diff --git a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py
index 31a324377..0659ca45d 100644
--- a/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py
+++ b/livekit-plugins/livekit-plugins-silero/livekit/plugins/silero/vad.py
@@ -155,11 +155,11 @@ def stream(self) -> "VADStream":
     def update_options(
         self,
         *,
-        min_speech_duration: float,
-        min_silence_duration: float,
-        prefix_padding_duration: float,
-        max_buffered_speech: float,
-        activation_threshold: float,
+        min_speech_duration: float | None = None,
+        min_silence_duration: float | None = None,
+        prefix_padding_duration: float | None = None,
+        max_buffered_speech: float | None = None,
+        activation_threshold: float | None = None,
     ) -> None:
         """
         Update the VAD options.
@@ -174,11 +174,14 @@ def update_options(
             activation_threshold (float): Threshold to consider a frame as speech.
         """
         self._opts = _VADOptions(
-            min_speech_duration=min_speech_duration,
-            min_silence_duration=min_silence_duration,
-            prefix_padding_duration=prefix_padding_duration,
-            max_buffered_speech=max_buffered_speech,
-            activation_threshold=activation_threshold,
+            min_speech_duration=min_speech_duration or self._opts.min_speech_duration,
+            min_silence_duration=min_silence_duration
+            or self._opts.min_silence_duration,
+            prefix_padding_duration=prefix_padding_duration
+            or self._opts.prefix_padding_duration,
+            max_buffered_speech=max_buffered_speech or self._opts.max_buffered_speech,
+            activation_threshold=activation_threshold
+            or self._opts.activation_threshold,
             sample_rate=self._opts.sample_rate,
         )
         for stream in self._streams:
@@ -209,11 +212,11 @@ def __init__(self, opts: _VADOptions, model: onnx_model.OnnxModel) -> None:
     def update_options(
         self,
         *,
-        min_speech_duration: float,
-        min_silence_duration: float,
-        prefix_padding_duration: float,
-        max_buffered_speech: float,
-        activation_threshold: float,
+        min_speech_duration: float | None = None,
+        min_silence_duration: float | None = None,
+        prefix_padding_duration: float | None = None,
+        max_buffered_speech: float | None = None,
+        activation_threshold: float | None = None,
     ) -> None:
         """
         Update the VAD options.
@@ -230,11 +233,14 @@ def update_options(
         old_max_buffered_speech = self._opts.max_buffered_speech
 
         self._opts = _VADOptions(
-            min_speech_duration=min_speech_duration,
-            min_silence_duration=min_silence_duration,
-            prefix_padding_duration=prefix_padding_duration,
-            max_buffered_speech=max_buffered_speech,
-            activation_threshold=activation_threshold,
+            min_speech_duration=min_speech_duration or self._opts.min_speech_duration,
+            min_silence_duration=min_silence_duration
+            or self._opts.min_silence_duration,
+            prefix_padding_duration=prefix_padding_duration
+            or self._opts.prefix_padding_duration,
+            max_buffered_speech=max_buffered_speech or self._opts.max_buffered_speech,
+            activation_threshold=activation_threshold
+            or self._opts.activation_threshold,
             sample_rate=self._opts.sample_rate,
         )