Skip to content

Commit

Permalink
silero: fix incomplete speech_buffer on END_OF_SPEECH (#898)
Browse files Browse the repository at this point in the history
Co-authored-by: Long Chen <longch1024@gmail.com>
  • Loading branch information
theomonnom and longcw authored Oct 14, 2024
1 parent 3125a34 commit 3eeb5dc
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 7 deletions.
5 changes: 5 additions & 0 deletions .changeset/tricky-parrots-notice.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"livekit-plugins-silero": patch
---

silero: fix speech_buffer for END_OF_SPEECH
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from __future__ import annotations, print_function

import asyncio
- import math
import time
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
Expand Down Expand Up @@ -54,8 +53,8 @@ def load(
cls,
*,
min_speech_duration: float = 0.05,
- min_silence_duration: float = 0.25,
- prefix_padding_duration: float = 0.1,
+ min_silence_duration: float = 0.55,
+ prefix_padding_duration: float = 0.5,
max_buffered_speech: float = 60.0,
activation_threshold: float = 0.5,
sample_rate: Literal[8000, 16000] = 16000,
Expand Down Expand Up @@ -199,13 +198,13 @@ async def _main_task(self):
pub_sample_rate = input_frame.sample_rate

# alloc the buffers now that we know the input sample rate
- pub_prefix_padding_samples = math.ceil(
+ pub_prefix_padding_samples = int(
self._opts.prefix_padding_duration * pub_sample_rate
)

speech_buffer = np.empty(
int(self._opts.max_buffered_speech * pub_sample_rate)
- + int(self._opts.prefix_padding_duration * pub_sample_rate),
+ + pub_prefix_padding_samples,
dtype=np.int16,
)

Expand Down Expand Up @@ -273,7 +272,7 @@ async def _main_task(self):

# copy the inference window to the speech buffer
available_space = len(speech_buffer) - speech_buffer_index
- to_copy_buffer = min(self._model.window_size_samples, available_space)
+ to_copy_buffer = min(to_copy_int, available_space)
if to_copy_buffer > 0:
speech_buffer[
speech_buffer_index : speech_buffer_index + to_copy_buffer
Expand Down
3 changes: 2 additions & 1 deletion tests/test_vad.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
from . import utils

VAD = silero.VAD.load(
- min_speech_duration=0.5, min_silence_duration=0.5, padding_duration=1.0
+ min_speech_duration=0.5,
+ min_silence_duration=0.6,
)


Expand Down

0 comments on commit 3eeb5dc

Please sign in to comment.