diff --git a/README.md b/README.md index 4f324c0..f1e6875 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,9 @@ using [ovos-vad-plugin-silero](https://github.com/OpenVoiceOS/ovos-vad-plugin-si ovos exclusive features: +- fallback STT - non-streaming STT support +- compatible with all existing wake-word/STT plugins - continuous listening (no wakeword, VAD only) - hybrid listening (no wakeword for follow up commands) - multiple wakewords diff --git a/ovos_dinkum_listener/__main__.py b/ovos_dinkum_listener/__main__.py index 608407a..94e0503 100644 --- a/ovos_dinkum_listener/__main__.py +++ b/ovos_dinkum_listener/__main__.py @@ -36,7 +36,7 @@ from ovos_utils.log import LOG from ovos_utils.sound import play_audio -from ovos_dinkum_listener.plugins import load_stt_module +from ovos_dinkum_listener.plugins import load_stt_module, load_fallback_stt from ovos_dinkum_listener.voice_loop import AlsaMicrophone, DinkumVoiceLoop, ListeningMode, ListeningState from ovos_dinkum_listener.voice_loop.hotwords import HotwordContainer @@ -163,7 +163,8 @@ def start(self): hotwords.load_hotword_engines() vad = OVOSVADFactory.create() - stt = load_stt_module(self.config, self.bus) + stt = load_stt_module() + fallback_stt = load_fallback_stt() transformers = AudioTransformersService(self.bus, self.config) @@ -171,6 +172,7 @@ def start(self): mic=mic, hotwords=hotwords, stt=stt, + fallback_stt=fallback_stt, vad=vad, transformers=transformers, # diff --git a/ovos_dinkum_listener/plugins.py b/ovos_dinkum_listener/plugins.py index b7c0acc..8e11103 100644 --- a/ovos_dinkum_listener/plugins.py +++ b/ovos_dinkum_listener/plugins.py @@ -43,9 +43,9 @@ def update(self, chunk: bytes): class FakeStreamingSTT(StreamingSTT): - def __init__(self, config=None): + def __init__(self, engine, config=None): super().__init__(config) - self.engine = OVOSSTTFactory.create() + self.engine = engine def create_streaming_thread(self): listener = Configuration().get("listener", {}) @@ -54,10 +54,26 @@ def create_streaming_thread(self): return FakeStreamThread(self.queue, self.lang, self.engine, sample_rate, sample_width) -def load_stt_module(config: Dict[str, Any], bus: MessageBusClient) -> StreamingSTT: - stt_config = config["stt"] +def load_stt_module(config: Dict[str, Any] = None) -> StreamingSTT: + stt_config = config or Configuration()["stt"] plug = OVOSSTTFactory.create(stt_config) if not isinstance(plug, StreamingSTT): LOG.debug("Using FakeStreamingSTT wrapper") - return FakeStreamingSTT(config) + return FakeStreamingSTT(plug, config) return plug + + +def load_fallback_stt(cfg: Dict[str, Any] = None) -> StreamingSTT: + cfg = cfg or Configuration().get("stt", {}) + fbm = cfg.get("fallback_module") + if fbm: + try: + config = cfg.get(fbm, {}) + plug = OVOSSTTFactory.create({"stt": {"module": fbm, fbm: config}}) + if not isinstance(plug, StreamingSTT): + LOG.debug("Using FakeStreamingSTT wrapper") + return FakeStreamingSTT(plug, config) + return plug + except: + LOG.exception("Failed to load fallback STT") + return None diff --git a/ovos_dinkum_listener/voice_loop/voice_loop.py b/ovos_dinkum_listener/voice_loop/voice_loop.py index 6c55fba..8811822 100644 --- a/ovos_dinkum_listener/voice_loop/voice_loop.py +++ b/ovos_dinkum_listener/voice_loop/voice_loop.py @@ -54,6 +54,7 @@ class VoiceLoop: mic: Microphone hotwords: HotwordContainer stt: StreamingSTT + fallback_stt: StreamingSTT vad: VADEngine transformers: AudioTransformersService @@ -331,6 +332,8 @@ def _detect_ww(self, chunk): self.timeout_seconds_left = self.timeout_seconds self.stt_audio_bytes = bytes() self.stt.stream_start() + if self.fallback_stt is not None: + self.fallback_stt.stream_start() # Reset the VAD internal state to avoid the model getting # into a degenerative state where it always reports silence. @@ -370,6 +373,8 @@ def _before_cmd(self, chunk): while self.stt_chunks: stt_chunk = self.stt_chunks.popleft() self.stt.stream_data(stt_chunk) + if self.fallback_stt is not None: + self.fallback_stt.stream_data(stt_chunk) self.timeout_seconds_left -= self.mic.seconds_per_chunk if self.timeout_seconds_left <= 0: @@ -404,6 +409,8 @@ def _in_cmd(self, chunk): stt_chunk = self.stt_chunks.popleft() self.stt.stream_data(stt_chunk) + if self.fallback_stt is not None: + self.fallback_stt.stream_data(stt_chunk) self.timeout_seconds_left -= self.mic.seconds_per_chunk if self.timeout_seconds_left <= 0: @@ -431,7 +438,16 @@ def _after_cmd(self, chunk): LOG.debug(f"transformers metadata: {stt_context}") # get text and trigger callback - text = self.stt.stream_stop() or "" + try: + text = self.stt.stream_stop() or "" + except: + LOG.exception("STT failed") + text = "" + + if not text and self.fallback_stt is not None: + LOG.info("Attempting fallback STT plugin") + text = self.fallback_stt.stream_stop() or "" + # TODO - some plugins return list of transcripts some just text # standardize support for this if isinstance(text, list):