feat/fallback_stt (#10)

OpenVoiceOS · Apr 23, 2023 · ca4f88e · ca4f88e
1 parent 6d13c82
commit ca4f88e
Show file tree

Hide file tree

Showing 4 changed files with 44 additions and 8 deletions.
diff --git a/README.md b/README.md
@@ -16,7 +16,9 @@ using [ovos-vad-plugin-silero](https://github.com/OpenVoiceOS/ovos-vad-plugin-si
 
 ovos exclusive features:
 
+- fallback STT
 - non-streaming STT support
+- compatible with all existing wake-word/STT plugins
 - continuous listening  (no wakeword, VAD only)
 - hybrid listening  (no wakeword for follow up commands)
 - multiple wakewords

diff --git a/ovos_dinkum_listener/__main__.py b/ovos_dinkum_listener/__main__.py
@@ -36,7 +36,7 @@
 from ovos_utils.log import LOG
 from ovos_utils.sound import play_audio
 
-from ovos_dinkum_listener.plugins import load_stt_module
+from ovos_dinkum_listener.plugins import load_stt_module, load_fallback_stt
 from ovos_dinkum_listener.voice_loop import AlsaMicrophone, DinkumVoiceLoop, ListeningMode, ListeningState
 from ovos_dinkum_listener.voice_loop.hotwords import HotwordContainer
 
@@ -163,14 +163,16 @@ def start(self):
         hotwords.load_hotword_engines()
 
         vad = OVOSVADFactory.create()
-        stt = load_stt_module(self.config, self.bus)
+        stt = load_stt_module()
+        fallback_stt = load_fallback_stt()
 
         transformers = AudioTransformersService(self.bus, self.config)
 
         self.voice_loop = DinkumVoiceLoop(
             mic=mic,
             hotwords=hotwords,
             stt=stt,
+            fallback_stt=fallback_stt,
             vad=vad,
             transformers=transformers,
             #

diff --git a/ovos_dinkum_listener/plugins.py b/ovos_dinkum_listener/plugins.py
@@ -43,9 +43,9 @@ def update(self, chunk: bytes):
 
 
 class FakeStreamingSTT(StreamingSTT):
-    def __init__(self, config=None):
+    def __init__(self, engine, config=None):
         super().__init__(config)
-        self.engine = OVOSSTTFactory.create()
+        self.engine = engine
 
     def create_streaming_thread(self):
         listener = Configuration().get("listener", {})
@@ -54,10 +54,26 @@ def create_streaming_thread(self):
         return FakeStreamThread(self.queue, self.lang, self.engine, sample_rate, sample_width)
 
 
-def load_stt_module(config: Dict[str, Any], bus: MessageBusClient) -> StreamingSTT:
-    stt_config = config["stt"]
+def load_stt_module(config: Dict[str, Any] = None) -> StreamingSTT:
+    stt_config = config or Configuration()["stt"]
     plug = OVOSSTTFactory.create(stt_config)
     if not isinstance(plug, StreamingSTT):
         LOG.debug("Using FakeStreamingSTT wrapper")
-        return FakeStreamingSTT(config)
+        return FakeStreamingSTT(plug, config)
     return plug
+
+
+def load_fallback_stt(cfg: Dict[str, Any] = None) -> StreamingSTT:
+    """Load the optional fallback STT plugin named by ``fallback_module``.
+
+    Args:
+        cfg: the "stt" configuration section. When falsy, the "stt" section
+            of the global ``Configuration()`` is used (empty dict if absent).
+
+    Returns:
+        The plugin created by ``OVOSSTTFactory`` (wrapped in
+        ``FakeStreamingSTT`` when it is not a ``StreamingSTT``), or ``None``
+        when no ``fallback_module`` is configured or loading it failed.
+    """
+    cfg = cfg or Configuration().get("stt", {})
+    fbm = cfg.get("fallback_module")
+    if not fbm:
+        # no fallback configured
+        return None
+    try:
+        # plugin-specific settings live under a key named after the module
+        config = cfg.get(fbm, {})
+        plug = OVOSSTTFactory.create({"stt": {"module": fbm, fbm: config}})
+        if not isinstance(plug, StreamingSTT):
+            LOG.debug("Using FakeStreamingSTT wrapper")
+            return FakeStreamingSTT(plug, config)
+        return plug
+    except Exception:
+        # Best effort: a broken fallback is logged and ignored, but
+        # KeyboardInterrupt / SystemExit are no longer swallowed as they
+        # were with the previous bare ``except:``.
+        LOG.exception("Failed to load fallback STT")
+    return None
diff --git a/ovos_dinkum_listener/voice_loop/voice_loop.py b/ovos_dinkum_listener/voice_loop/voice_loop.py
@@ -54,6 +54,7 @@ class VoiceLoop:
     mic: Microphone
     hotwords: HotwordContainer
     stt: StreamingSTT
+    fallback_stt: StreamingSTT
     vad: VADEngine
     transformers: AudioTransformersService
 
@@ -331,6 +332,8 @@ def _detect_ww(self, chunk):
                 self.timeout_seconds_left = self.timeout_seconds
                 self.stt_audio_bytes = bytes()
                 self.stt.stream_start()
+                if self.fallback_stt is not None:
+                    self.fallback_stt.stream_start()
 
             # Reset the VAD internal state to avoid the model getting
             # into a degenerative state where it always reports silence.
@@ -370,6 +373,8 @@ def _before_cmd(self, chunk):
         while self.stt_chunks:
             stt_chunk = self.stt_chunks.popleft()
             self.stt.stream_data(stt_chunk)
+            if self.fallback_stt is not None:
+                self.fallback_stt.stream_data(stt_chunk)
 
             self.timeout_seconds_left -= self.mic.seconds_per_chunk
             if self.timeout_seconds_left <= 0:
@@ -404,6 +409,8 @@ def _in_cmd(self, chunk):
             stt_chunk = self.stt_chunks.popleft()
 
             self.stt.stream_data(stt_chunk)
+            if self.fallback_stt is not None:
+                self.fallback_stt.stream_data(stt_chunk)
 
             self.timeout_seconds_left -= self.mic.seconds_per_chunk
             if self.timeout_seconds_left <= 0:
@@ -431,7 +438,16 @@ def _after_cmd(self, chunk):
         LOG.debug(f"transformers metadata: {stt_context}")
 
         # get text and trigger callback
-        text = self.stt.stream_stop() or ""
+        try:
+            text = self.stt.stream_stop() or ""
+        except:
+            LOG.exception("STT failed")
+            text = ""
+
+        if not text and self.fallback_stt is not None:
+            LOG.info("Attempting fallback STT plugin")
+            text = self.fallback_stt.stream_stop() or ""
+
         # TODO - some plugins return list of transcripts some just text
         # standardize support for this
         if isinstance(text, list):