Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat/neon_transformers #3

Merged
merged 4 commits into from
Apr 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,17 @@ using [ovos-vad-plugin-silero](https://github.com/OpenVoiceOS/ovos-vad-plugin-si

ovos exclusive features:

- sleep mode
- continuous listening (no wakeword, VAD only)
- hybrid listening (no wakeword for follow up commands)
- multiple wakewords
- assign a STT lang per wakeword (multilingual support)
- hotword types (perform actions other than listen)
- sleep mode (no stt -> no accidental activations)
- recording mode (save speech to file instead of STT)
- OPM bus api (query available plugins)
- wake word upload (backend)
- sample upload (DatasetApi ovos-backend-client)
- XDG path standards for recorded audio data
- [neon-transformers](https://github.com/NeonGeckoCom/neon-transformers) support

## Usage

Expand Down
60 changes: 30 additions & 30 deletions ovos_dinkum_listener/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from typing import List, Optional

import sdnotify
from ovos_dinkum_listener.transformers import AudioTransformersService
from ovos_backend_client.api import DatasetApi
from ovos_bus_client import Message, MessageBusClient
from ovos_bus_client.session import SessionManager
Expand Down Expand Up @@ -164,11 +165,14 @@ def start(self):
vad = OVOSVADFactory.create()
stt = load_stt_module(self.config, self.bus)

transformers = AudioTransformersService(self.bus, self.config)

self.voice_loop = DinkumVoiceLoop(
mic=mic,
hotwords=hotwords,
stt=stt,
vad=vad,
transformers=transformers,
#
speech_seconds=listener.get("speech_begin", 0.3),
silence_seconds=listener.get("silence_end", 0.7),
Expand Down Expand Up @@ -286,7 +290,7 @@ def _save_ww(self, audio_bytes, ww_meta, save_path=None):
hotword_audio_dir = Path(f"{self.default_save_path}/wake_words")
hotword_audio_dir.mkdir(parents=True, exist_ok=True)

metafile = self._compile_ww_metadata(ww_meta["key_phrase"], ww_meta["module"])
metafile = self._compile_ww_context(ww_meta["key_phrase"], ww_meta["module"])
# TODO - do we need to keep this convention? i don't think so...
# move to the standard ww_id + timestamp from OPM
filename = '_'.join(str(metafile[k]) for k in sorted(metafile))
Expand Down Expand Up @@ -319,7 +323,7 @@ def upload(wav_data, metadata):
Thread(target=upload, daemon=True, args=(wav_data, metadata)).start()

@staticmethod
def _compile_ww_metadata(key_phrase, ww_module):
def _compile_ww_context(key_phrase, ww_module):
""" creates metadata in the format expected by selene
while this format is mostly deprecated we want to
ensure backwards compat and no missing keys"""
Expand All @@ -333,25 +337,25 @@ def _compile_ww_metadata(key_phrase, ww_module):
'model': str(model_hash)
}

def _hotword_audio(self, audio_bytes: bytes, ww_metadata: dict):
payload = ww_metadata
def _hotword_audio(self, audio_bytes: bytes, ww_context: dict):
payload = ww_context
context = {'client_name': 'ovos_dinkum_listener',
'source': 'audio', # default native audio source
'destination': ["skills"]}
stt_lang = ww_metadata.get("lang")
stt_lang = ww_context.get("lang")
if stt_lang:
context["lang"] = stt_lang

try:
listener = self.config["listener"]
if listener["record_wake_words"]:
payload["filename"] = self._save_ww(audio_bytes, ww_metadata)
payload["filename"] = self._save_ww(audio_bytes, ww_context)

upload_disabled = listener.get('wake_word_upload', {}).get('disable')
if self.config['opt_in'] and not upload_disabled:
self._upload_hotword(audio_bytes, ww_metadata)
self._upload_hotword(audio_bytes, ww_context)

utterance = ww_metadata.get("utterance")
utterance = ww_context.get("utterance")
if utterance:
LOG.debug("Hotword utterance: " + utterance)
# send the transcribed word on for processing
Expand All @@ -364,9 +368,9 @@ def _hotword_audio(self, audio_bytes: bytes, ww_metadata: dict):

# If enabled, play a wave file with a short sound to audibly
# indicate hotword was detected.
sound = ww_metadata.get("sound")
listen = ww_metadata.get("listen")
event = ww_metadata.get("event")
sound = ww_context.get("sound")
listen = ww_context.get("listen")
event = ww_context.get("event")

if sound:
try:
Expand All @@ -378,13 +382,13 @@ def _hotword_audio(self, audio_bytes: bytes, ww_metadata: dict):

if listen:
msg_type = "recognizer_loop:wakeword"
payload["utterance"] = ww_metadata["key_phrase"].replace("_", " ").replace("-", " ")
payload["utterance"] = ww_context["key_phrase"].replace("_", " ").replace("-", " ")
elif event:
msg_type = event
else:
if ww_metadata.get("wakeup"):
if ww_context.get("wakeup"):
wordtype = "wakeupword"
elif ww_metadata.get("stop"):
elif ww_context.get("stop"):
wordtype = "stopword"
else:
wordtype = "hotword"
Expand All @@ -397,25 +401,21 @@ def _hotword_audio(self, audio_bytes: bytes, ww_metadata: dict):
LOG.exception("Error while saving STT audio")
return payload

def _stt_text(self, text: str, stt_metadata: dict):
def _stt_text(self, text: str, stt_context: dict):
if isinstance(text, list):
text = text[0]

LOG.debug("Record end")

context = {'client_name': 'ovos_dinkum_listener',
'source': 'audio', # default native audio source
'destination': ["skills"]}
self.bus.emit(Message("recognizer_loop:record_end",
context=context))
context=stt_context))

# Report utterance to intent service
if text:
payload = stt_metadata
payload = stt_context
payload["utterances"] = [text]
self.bus.emit(Message("recognizer_loop:utterance", payload, context))
self.bus.emit(Message("recognizer_loop:utterance", payload, stt_context))
else:
self.bus.emit(Message("recognizer_loop:speech.recognition.unknown", context=context))
self.bus.emit(Message("recognizer_loop:speech.recognition.unknown", context=stt_context))

LOG.debug(f"STT: {text}")

Expand Down Expand Up @@ -457,17 +457,17 @@ def upload(wav_data, metadata):

Thread(target=upload, daemon=True, args=(wav_data, metadata)).start()

def _stt_audio(self, audio_bytes: bytes, stt_metadata: dict):
def _stt_audio(self, audio_bytes: bytes, stt_context: dict):
try:
listener = self.config["listener"]
if listener["save_utterances"]:
stt_metadata["filename"] = self._save_stt(audio_bytes, stt_metadata)
stt_context["filename"] = self._save_stt(audio_bytes, stt_context)
upload_disabled = listener.get('stt_upload', {}).get('disable')
if self.config['opt_in'] and not upload_disabled:
self._upload_stt(audio_bytes, stt_metadata)
self._upload_stt(audio_bytes, stt_context)
except Exception:
LOG.exception("Error while saving STT audio")
return stt_metadata
return stt_context

def _save_recording(self, audio_bytes, stt_meta, save_path=None):
LOG.info("Saving Recording")
Expand All @@ -493,12 +493,12 @@ def _save_recording(self, audio_bytes, stt_meta, save_path=None):
LOG.debug(f"Wrote {wav_path}")
return f"file://{wav_path.absolute()}"

def _recording_audio(self, audio_bytes: bytes, stt_metadata: dict):
def _recording_audio(self, audio_bytes: bytes, stt_context: dict):
try:
stt_metadata["filename"] = self._save_recording(audio_bytes, stt_metadata)
stt_context["filename"] = self._save_recording(audio_bytes, stt_context)
except Exception:
LOG.exception("Error while saving recording audio")
return stt_metadata
return stt_context

# mic bus api
def _handle_mute(self, _message: Message):
Expand Down
103 changes: 103 additions & 0 deletions ovos_dinkum_listener/transformers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# NEON AI (TM) SOFTWARE, Software Development Kit & Application Framework
# All trademark and other rights reserved by their respective owners
# Copyright 2008-2022 Neongecko.com Inc.
# Contributors: Daniel McKnight, Guy Daniels, Elon Gasper, Richard Leeds,
# Regina Bloomstine, Casimiro Ferreira, Andrii Pernatii, Kirill Hrymailo
# BSD-3 License
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from ovos_plugin_manager.audio_transformers import find_audio_transformer_plugins
from ovos_utils.json_helper import merge_dict
from ovos_utils.log import LOG


class AudioTransformersService:
    """Loads and runs audio transformer plugins for the listener loop.

    Audio transformer plugins are discovered via OPM
    (``find_audio_transformer_plugins``) and receive raw audio chunks from
    the voice loop; they may modify the audio and/or contribute message
    context that is attached to the final utterance message.

    A plugin is only loaded if it has an entry under the
    ``audio_transformers`` section of the core configuration, and that
    entry does not set ``"active": false``.
    """

    def __init__(self, bus, config=None):
        """
        Args:
            bus: connected message bus client, kept for plugin use
            config: core configuration dict; plugin configs are read from
                its ``audio_transformers`` section (defaults to empty)
        """
        self.config_core = config or {}
        self.loaded_modules = {}
        self.has_loaded = False
        self.bus = bus
        # to activate a plugin, just add an entry to mycroft.conf for it
        self.config = self.config_core.get("audio_transformers") or {
            # "ovos_audio_transformer_xxx_plugin": {}
        }
        self.load_plugins()

    def load_plugins(self):
        """Instantiate every installed plugin that is enabled in config.

        A plugin that raises on construction is logged and skipped so one
        broken plugin cannot prevent the others from loading.
        """
        for plug_name, plug in find_audio_transformer_plugins().items():
            if plug_name not in self.config:
                # only explicitly configured plugins are loaded
                continue
            if not self.config[plug_name].get("active", True):
                # entry present but disabled
                continue
            try:
                self.loaded_modules[plug_name] = plug()
                LOG.info(f"loaded audio transformer plugin: {plug_name}")
            except Exception:
                LOG.exception(f"Failed to load audio transformer plugin: {plug_name}")

    @property
    def modules(self):
        """
        Return loaded transformers in priority order, such that modules with a
        higher `priority` rank are called first and changes from lower ranked
        transformers are applied last.

        A plugin of `priority` 1 will override any existing context keys and
        will be the last to modify `audio_data`
        """
        return sorted(self.loaded_modules.values(),
                      key=lambda k: k.priority, reverse=True)

    def shutdown(self):
        """Shut down all loaded plugins, best effort.

        Failures are logged but never propagated, so one misbehaving plugin
        cannot block listener shutdown.
        """
        for module in self.modules:
            try:
                module.shutdown()
            except Exception:
                # was a bare except; narrowed so SystemExit/KeyboardInterrupt
                # still propagate, and the failure is at least visible in logs
                LOG.exception(f"Error shutting down audio transformer: {module}")

    def feed_audio(self, chunk):
        """Forward a raw (pre-wakeword) audio chunk to every plugin."""
        for module in self.modules:
            module.feed_audio_chunk(chunk)

    def feed_hotword(self, chunk):
        """Forward a hotword audio chunk to every plugin."""
        for module in self.modules:
            module.feed_hotword_chunk(chunk)

    def feed_speech(self, chunk):
        """Forward a speech (post-wakeword) audio chunk to every plugin."""
        for module in self.modules:
            module.feed_speech_chunk(chunk)

    def transform(self, chunk):
        """Run `chunk` through all plugins and collect message context.

        Args:
            chunk: audio data for the captured utterance

        Returns:
            tuple of (possibly transformed chunk, merged context dict);
            higher-priority plugins run first, so lower-priority plugins
            applied later can override earlier context keys
        """
        context = {'client_name': 'ovos_dinkum_listener',
                   'source': 'audio',  # default native audio source
                   'destination': ["skills"]}
        for module in self.modules:
            try:
                chunk = module.feed_speech_utterance(chunk)
                chunk, data = module.transform(chunk)
                LOG.debug(f"{module.name}: {data}")
                context = merge_dict(context, data)
            except Exception:
                # was a silent bare except; a broken plugin should not kill
                # the pipeline, but the failure must be visible in logs
                LOG.exception(f"Audio transformer plugin failure: {module}")
        return chunk, context
Loading