Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat/neon_transformers #3

Merged
merged 4 commits into from
Apr 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,17 @@ using [ovos-vad-plugin-silero](https://github.com/OpenVoiceOS/ovos-vad-plugin-si

ovos exclusive features:

- sleep mode
- continuous listening (no wakeword, VAD only)
- hybrid listening (no wakeword for follow up commands)
- multiple wakewords
- assign a STT lang per wakeword (multilingual support)
- hotword types (perform actions other than listen)
- sleep mode (no stt -> no accidental activations)
- recording mode (save speech to file instead of STT)
- OPM bus api (query available plugins)
- wake word upload (backend)
- sample upload (DatasetApi ovos-backend-client)
- XDG path standards for recorded audio data
- [neon-transformers](https://github.com/NeonGeckoCom/neon-transformers) support

## Usage

Expand Down
60 changes: 30 additions & 30 deletions ovos_dinkum_listener/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from typing import List, Optional

import sdnotify
from ovos_dinkum_listener.transformers import AudioTransformersService
from ovos_backend_client.api import DatasetApi
from ovos_bus_client import Message, MessageBusClient
from ovos_bus_client.session import SessionManager
Expand Down Expand Up @@ -164,11 +165,14 @@ def start(self):
vad = OVOSVADFactory.create()
stt = load_stt_module(self.config, self.bus)

transformers = AudioTransformersService(self.bus, self.config)

self.voice_loop = DinkumVoiceLoop(
mic=mic,
hotwords=hotwords,
stt=stt,
vad=vad,
transformers=transformers,
#
speech_seconds=listener.get("speech_begin", 0.3),
silence_seconds=listener.get("silence_end", 0.7),
Expand Down Expand Up @@ -286,7 +290,7 @@ def _save_ww(self, audio_bytes, ww_meta, save_path=None):
hotword_audio_dir = Path(f"{self.default_save_path}/wake_words")
hotword_audio_dir.mkdir(parents=True, exist_ok=True)

metafile = self._compile_ww_metadata(ww_meta["key_phrase"], ww_meta["module"])
metafile = self._compile_ww_context(ww_meta["key_phrase"], ww_meta["module"])
# TODO - do we need to keep this convention? i don't think so...
# move to the standard ww_id + timestamp from OPM
filename = '_'.join(str(metafile[k]) for k in sorted(metafile))
Expand Down Expand Up @@ -319,7 +323,7 @@ def upload(wav_data, metadata):
Thread(target=upload, daemon=True, args=(wav_data, metadata)).start()

@staticmethod
def _compile_ww_metadata(key_phrase, ww_module):
def _compile_ww_context(key_phrase, ww_module):
""" creates metadata in the format expected by selene
while this format is mostly deprecated we want to
ensure backwards compat and no missing keys"""
Expand All @@ -333,25 +337,25 @@ def _compile_ww_metadata(key_phrase, ww_module):
'model': str(model_hash)
}

def _hotword_audio(self, audio_bytes: bytes, ww_metadata: dict):
payload = ww_metadata
def _hotword_audio(self, audio_bytes: bytes, ww_context: dict):
payload = ww_context
context = {'client_name': 'ovos_dinkum_listener',
'source': 'audio', # default native audio source
'destination': ["skills"]}
stt_lang = ww_metadata.get("lang")
stt_lang = ww_context.get("lang")
if stt_lang:
context["lang"] = stt_lang

try:
listener = self.config["listener"]
if listener["record_wake_words"]:
payload["filename"] = self._save_ww(audio_bytes, ww_metadata)
payload["filename"] = self._save_ww(audio_bytes, ww_context)

upload_disabled = listener.get('wake_word_upload', {}).get('disable')
if self.config['opt_in'] and not upload_disabled:
self._upload_hotword(audio_bytes, ww_metadata)
self._upload_hotword(audio_bytes, ww_context)

utterance = ww_metadata.get("utterance")
utterance = ww_context.get("utterance")
if utterance:
LOG.debug("Hotword utterance: " + utterance)
# send the transcribed word on for processing
Expand All @@ -364,9 +368,9 @@ def _hotword_audio(self, audio_bytes: bytes, ww_metadata: dict):

# If enabled, play a wave file with a short sound to audibly
# indicate hotword was detected.
sound = ww_metadata.get("sound")
listen = ww_metadata.get("listen")
event = ww_metadata.get("event")
sound = ww_context.get("sound")
listen = ww_context.get("listen")
event = ww_context.get("event")

if sound:
try:
Expand All @@ -378,13 +382,13 @@ def _hotword_audio(self, audio_bytes: bytes, ww_metadata: dict):

if listen:
msg_type = "recognizer_loop:wakeword"
payload["utterance"] = ww_metadata["key_phrase"].replace("_", " ").replace("-", " ")
payload["utterance"] = ww_context["key_phrase"].replace("_", " ").replace("-", " ")
elif event:
msg_type = event
else:
if ww_metadata.get("wakeup"):
if ww_context.get("wakeup"):
wordtype = "wakeupword"
elif ww_metadata.get("stop"):
elif ww_context.get("stop"):
wordtype = "stopword"
else:
wordtype = "hotword"
Expand All @@ -397,25 +401,21 @@ def _hotword_audio(self, audio_bytes: bytes, ww_metadata: dict):
LOG.exception("Error while saving STT audio")
return payload

def _stt_text(self, text: str, stt_metadata: dict):
def _stt_text(self, text: str, stt_context: dict):
if isinstance(text, list):
text = text[0]

LOG.debug("Record end")

context = {'client_name': 'ovos_dinkum_listener',
'source': 'audio', # default native audio source
'destination': ["skills"]}
self.bus.emit(Message("recognizer_loop:record_end",
context=context))
context=stt_context))

# Report utterance to intent service
if text:
payload = stt_metadata
payload = stt_context
payload["utterances"] = [text]
self.bus.emit(Message("recognizer_loop:utterance", payload, context))
self.bus.emit(Message("recognizer_loop:utterance", payload, stt_context))
else:
self.bus.emit(Message("recognizer_loop:speech.recognition.unknown", context=context))
self.bus.emit(Message("recognizer_loop:speech.recognition.unknown", context=stt_context))

LOG.debug(f"STT: {text}")

Expand Down Expand Up @@ -457,17 +457,17 @@ def upload(wav_data, metadata):

Thread(target=upload, daemon=True, args=(wav_data, metadata)).start()

def _stt_audio(self, audio_bytes: bytes, stt_metadata: dict):
def _stt_audio(self, audio_bytes: bytes, stt_context: dict):
try:
listener = self.config["listener"]
if listener["save_utterances"]:
stt_metadata["filename"] = self._save_stt(audio_bytes, stt_metadata)
stt_context["filename"] = self._save_stt(audio_bytes, stt_context)
upload_disabled = listener.get('stt_upload', {}).get('disable')
if self.config['opt_in'] and not upload_disabled:
self._upload_stt(audio_bytes, stt_metadata)
self._upload_stt(audio_bytes, stt_context)
except Exception:
LOG.exception("Error while saving STT audio")
return stt_metadata
return stt_context

def _save_recording(self, audio_bytes, stt_meta, save_path=None):
LOG.info("Saving Recording")
Expand All @@ -493,12 +493,12 @@ def _save_recording(self, audio_bytes, stt_meta, save_path=None):
LOG.debug(f"Wrote {wav_path}")
return f"file://{wav_path.absolute()}"

def _recording_audio(self, audio_bytes: bytes, stt_metadata: dict):
def _recording_audio(self, audio_bytes: bytes, stt_context: dict):
try:
stt_metadata["filename"] = self._save_recording(audio_bytes, stt_metadata)
stt_context["filename"] = self._save_recording(audio_bytes, stt_context)
except Exception:
LOG.exception("Error while saving recording audio")
return stt_metadata
return stt_context

# mic bus api
def _handle_mute(self, _message: Message):
Expand Down
103 changes: 103 additions & 0 deletions ovos_dinkum_listener/transformers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# NEON AI (TM) SOFTWARE, Software Development Kit & Application Framework
# All trademark and other rights reserved by their respective owners
# Copyright 2008-2022 Neongecko.com Inc.
# Contributors: Daniel McKnight, Guy Daniels, Elon Gasper, Richard Leeds,
# Regina Bloomstine, Casimiro Ferreira, Andrii Pernatii, Kirill Hrymailo
# BSD-3 License
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from ovos_plugin_manager.audio_transformers import find_audio_transformer_plugins
from ovos_utils.json_helper import merge_dict
from ovos_utils.log import LOG


class AudioTransformersService:
    """Loads and runs audio transformer plugins for the listener loop.

    Audio transformer plugins are discovered via OPM
    (``find_audio_transformer_plugins``) and receive raw audio chunks from
    the voice loop; they may modify the audio and/or contribute message
    context that is attached to the final utterance message.

    A plugin is only loaded if it has an entry under the
    ``audio_transformers`` section of the core configuration, and that
    entry does not set ``"active": false``.
    """

    def __init__(self, bus, config=None):
        """
        Args:
            bus: connected message bus client, kept for plugin use
            config: core configuration dict; plugin configs are read from
                its ``audio_transformers`` section (defaults to empty)
        """
        self.config_core = config or {}
        self.loaded_modules = {}
        self.has_loaded = False
        self.bus = bus
        # to activate a plugin, just add an entry to mycroft.conf for it
        self.config = self.config_core.get("audio_transformers") or {
            # "ovos_audio_transformer_xxx_plugin": {}
        }
        self.load_plugins()

    def load_plugins(self):
        """Instantiate every installed plugin that is enabled in config.

        A plugin that raises on construction is logged and skipped so one
        broken plugin cannot prevent the others from loading.
        """
        for plug_name, plug in find_audio_transformer_plugins().items():
            if plug_name not in self.config:
                # only explicitly configured plugins are loaded
                continue
            if not self.config[plug_name].get("active", True):
                # entry present but disabled
                continue
            try:
                self.loaded_modules[plug_name] = plug()
                LOG.info(f"loaded audio transformer plugin: {plug_name}")
            except Exception:
                LOG.exception(f"Failed to load audio transformer plugin: {plug_name}")

    @property
    def modules(self):
        """
        Return loaded transformers in priority order, such that modules with a
        higher `priority` rank are called first and changes from lower ranked
        transformers are applied last.

        A plugin of `priority` 1 will override any existing context keys and
        will be the last to modify `audio_data`
        """
        return sorted(self.loaded_modules.values(),
                      key=lambda k: k.priority, reverse=True)

    def shutdown(self):
        """Shut down all loaded plugins, best effort.

        Failures are logged but never propagated, so one misbehaving plugin
        cannot block listener shutdown.
        """
        for module in self.modules:
            try:
                module.shutdown()
            except Exception:
                # was a bare except; narrowed so SystemExit/KeyboardInterrupt
                # still propagate, and the failure is at least visible in logs
                LOG.exception(f"Error shutting down audio transformer: {module}")

    def feed_audio(self, chunk):
        """Forward a raw (pre-wakeword) audio chunk to every plugin."""
        for module in self.modules:
            module.feed_audio_chunk(chunk)

    def feed_hotword(self, chunk):
        """Forward a hotword audio chunk to every plugin."""
        for module in self.modules:
            module.feed_hotword_chunk(chunk)

    def feed_speech(self, chunk):
        """Forward a speech (post-wakeword) audio chunk to every plugin."""
        for module in self.modules:
            module.feed_speech_chunk(chunk)

    def transform(self, chunk):
        """Run `chunk` through all plugins and collect message context.

        Args:
            chunk: audio data for the captured utterance

        Returns:
            tuple of (possibly transformed chunk, merged context dict);
            higher-priority plugins run first, so lower-priority plugins
            applied later can override earlier context keys
        """
        context = {'client_name': 'ovos_dinkum_listener',
                   'source': 'audio',  # default native audio source
                   'destination': ["skills"]}
        for module in self.modules:
            try:
                chunk = module.feed_speech_utterance(chunk)
                chunk, data = module.transform(chunk)
                LOG.debug(f"{module.name}: {data}")
                context = merge_dict(context, data)
            except Exception:
                # was a silent bare except; a broken plugin should not kill
                # the pipeline, but the failure must be visible in logs
                LOG.exception(f"Audio transformer plugin failure: {module}")
        return chunk, context
Loading