From e1724aba564f5f32a3c8f90ff633a421ccf6853a Mon Sep 17 00:00:00 2001
From: Daniel Schnell <dschnell@grammatek.com>
Date: Wed, 20 Mar 2024 11:22:10 +0000
Subject: [PATCH] Activate User Normalization Dictionary

Activate the user normalization dictionary and add it to the
prenormalization step of the frontend pipeline.

This needs to be done conditionally. We normally want the dictionary
activated for general TTS, but we need to deactivate it for the play
buttons in the NormDictInfo activity.

Signed-off-by: Daniel Schnell <dschnell@grammatek.com>
---
 .../grammatek/simaromur/AppRepository.java    | 18 +++++++-
 .../com/grammatek/simaromur/NormDictInfo.java |  2 +-
 .../com/grammatek/simaromur/TTSService.java   |  2 +-
 .../com/grammatek/simaromur/VoiceInfo.java    |  2 +-
 .../simaromur/frontend/FrontendManager.java   | 12 -----
 .../frontend/NormalizationManager.java        |  9 ++--
 .../simaromur/frontend/TTSNormalizer.java     | 46 ++++++++++++++++++-
 .../simaromur/NormalizationManagerTest.java   | 12 ++---
 8 files changed, 74 insertions(+), 29 deletions(-)
diff --git a/app/src/main/java/com/grammatek/simaromur/AppRepository.java b/app/src/main/java/com/grammatek/simaromur/AppRepository.java
index 579c4d4..81f8637 100644
--- a/app/src/main/java/com/grammatek/simaromur/AppRepository.java
+++ b/app/src/main/java/com/grammatek/simaromur/AppRepository.java
@@ -349,6 +349,15 @@ public LiveData<List<NormDictEntry>> getNormDictEntries() {
         return mNormDictDao.getSortedEntries();
     }
 
+    /**
+     * Get a plain (non-LiveData) list of all current normalization dictionary entries.
+     *
+     * @return List of all current normalization dictionary entries, as returned by the DAO
+     */
+    public List<NormDictEntry> getNormDictEntriesDirect() {
+        return mNormDictDao.getEntries();
+    }
+
     /**
      * Creates or updates the given entry inside the Db.
      */
@@ -953,15 +962,20 @@ public String getLoadedVoiceName() {
      *
      * @param text Raw text as received by the TTS service
      * @param item cache item to save into the speech audio cache
+     * @param voice voice to use for normalization and G2P
+     * @param doIgnoreUserDict true to ignore user dictionary, false otherwise
      * @return updated cache item
      */
     synchronized
-    public CacheItem executeFrontendAndSaveIntoCache(String text, CacheItem item, com.grammatek.simaromur.db.Voice voice) {
+    public CacheItem executeFrontendAndSaveIntoCache(String text,
+                                                     CacheItem item,
+                                                     com.grammatek.simaromur.db.Voice voice,
+                                                     boolean doIgnoreUserDict) {
         String phonemes = "";
         if (item.getUtterance().getNormalized().isEmpty()) {
             // we always need to normalize the text, but it doesn't hurt, if we always do G2P as well
             // for network voices, this is currently all that is needed.
-            String normalizedText = mFrontend.getNormalizationManager().process(text);
+            String normalizedText = mFrontend.getNormalizationManager().process(text, doIgnoreUserDict);
             phonemes = mFrontend.transcribe(normalizedText, voice.type, voice.version);
             Log.v(LOG_TAG, "executeFrontendAndSaveIntoCache: original (\"" + text + "\"), normalized (\"" + normalizedText + "\"), phonemes (\"" + phonemes + "\")");
             if (!phonemes.isEmpty()) {
diff --git a/app/src/main/java/com/grammatek/simaromur/NormDictInfo.java b/app/src/main/java/com/grammatek/simaromur/NormDictInfo.java
index 9deb8a5..151d0f1 100644
--- a/app/src/main/java/com/grammatek/simaromur/NormDictInfo.java
+++ b/app/src/main/java/com/grammatek/simaromur/NormDictInfo.java
@@ -229,7 +229,7 @@ public void onPlayCancelClicked(View v) {
         // TODO: do we need a special mode for the frontend to bypass any user dictionary ?
         if (mIsPlaying1 || mIsPlaying2) {
             CacheItem item = appRepo.getUtteranceCache().addUtterance(text);
-            item = appRepo.executeFrontendAndSaveIntoCache(text, item, appRepo.getCurrentVoice());
+            item = appRepo.executeFrontendAndSaveIntoCache(text, item, appRepo.getCurrentVoice(), true);
             if ((item.getUtterance().getPhonemesCount() == 0) ||
                     item.getUtterance().getPhonemesList().get(0).getSymbols().isEmpty()) {
                 Log.w(LOG_TAG, "onPlayCancelClicked: Nothing to speak ?!");
diff --git a/app/src/main/java/com/grammatek/simaromur/TTSService.java b/app/src/main/java/com/grammatek/simaromur/TTSService.java
index 7acbd5f..fba6f8e 100644
--- a/app/src/main/java/com/grammatek/simaromur/TTSService.java
+++ b/app/src/main/java/com/grammatek/simaromur/TTSService.java
@@ -197,7 +197,7 @@ protected void onSynthesizeText(SynthesisRequest request,
         // item and save it into cache, then test one-by-one availability of every single
         // requested utterance component and eventually add the missing pieces
         CacheItem item = mRepository.getUtteranceCache().addUtterance(text);
-        item = mRepository.executeFrontendAndSaveIntoCache(text, item, voice);
+        item = mRepository.executeFrontendAndSaveIntoCache(text, item, voice, false);
         if ((item.getUtterance().getPhonemesCount() == 0) ||
                 item.getUtterance().getPhonemesList().get(0).getSymbols().isEmpty()) {
             Log.w(LOG_TAG, "onSynthesizeText: No phonemes to speak");
diff --git a/app/src/main/java/com/grammatek/simaromur/VoiceInfo.java b/app/src/main/java/com/grammatek/simaromur/VoiceInfo.java
index 83c87ae..f37b0b8 100644
--- a/app/src/main/java/com/grammatek/simaromur/VoiceInfo.java
+++ b/app/src/main/java/com/grammatek/simaromur/VoiceInfo.java
@@ -379,7 +379,7 @@ public void onPlayClicked(View v) {
 
         // execute frontend
         CacheItem item = appRepo.getUtteranceCache().addUtterance(text);
-        item = appRepo.executeFrontendAndSaveIntoCache(text, item, mVoice);
+        item = appRepo.executeFrontendAndSaveIntoCache(text, item, mVoice, false);
         if ((item.getUtterance().getPhonemesCount() == 0) ||
                 item.getUtterance().getPhonemesList().get(0).getSymbols().isEmpty()) {
             Log.w(LOG_TAG, "onPlayClicked: No phonemes to speak");
diff --git a/app/src/main/java/com/grammatek/simaromur/frontend/FrontendManager.java b/app/src/main/java/com/grammatek/simaromur/frontend/FrontendManager.java
index d850f81..e893bf4 100644
--- a/app/src/main/java/com/grammatek/simaromur/frontend/FrontendManager.java
+++ b/app/src/main/java/com/grammatek/simaromur/frontend/FrontendManager.java
@@ -40,18 +40,6 @@ public static String getVersion() {
         return "1.0";
     }
 
-    /**
-     * Processes text for input into a TTS engine. This includes unicode cleaning, tokenizing, and
-     * normalizing the the text, and then to convert it into an X-SAMPA transcription.
-     *
-     * @param text raw input text
-     * @return an X-SAMPA transcription of @text
-     */
-    public String process(String text) {
-        final String normalized = mNormalizationManager.process(text);
-        return transcribe(normalized, IGNORE_TYPE, IGNORE_VERSION);
-    }
-
     /**
      * Transcribe text to IPA symbols. Punctuation is kept as is, which conforms to the kind of
      * IPA dialect encoded into the VITS model.
diff --git a/app/src/main/java/com/grammatek/simaromur/frontend/NormalizationManager.java b/app/src/main/java/com/grammatek/simaromur/frontend/NormalizationManager.java
index a0c853d..96e5bea 100644
--- a/app/src/main/java/com/grammatek/simaromur/frontend/NormalizationManager.java
+++ b/app/src/main/java/com/grammatek/simaromur/frontend/NormalizationManager.java
@@ -50,14 +50,15 @@ public NormalizationManager(Context context, Map<String, PronDictEntry> pronDict
      * Processes the input text according to the defined steps: unicode cleaning,
      * tokenizing, normalizing
      * @param text the input text
+     * @param doIgnoreUserDict if true, the user dictionary is ignored
      * @return normalized version of 'text'
      */
-    public String process(final String text) {
+    public String process(final String text, boolean doIgnoreUserDict) {
         Log.v(LOG_TAG, "process() called");
         String cleaned = mUnicodeNormalizer.normalizeEncoding(text);
 
         List<String> strings = mTokenizer.detectSentences(cleaned);
-        List<String> normalizedSentences = normalize(strings);
+        List<String> normalizedSentences = normalize(strings, doIgnoreUserDict);
         List<String> cleanNormalized = mUnicodeNormalizer.normalizeAlphabet(normalizedSentences);
         for (String sentence : cleanNormalized) {
             Log.v(LOG_TAG, "normalized sentence: " + sentence);
@@ -66,12 +67,12 @@ public String process(final String text) {
     }
 
     // pre-normalization, tagging and final normalization of the sentences in 'tokenized'
-    private List<String> normalize(final List<String> strings) {
+    private List<String> normalize(final List<String> strings, boolean doIgnoreUserDict) {
         String preNormalized;
         List<String> normalized = new ArrayList<>();
 
         for (String sentence : strings) {
-            preNormalized = mTTSNormalizer.preNormalize(sentence);
+            preNormalized = mTTSNormalizer.preNormalize(sentence, doIgnoreUserDict);
             String[] tags = tagText(preNormalized);
             // preNormalized is tokenized as string, so we know splitting on whitespace will give
             // us the correct tokens according to the tokenizer
diff --git a/app/src/main/java/com/grammatek/simaromur/frontend/TTSNormalizer.java b/app/src/main/java/com/grammatek/simaromur/frontend/TTSNormalizer.java
index 485866c..2646894 100644
--- a/app/src/main/java/com/grammatek/simaromur/frontend/TTSNormalizer.java
+++ b/app/src/main/java/com/grammatek/simaromur/frontend/TTSNormalizer.java
@@ -1,7 +1,11 @@
 package com.grammatek.simaromur.frontend;
 
+import android.util.Log;
+
 import androidx.annotation.NonNull;
 
+import com.grammatek.simaromur.App;
+import com.grammatek.simaromur.db.NormDictEntry;
 import com.grammatek.simaromur.device.SymbolsLvLIs;
 
 import java.util.*;
@@ -21,6 +25,7 @@
  */
 
 public class TTSNormalizer {
+    private static final String LOG_TAG = "Simaromur_" + TTSNormalizer.class.getSimpleName();
 
     private final List<CategoryTuple> BigCardinalFilledTupleList = Stream.of(CardinalOnesTuples.getTuples(), CardinalThousandTuples.getTuples(), CardinalMillionTuples.getTuples(),
                     CardinalBigTuples.getTuples())
@@ -80,9 +85,13 @@ public TTSNormalizer() {
      * @param text input text, unicode-normalized and if splitted on whitespace we have an array of tokens
      * @return pre-normalized text, i.e. some common abbreviations expanded
      */
-    public String preNormalize(String text) {
+    public String preNormalize(String text, boolean doIgnoreUserDict) {
         String normalized = text;
-        String domain = ""; //we will need to determine this from "text" in real life!
+        String domain = ""; // we will need to determine this from "text" in real life!
+
+        if (!doIgnoreUserDict) {
+            normalized = replaceFromNormDict(normalized);
+        }
 
         // some pre-processing and formatting of digits
         if (DIGITS_PTRN.matcher(normalized).matches()) {
@@ -132,6 +141,39 @@ public String preNormalize(String text) {
         return normalized;
     }
 
+    /**
+     * Replaces whole-word, case-insensitive occurrences of user normalization dictionary
+     * terms with their replacements, longest term first. Terms and replacements are taken
+     * literally: regex metacharacters, '$' and '\' in them have no special meaning.
+     *
+     * @param sentence  input sentence
+     * @return         normalized sentence with search terms replaced
+     */
+    private String replaceFromNormDict(String sentence) {
+        String normalized = sentence;
+        List<NormDictEntry> entries = App.getAppRepository().getNormDictEntriesDirect();
+
+        if (entries != null) {
+            // longest terms first, e.g. "Donald Trump" has to be replaced before "Trump"
+            entries.sort((o1, o2) -> Integer.compare(o2.term.length(), o1.term.length()));
+            for (NormDictEntry entry : entries) {
+                String term = entry.term.strip();
+                if (term.isEmpty()) {
+                    continue;   // an empty term would match at every word boundary
+                }
+                // quote the term: entries such as "a.m." or "(x)" must not act as regex syntax
+                Pattern regex = Pattern.compile("\\b" + Pattern.quote(term) + "\\b",
+                        Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
+                normalized = regex.matcher(normalized).replaceAll(
+                        java.util.regex.Matcher.quoteReplacement(entry.replacement));
+            }
+        }
+        if (!normalized.equals(sentence)) {
+            Log.v(LOG_TAG, "replaceFromNormDict() replaced: " + sentence + " with: " + normalized);
+        }
+        return normalized;
+    }
+
     /**
      * Performs normalizing of text partly based on POS-tags. For number normalization the algorithm looks at
      * the POS-tags at the next token position, to determine the correct form of the normalization (case, gender, etc.)
diff --git a/app/src/test/java/com/grammatek/simaromur/NormalizationManagerTest.java b/app/src/test/java/com/grammatek/simaromur/NormalizationManagerTest.java
index 2ee47af..2775bd1 100644
--- a/app/src/test/java/com/grammatek/simaromur/NormalizationManagerTest.java
+++ b/app/src/test/java/com/grammatek/simaromur/NormalizationManagerTest.java
@@ -34,7 +34,7 @@ public class NormalizationManagerTest {
     public void processTest() {
         String input = "síma 421-6368";
         NormalizationManager manager = new NormalizationManager(context, pronDict);
-        String processed = manager.process(input);
+        String processed = manager.process(input, false);
         System.out.println(processed);
 
         assertEquals("síma fjórir tveir einn - sex þrír sex átta .",
@@ -45,7 +45,7 @@ public void processTest() {
     public void processDigitsTest() {
         NormalizationManager manager = new NormalizationManager(context, pronDict);
         for (String sent : getDigits().keySet()) {
-            String processed = manager.process(sent);
+            String processed = manager.process(sent, false);
             assertEquals(getDigits().get(sent), processed);
         }
     }
@@ -54,7 +54,7 @@ public void processDigitsTest() {
     public void processSymbolsTest() {
         NormalizationManager manager = new NormalizationManager(context, pronDict);
         for (String sent : getSymbols().keySet()) {
-            String processed = manager.process(sent);
+            String processed = manager.process(sent, false);
             assertEquals(getSymbols().get(sent), processed);
         }
     }
@@ -63,7 +63,7 @@ public void processSymbolsTest() {
     public void processNewIssuesTest() {
         NormalizationManager manager = new NormalizationManager(context, pronDict);
         for (String sent : getNewTestSentences().keySet()) {
-            String processed = manager.process(sent);
+            String processed = manager.process(sent, false);
             assertEquals(getNewTestSentences().get(sent), processed);
         }
     }
@@ -72,7 +72,7 @@ public void processNewIssuesTest() {
     public void processV14IssuesTest() {
         NormalizationManager manager = new NormalizationManager(context, pronDict);
         for (String sent : getV14TestSentences().keySet()) {
-            String processed = manager.process(sent);
+            String processed = manager.process(sent, false);
             assertEquals(getV14TestSentences().get(sent), processed);
         }
     }
@@ -81,7 +81,7 @@ public void processV14IssuesTest() {
     public void processListTest() {
         NormalizationManager manager = new NormalizationManager(context, pronDict);
         for (String sent : getTestSentences().keySet()) {
-            String processed = manager.process(sent);
+            String processed = manager.process(sent, false);
             assertEquals(getTestSentences().get(sent), processed);
         }
     }