[TN] bug fix "hundred" in Audio-based, added method to split text in …

…sentences (#4610) * fix duplex inference with grammars Signed-off-by: ekmb <ebakhturina@nvidia.com> * fix hundred TN audio bug, add split text Signed-off-by: ekmb <ebakhturina@nvidia.com> * fix header year Signed-off-by: ekmb <ebakhturina@nvidia.com> * style fix Signed-off-by: ekmb <ebakhturina@nvidia.com> * exclude I from roman-ordinal form Signed-off-by: ekmb <ebakhturina@nvidia.com> * fix graph_with_and Signed-off-by: ekmb <ebakhturina@nvidia.com> * fix tests Signed-off-by: ekmb <ebakhturina@nvidia.com> * fix split regex Signed-off-by: ekmb <ebakhturina@nvidia.com> * fix warning Signed-off-by: ekmb <ebakhturina@nvidia.com>
NVIDIA · Jul 29, 2022 · 2f85541 · 2f85541
1 parent 2a5516c
commit 2f85541
Show file tree

Hide file tree

Showing 9 changed files with 116 additions and 56 deletions.
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -138,23 +138,23 @@ pipeline {
       parallel {
         stage('En TN grammars') {
           steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-14-22'
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="1" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22'
           }
         }
         stage('En ITN grammars') {
           steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en --text="twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-14-22'
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/inverse_text_normalization/inverse_normalize.py --language en --text="twenty" --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22'
           }
         }
         stage('Test En non-deterministic TN & Run all En TN/ITN tests (restore grammars from cache)') {
           steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-14-22'
-            sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-14-22'
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize_with_audio.py --text "\$.01" --n_tagged 2 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22'
+            sh 'CUDA_VISIBLE_DEVICES="" pytest tests/nemo_text_processing/en/ -m "not pleasefixme" --cpu --tn_cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22'
           }
         }
         stage('Test En Hybrid TN') {
           steps {
-            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/hybrid/wfst_lm_rescoring.py --data /home/TestData/nlp/text_norm/hybrid_tn/test.txt --regenerate_pkl --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-14-22 | grep "all_correct: True" || exit 1'
+            sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/hybrid/wfst_lm_rescoring.py --data /home/TestData/nlp/text_norm/hybrid_tn/test.txt --regenerate_pkl --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22 | grep "all_correct: True" || exit 1'
           }
         }
       }
@@ -171,7 +171,7 @@ pipeline {
       parallel {
         stage('L2: Eng TN') {
           steps {
-            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_norm/output/ --grammars=tn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-14-22 --language=en && ls -R /home/TestData/nlp/text_norm/output/ && echo ".far files created "|| exit 1'
+            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_norm/output/ --grammars=tn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22 --language=en && ls -R /home/TestData/nlp/text_norm/output/ && echo ".far files created "|| exit 1'
             sh 'cd nemo_text_processing/text_normalization/ &&  python normalize.py --input_file=/home/TestData/nlp/text_norm/ci/test.txt --input_case="lower_cased" --language=en --output_file=/home/TestData/nlp/text_norm/output/test.pynini.txt --verbose'
             sh 'cat /home/TestData/nlp/text_norm/output/test.pynini.txt'
             sh 'cmp --silent /home/TestData/nlp/text_norm/output/test.pynini.txt /home/TestData/nlp/text_norm/ci/test_goal_py_05-25.txt || exit 1'
@@ -181,7 +181,7 @@ pipeline {
 
         stage('L2: Eng ITN export') {
           steps {
-            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_denorm/output/ --grammars=itn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-14-22 --language=en && ls -R /home/TestData/nlp/text_denorm/output/ && echo ".far files created "|| exit 1'
+            sh 'cd tools/text_processing_deployment && python pynini_export.py --output=/home/TestData/nlp/text_denorm/output/ --grammars=itn_grammars --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22 --language=en && ls -R /home/TestData/nlp/text_denorm/output/ && echo ".far files created "|| exit 1'
             sh 'cd nemo_text_processing/inverse_text_normalization/ &&  python inverse_normalize.py --input_file=/home/TestData/nlp/text_denorm/ci/test.txt --language=en --output_file=/home/TestData/nlp/text_denorm/output/test.pynini.txt --verbose'
             sh 'cmp --silent /home/TestData/nlp/text_denorm/output/test.pynini.txt /home/TestData/nlp/text_denorm/ci/test_goal_py.txt || exit 1'
             sh 'rm -rf /home/TestData/nlp/text_denorm/output/*'
@@ -190,23 +190,23 @@ pipeline {
         stage('L2: TN with Audio (audio and raw text)') {
           steps {
             sh 'cd nemo_text_processing/text_normalization && \
-            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-14-22 --text "The total amounts to \\$4.76." \
+            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22 --text "The total amounts to \\$4.76." \
             --audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_raw.txt 2>&1 && \
             cmp --silent /tmp/out_raw.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
           }
         }
         stage('L2: TN with Audio (audio and text file)') {
           steps {
             sh 'cd nemo_text_processing/text_normalization && \
-            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-14-22 --text /home/TestData/nlp/text_norm/audio_based/text.txt \
+            python normalize_with_audio.py --language=en --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22 --text /home/TestData/nlp/text_norm/audio_based/text.txt \
             --audio_data /home/TestData/nlp/text_norm/audio_based/audio.wav | tail -n2 | head -n1 > /tmp/out_file.txt 2>&1 && \
             cmp --silent /tmp/out_file.txt /home/TestData/nlp/text_norm/audio_based/result.txt || exit 1'
           }
         }
         stage('L2: TN with Audio (manifest)') {
           steps {
             sh 'cd nemo_text_processing/text_normalization && \
-            python normalize_with_audio.py --language=en --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-14-22'
+            python normalize_with_audio.py --language=en --audio_data /home/TestData/nlp/text_norm/audio_based/manifest.json --n_tagged=120 --cache_dir /home/TestData/nlp/text_norm/ci/grammars/7-29-22'
           }
         }
       }

diff --git a/nemo_text_processing/text_normalization/en/taggers/cardinal.py b/nemo_text_processing/text_normalization/en/taggers/cardinal.py
@@ -114,14 +114,10 @@ def __init__(self, deterministic: bool = True, lm: bool = False):
         self.fst = final_graph.optimize()
 
     def add_optional_and(self, graph):
-        if not self.deterministic:
-            graph = pynini.compose(
-                graph, NEMO_SIGMA + pynini.closure(pynini.cross("hundred ", " "), 0, 1) + NEMO_SIGMA
-            )
-
-        graph_with_and = pynutil.add_weight(graph, 0.00001)
+        graph_with_and = graph
 
         if not self.lm:
+            graph_with_and = pynutil.add_weight(graph, 0.00001)
             not_quote = pynini.closure(NEMO_NOT_QUOTE)
             no_thousand_million = pynini.difference(
                 not_quote, not_quote + pynini.union("thousand", "million") + not_quote
@@ -135,6 +131,8 @@ def add_optional_and(self, graph):
                 not_quote + pynutil.add_weight(pynini.cross("thousand ", "thousand and ") + no_hundred, -0.0001)
             ).optimize()
 
+            optional_hundred = pynini.compose((NEMO_DIGIT - "0") ** 3, graph).optimize()
+            optional_hundred = pynini.compose(optional_hundred, NEMO_SIGMA + pynini.cross(" hundred", "") + NEMO_SIGMA)
             graph_with_and |= pynini.compose(graph, integer).optimize()
-
+            graph_with_and |= optional_hundred
         return graph_with_and
diff --git a/nemo_text_processing/text_normalization/en/taggers/roman.py b/nemo_text_processing/text_normalization/en/taggers/roman.py
@@ -37,7 +37,13 @@ def __init__(self, deterministic: bool = True, lm: bool = False):
         default_graph = pynutil.insert("integer: \"") + default_graph + pynutil.insert("\"")
         ordinal_limit = 19
 
-        graph_teens = pynini.string_map([x[0] for x in roman_dict[:ordinal_limit]]).optimize()
+        if deterministic:
+            # exclude "I"
+            start_idx = 1
+        else:
+            start_idx = 0
+
+        graph_teens = pynini.string_map([x[0] for x in roman_dict[start_idx:ordinal_limit]]).optimize()
 
         # roman numerals up to ordinal_limit with a preceding name are converted to ordinal form
         names = get_names()
@@ -90,7 +96,7 @@ def __init__(self, deterministic: bool = True, lm: bool = False):
         )
 
         graph |= roman_to_ordinal
-        graph = self.add_tokens(graph)
+        graph = self.add_tokens(graph.optimize())
 
         self.fst = graph.optimize()
 

diff --git a/nemo_text_processing/text_normalization/en/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/en/verbalizers/cardinal.py
@@ -35,34 +35,8 @@ def __init__(self, deterministic: bool = True):
             self.optional_sign |= pynini.cross("negative: \"true\"", "negative ")
         self.optional_sign = pynini.closure(self.optional_sign + delete_space, 0, 1)
 
-        # no_thousand_million = pynini.difference(
-        #     pynini.closure(NEMO_NOT_QUOTE),
-        #     pynini.closure(NEMO_NOT_QUOTE) + pynini.union("thousand", "million") + pynini.closure(NEMO_NOT_QUOTE),
-        # ).optimize()
-        # integer = (
-        #     pynini.closure(NEMO_NOT_QUOTE)
-        #     + pynini.closure(
-        #         pynutil.add_weight(pynini.cross("hundred ", "hundred and ") + no_thousand_million, -0.0001), 0, 1
-        #     ).optimize()
-        # )
-        # no_hundred = pynini.difference(
-        #     pynini.closure(NEMO_NOT_QUOTE),
-        #     pynini.closure(NEMO_NOT_QUOTE) + "hundred" + pynini.closure(NEMO_NOT_QUOTE),
-        # ).optimize()
-        # integer |= (
-        #     pynini.closure(NEMO_NOT_QUOTE)
-        #     + pynini.closure(
-        #         pynutil.add_weight(pynini.cross("thousand ", "thousand and ") + no_hundred, -0.0001), 0, 1
-        #     ).optimize()
-        # )
-        #
-        # if not deterministic:
-        #     integer |= (
-        #         pynini.closure(NEMO_NOT_QUOTE)
-        #         + pynini.closure(pynini.cross("hundred ", "hundred and ") | pynini.cross("hundred ", " "), 0, 1)
-        #         + pynini.closure(NEMO_NOT_QUOTE)
-        #     ).optimize()
         integer = pynini.closure(NEMO_NOT_QUOTE)
+
         self.integer = delete_space + pynutil.delete("\"") + integer + pynutil.delete("\"")
         integer = pynutil.delete("integer:") + self.integer
 

diff --git a/nemo_text_processing/text_normalization/normalize.py b/nemo_text_processing/text_normalization/normalize.py
@@ -22,6 +22,7 @@
 from typing import Dict, List, Union
 
 import pynini
+import regex
 from joblib import Parallel, delayed
 from nemo_text_processing.text_normalization.data_loader_utils import (
     load_file,
@@ -34,6 +35,7 @@
 from tqdm import tqdm
 
 from nemo.collections.common.tokenizers.moses_tokenizers import MosesProcessor
+from nemo.utils import logging
 
 SPACE_DUP = re.compile(' {2,}')
 
@@ -251,9 +253,11 @@ def normalize(
 
         Returns: spoken form
         """
-        assert (
-            len(text.split()) < 500
-        ), "Your input is too long. Please split up the input into sentences, or strings with fewer than 500 words"
+        if len(text.split()) > 500:
+            logging.warning(
+                "Your input is too long, and this could take a long time to normalize."
+                "Use split_text_into_sentences() to make the input shorter and then call normalize_list()."
+            )
 
         original_text = text
         if punct_pre_process:
@@ -299,6 +303,27 @@ def normalize(
 
         return output
 
+    def split_text_into_sentences(self, text: str) -> List[str]:
+        """
+        Split text into sentences.
+
+        The text is split on a single whitespace character that follows sentence-final punctuation
+        (".", "?", "!", each optionally followed by a closing ” quote). No split is made when the
+        whitespace follows something that looks like an abbreviation ("U.S.A.", "a.m.", "Mrs.", "J.")
+        or when the next token is a number/ordinal that itself ends with a period ("Dec. 1. 2020.").
+        As a consequence, a sentence that ends in a capitalized word ("... met Smith. Then ...")
+        is not split at that point.
+
+        Args:
+            text: text
+
+        Returns list of sentences
+        """
+        # extra letter ranges added to the [A-Z]/[a-z] character classes (Cyrillic for Russian)
+        lower_case_unicode = ''
+        upper_case_unicode = ''
+        if self.lang == "ru":
+            lower_case_unicode = '\u0430-\u04FF'
+            upper_case_unicode = '\u0410-\u042F'
+
+        # Read and split transcript by utterance (roughly, sentences).
+        # Raw (f-)strings are used so that the regex escapes (\w, \., \s, ...) are not parsed as
+        # (invalid) Python string escapes.
+        split_pattern = (
+            # not right after a dotted abbreviation such as "U.S." / "a.m."
+            r"(?<!\w\.\w.)"
+            # not right after a capitalized abbreviation such as "Mrs." / "Dec."
+            rf"(?<![A-Z{upper_case_unicode}][a-z{lower_case_unicode}]+\.)"
+            # not right after a single-letter initial such as "J."
+            rf"(?<![A-Z{upper_case_unicode}]\.)"
+            # must follow sentence-final punctuation, optionally closed by a ” quote
+            r"(?<=\.|\?|\!|\.”|\?”|\!”)"
+            # the whitespace character that is consumed by the split
+            r"\s"
+            # not in front of a number/ordinal followed by a period, e.g. "1." or "17th."
+            r"(?![0-9]+[a-z]*\.)"
+        )
+
+        # variable-width lookbehinds require the third-party `regex` module (not stdlib `re`)
+        sentences = regex.split(split_pattern, text)
+        return sentences
+
     def _permute(self, d: OrderedDict) -> List[str]:
         """
         Creates reorderings of dictionary elements and serializes as strings

diff --git a/tests/nemo_text_processing/en/data_text_normalization/test_cases_normalize_with_audio.txt b/tests/nemo_text_processing/en/data_text_normalization/test_cases_normalize_with_audio.txt
@@ -65,7 +65,7 @@ four five six seven
 forty five sixty seven
 four thousand five hundred and sixty seven
 ~This example number 15,000 can be a very long one, and can fail to produce valid normalization for such an easy number like 10,125 or dollar value $5349.01, and can fail to terminate, and can fail to terminate, and can fail to terminate, and can fail to terminate, and can fail to terminate, 452.
-This example number fifteen thousand can be a very long one, and can fail to produce valid normalization for such an easy number like ten thousand one hundred twenty five or dollar value five thousand and three forty nine us dollars and one cent, and can fail to terminate, and can fail to terminate, and can fail to terminate, and can fail to terminate, and can fail to terminate, four fifty two.
+This example number fifteen thousand can be a very long one, and can fail to produce valid normalization for such an easy number like ten thousand one hundred twenty five or dollar value five thousand three hundred and forty nine dollars and one cent, and can fail to terminate, and can fail to terminate, and can fail to terminate, and can fail to terminate, and can fail to terminate, four fifty two.
 ~$1.01
 one dollar one cent
 one dollar and one cent

diff --git a/tests/nemo_text_processing/en/data_text_normalization/test_cases_roman.txt b/tests/nemo_text_processing/en/data_text_normalization/test_cases_roman.txt
@@ -1,4 +1,3 @@
-George I~George first
 Sam II~Sam second
 Chapter IV~Chapter four
 PART XL~PART forty

diff --git a/tests/nemo_text_processing/en/test_text_split.py b/tests/nemo_text_processing/en/test_text_split.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+from nemo_text_processing.text_normalization.normalize import Normalizer
+
+from ..utils import CACHE_DIR
+
+
+class TestTextSentenceSplit:
+    """Unit test for Normalizer.split_text_into_sentences() on cased English text."""
+
+    # created once at class-definition time and shared by the tests of this class
+    normalizer_en = Normalizer(
+        lang='en', input_case='cased', cache_dir=CACHE_DIR, post_process=True, overwrite_cache=False
+    )
+
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    def test_text_sentence_split(self):
+        """Abbreviations, amounts and dates must stay inside a sentence; real sentence ends must split."""
+        paragraph = "This happened in 1918 when Mrs. and Mr. Smith paid $111.12 in U.S.A. at 9 a.m. on Dec. 1. 2020. And Jan. 17th. This is an example. He paid $123 for this desk. 123rd, St. Patrick."
+        expected_sentences = [
+            'This happened in 1918 when Mrs. and Mr. Smith paid $111.12 in U.S.A. at 9 a.m. on Dec. 1. 2020.',
+            'And Jan. 17th.',
+            'This is an example.',
+            'He paid $123 for this desk.',
+            '123rd, St. Patrick.',
+        ]
+        assert expected_sentences == self.normalizer_en.split_text_into_sentences(paragraph)
diff --git a/tutorials/text_processing/Text_(Inverse)_Normalization.ipynb b/tutorials/text_processing/Text_(Inverse)_Normalization.ipynb
@@ -113,7 +113,7 @@
     "\n",
     "\n",
     "\n",
-    "#### 2.1 Run TN on input string"
+    "### 2.1 Run TN on input string"
    ]
   },
   {
@@ -138,19 +138,39 @@
     "print(normalized)"
    ]
   },
-  {
+    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "intermediate semiotic class information is shown if verbose=True."
+    "intermediate semiotic class information is shown if verbose=True. \n",
+    "\n",
+    "Long input text could be split into sentences as follows:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "written = \"Mr. Smith paid $111 in U.S.A. on Dec. 17th. We paid $123 for this desk.\"\n",
+    "\n",
+    "# split long text into sentences\n",
+    "sentences = normalizer.split_text_into_sentences(written)\n",
+    "\n",
+    "for sent in sentences:\n",
+    "    print(sent)\n",
+    "\n",
+    "# normalize each sentence separately using normalize() or all sentences at once with normalize_list()\n",
+    "normalizer.normalize_list(sentences)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "\n",
-    "### 2.1 Run TN on list of input strings"
+    "### 2.2 Run TN on list of input strings"
    ]
   },
   {
@@ -212,7 +232,7 @@
     "id": "RMT5lkPYzZHK"
    },
    "source": [
-    "### 2.2 Evaluate TN on written-normalized text pairs \n",
+    "### 2.3 Evaluate TN on written-normalized text pairs \n",
     "\n",
     "The evaluation data needs to have the following format:\n",
     "\n",