Add text normalization for CTC head targets

Signed-off-by: andrusenkoau <andrusenkoau@gmail.com>
andrusenkoau · Aug 20, 2024 · 67d3523 · 67d3523
1 parent 30e9251
commit 67d3523
Showing 1 changed file with 15 additions and 1 deletion.
diff --git a/nemo/collections/multimodal/speech_llm/parts/utils/data_utils.py b/nemo/collections/multimodal/speech_llm/parts/utils/data_utils.py
@@ -265,6 +265,19 @@ def __init__(
             self.prompt_template = self.prompt_template.encode('utf-8').decode('unicode_escape')
         assert self.truncation_field in ["answer", "context"]
 
+
+    def _normilize_text(self, text: str) -> str:
+        """
+        Normalize text for CTC head training.
+
+        Lowercases `text` and strips the characters ".", "!", "," and "?".
+        All other characters, including whitespace, are left as they are.
+        """
+        # One translate() pass replaces the four chained replace() calls;
+        # the redundant second lower() on the result is dropped.
+        punctuation_table = str.maketrans("", "", ".!,?")
+        return text.lower().translate(punctuation_table)
+
     def _process_example(self, context: str, output: str, lang_id: str):
         """
         Create an example by concatenating text and answer.
@@ -319,7 +332,8 @@ def _process_example(self, context: str, output: str, lang_id: str):
         #ctc_tokens_ids = answer_ids[1:]
         # logging.warning("++++"*10)
         # logging.warning(f"original_text: {original_text}")
-        ctc_tokens_ids = self.tokenizer.asr_tokenizer.text_to_ids(output, lang_id)
+        normalized_text = self._normilize_text(output)
+        ctc_tokens_ids = self.tokenizer.asr_tokenizer.text_to_ids(normalized_text, lang_id)
         # logging.warning(f"lang_id: {lang_id}")
         # logging.warning(f"ctc_tokens_ids: {ctc_tokens_ids}")
         # raise ValueError("stop here")