Thutmose tagger bug fixes (#4162)

* add pretrained ngc model, small fixes Signed-off-by: Alexandra Antonova <aleksandraa@nvidia.com> * fix model location Signed-off-by: Alexandra Antonova <aleksandraa@nvidia.com> * fix model location Signed-off-by: Alexandra Antonova <aleksandraa@nvidia.com> * 1. fix typos. 2. write magic functions without space Signed-off-by: Alexandra Antonova <aleksandraa@nvidia.com> * add example of inference with pretrained model Signed-off-by: Alexandra Antonova <aleksandraa@nvidia.com> * changed model location to nemo Signed-off-by: Alexandra Antonova <aleksandraa@nvidia.com> * style fix Signed-off-by: Alexandra Antonova <aleksandraa@nvidia.com> * fix space Signed-off-by: Alexandra Antonova <aleksandraa@nvidia.com> Co-authored-by: Alexandra Antonova <aleksandraa@nvidia.com>
NVIDIA · May 12, 2022 · 0704e14 · 0704e14
1 parent b34609f
commit 0704e14
Show file tree

Hide file tree

Showing 3 changed files with 159 additions and 62 deletions.
diff --git a/examples/nlp/text_normalization_as_tagging/conf/thutmose_tagger_itn_config.yaml b/examples/nlp/text_normalization_as_tagging/conf/thutmose_tagger_itn_config.yaml
@@ -77,8 +77,6 @@ data:
     data_path: ???  # provide the full path to the file
     batch_size: 8
     shuffle: true
-    max_insts: -1 # Maximum number of instances (-1 means no limit)
-    use_cache: false # uses a cache to store the processed dataset, you may use it for large datasets for speed up (especially when using multi GPUs)
     num_workers: 3
     pin_memory: false
     drop_last: false
@@ -87,8 +85,6 @@ data:
     data_path: ???  # provide the full path to the file.
     batch_size: 8
     shuffle: false
-    max_insts: -1 # Maximum number of instances (-1 means no limit)
-    use_cache: false # uses a cache to store the processed dataset, you may use it for large datasets for speed up (especially when using multi GPUs)
     num_workers: 3
     pin_memory: false
     drop_last: false

diff --git a/nemo/collections/nlp/models/text_normalization_as_tagging/thutmose_tagger.py b/nemo/collections/nlp/models/text_normalization_as_tagging/thutmose_tagger.py
@@ -297,6 +297,7 @@ def _infer(self, sents: List[str]) -> List[List[int]]:
                 - input words
                 - tags predicted for input words
                 - tags after swap preprocessing
+                - semiotic labels predicted for input words
         """
 
         # all input sentences go into one batch
@@ -406,4 +407,12 @@ def _setup_infer_dataloader(self, cfg: DictConfig, queries: List[str]) -> 'torch
 
     @classmethod
     def list_available_models(cls) -> Optional[PretrainedModelInfo]:
-        pass
+        result = [
+            PretrainedModelInfo(
+                pretrained_model_name="itn_en_thutmose_bert",
+                location="https://api.ngc.nvidia.com/v2/models/nvidia/nemo/itn_en_thutmose_bert/versions/1.9.0/files/itn_en_thutmose_bert.nemo",
+                description="A single-pass tagger-based model for inverse text normalization based "
+                "on bert-base-uncased, trained on 2 mln sentences from Google Text Normalization Dataset",
+            ),
+        ]
+        return result