Set the default tokenizer type to bpe.

Signed-off-by: Vahid <vnoroozi@nvidia.com>
NVIDIA · Mar 26, 2022 · e9bb886 · e9bb886
1 parent a189aa0
commit e9bb886
Show file tree

Hide file tree

Showing 3 changed files with 4 additions and 3 deletions.
diff --git a/examples/asr/conf/rnn/rnn_ctc_bpe.yaml b/examples/asr/conf/rnn/rnn_ctc_bpe.yaml
@@ -58,7 +58,7 @@ model:
   # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py
   tokenizer:
     dir: ???  # path to the directory that contains either tokenizer.model (for bpe) or vocab.txt (for wpe)
-    type: wpe  # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer)
+    type: bpe  # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer)
 
   preprocessor:
     _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor

diff --git a/examples/asr/conf/rnn/rnn_transducer_bpe.yaml b/examples/asr/conf/rnn/rnn_transducer_bpe.yaml
@@ -63,7 +63,7 @@ model:
   # You may find more detail on how to train a tokenizer at: /scripts/tokenizers/process_asr_text_tokenizer.py
   tokenizer:
     dir: ???  # path to the directory that contains either tokenizer.model (for bpe) or vocab.txt (for wpe)
-    type: wpe  # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer)
+    type: bpe  # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer)
 
   preprocessor:
     _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
@@ -157,6 +157,7 @@ model:
     warprnnt_numba_kwargs:
       # FastEmit regularization: https://arxiv.org/abs/2010.11148
       # You may enable FastEmit to reduce the latency of the model for streaming
+      # Setting fastemit_lambda=1e-3 can help the accuracy of the model when it is unidirectional
       fastemit_lambda: 0.0  # Recommended values to be in range [1e-4, 1e-2], 0.001 is a good start.
 
   # Adds Gaussian noise to the gradients of the decoder to avoid overfitting

diff --git a/scripts/tokenizers/process_asr_text_tokenizer.py b/scripts/tokenizers/process_asr_text_tokenizer.py
@@ -153,13 +153,13 @@ def __build_document_from_manifests(
         return document_path
 
     num_lines = 0
-    w = set("")
     with open(document_path, 'w') as out_writer:
         for manifest in manifests:
             with open(manifest, 'r') as in_reader:
                 for line in in_reader:
                     item = json.loads(line)
                     text = item['text']
+
                     out_writer.write(text + '\n')
                     out_writer.flush()