Skip to content

Commit

Permalink
Update config for AED models (#8294)
Browse files Browse the repository at this point in the history
Signed-off-by: smajumdar <titu1994@gmail.com>
  • Loading branch information
titu1994 authored Jan 31, 2024
1 parent 9b7aa0f commit ee16e5c
Showing 1 changed file with 13 additions and 2 deletions.
15 changes: 13 additions & 2 deletions examples/asr/conf/speech_multitask/fast-conformer_aed.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,19 @@ model:
# A small vocabulary size of 128 or 256 is recommended when using 4x sub-sampling.
# For details on how to train a tokenizer, see: /scripts/tokenizers/process_asr_text_tokenizer.py
tokenizer:
dir: ??? # Path to the directory that contains either tokenizer.model (bpe) or vocab.txt (wpe)
type: bpe # Either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer)
dir: null # Set to null for aggregate tokenizers; each entry under `langs` has its own dir
type: agg # One of: bpe (SentencePiece tokenizer), wpe (WordPiece tokenizer), or agg (aggregate tokenizer)
langs:
spl_tokens: # Tokenizer model for the special tokens
dir: ???
type: bpe
en: # English tokenizer (example; replace with whichever language you would like)
dir: ???
type: bpe

custom_tokenizer:
_target_: nemo.collections.common.tokenizers.canary_tokenizer.CanaryTokenizer # Can be replaced with another tokenizer class to support a different prompt format
tokenizers: null # Filled at runtime with all the tokenizers inside the aggregate tokenizer

# Audio Preprocessor
preprocessor:
Expand Down

0 comments on commit ee16e5c

Please sign in to comment.