diff --git a/tutorials/asr/ASR_with_Transducers.ipynb b/tutorials/asr/ASR_with_Transducers.ipynb
index fbaa2ff9a725..e1339cbbba4a 100644
--- a/tutorials/asr/ASR_with_Transducers.ipynb
+++ b/tutorials/asr/ASR_with_Transducers.ipynb
@@ -1,30 +1,12 @@
{
- "nbformat": 4,
- "nbformat_minor": 0,
- "metadata": {
- "accelerator": "GPU",
- "colab": {
- "name": "ASR-with-Transducers.ipynb",
- "provenance": [],
- "collapsed_sections": [
- "V5sMoFHmVvhg"
- ],
- "toc_visible": true
- },
- "kernelspec": {
- "display_name": "Python 3",
- "name": "python3"
- },
- "language_info": {
- "name": "python"
- }
- },
"cells": [
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "SUOXg71A3w78"
},
+ "outputs": [],
"source": [
"\"\"\"\n",
"You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n",
@@ -52,15 +34,15 @@
"## Grab the config we'll use in this example\n",
"!mkdir configs\n",
"!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/conf/contextnet_rnnt/contextnet_rnnt.yaml\n"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "uj-UnhKk47oW"
},
+ "outputs": [],
"source": [
"# In a conda environment, you would use the following command\n",
"# Update Numba to > 0.53\n",
@@ -71,9 +53,7 @@
"# For pip based environments,\n",
"# Update Numba to > 0.53\n",
"!pip install --upgrade numba"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "markdown",
@@ -109,9 +89,11 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "pu4n4GkjAo9i"
},
+ "outputs": [],
"source": [
"import os\n",
"\n",
@@ -120,9 +102,7 @@
"\n",
"if not os.path.exists(\"scripts/process_an4_data.py\"):\n",
" !wget -P scripts/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/dataset_processing/process_an4_data.py"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "markdown",
@@ -137,9 +117,11 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "yV1rJcH6ApDo"
},
+ "outputs": [],
"source": [
"import wget\n",
"import tarfile \n",
@@ -174,32 +156,65 @@
" cmd = [\"sox\", sph_path, wav_path]\n",
" subprocess.run(cmd)\n",
"\n",
- "print(\"Finished conversion.\\n******\")\n",
- "\n",
- "if os.path.exists(f\"{data_dir}/an4\"):\n",
- " print(\"Preparing AN4 dataset ...\")\n",
- "\n",
- " an4_path = f\"{data_dir}/\"\n",
- " !python scripts/process_an4_data.py \\\n",
- " --data_root=$an4_path\n",
- "\n",
- "print(\"AN4 prepared !\")"
- ],
- "execution_count": null,
- "outputs": []
+ "print(\"Finished conversion.\\n******\")"
+ ]
},
{
"cell_type": "code",
- "metadata": {
- "id": "J4cw98QfI9-p"
- },
- "source": [
- "# Manifest filepaths\n",
- "TRAIN_MANIFEST = os.path.join(data_dir, \"an4\", \"train_manifest.json\")\n",
- "TEST_MANIFEST = os.path.join(data_dir, \"an4\", \"test_manifest.json\")"
- ],
"execution_count": null,
- "outputs": []
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# --- Building Manifest Files --- #\n",
+ "import json\n",
+ "import librosa\n",
+ "\n",
+ "# Function to build a manifest\n",
+ "def build_manifest(transcripts_path, manifest_path, wav_path):\n",
+ " with open(transcripts_path, 'r') as fin:\n",
+ " with open(manifest_path, 'w') as fout:\n",
+ " for line in fin:\n",
+ " # Lines look like this:\n",
+ " # transcript (fileID)\n",
+ " transcript = line[: line.find('(')-1].lower()\n",
+ " transcript = transcript.replace('', '').replace('', '')\n",
+ " transcript = transcript.strip()\n",
+ "\n",
+ " file_id = line[line.find('(')+1 : -2] # e.g. \"cen4-fash-b\"\n",
+ " audio_path = os.path.join(\n",
+ " data_dir, wav_path,\n",
+ " file_id[file_id.find('-')+1 : file_id.rfind('-')],\n",
+ " file_id + '.wav')\n",
+ "\n",
+ " duration = librosa.core.get_duration(filename=audio_path)\n",
+ "\n",
+ " # Write the metadata to the manifest\n",
+ " metadata = {\n",
+ " \"audio_filepath\": audio_path,\n",
+ " \"duration\": duration,\n",
+ " \"text\": transcript\n",
+ " }\n",
+ " json.dump(metadata, fout)\n",
+ " fout.write('\\n')\n",
+ "\n",
+ "# Building Manifests\n",
+ "print(\"******\")\n",
+ "train_transcripts = os.path.join(data_dir, 'an4/etc/an4_train.transcription')\n",
+ "train_manifest = os.path.join(data_dir, 'an4/train_manifest.json')\n",
+ "if not os.path.isfile(train_manifest):\n",
+ " build_manifest(train_transcripts, train_manifest, 'an4/wav/an4_clstk')\n",
+ " print(\"Training manifest created.\")\n",
+ "\n",
+ "test_transcripts = os.path.join(data_dir, 'an4/etc/an4_test.transcription')\n",
+ "test_manifest = os.path.join(data_dir, '/an4/test_manifest.json')\n",
+ "if not os.path.isfile(test_manifest):\n",
+ " build_manifest(test_transcripts, test_manifest, 'an4/wav/an4test_clstk')\n",
+ " print(\"Test manifest created.\")\n",
+ "print(\"***Done***\") \n",
+ "# Manifest filepaths\n",
+ "TRAIN_MANIFEST = train_manifest\n",
+ "TEST_MANIFEST = test_manifest"
+ ]
},
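+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Optional sanity check (a minimal sketch, assuming the manifests above were written):\n",
+ "# inspect the first entry of the train manifest to verify paths, durations and text.\n",
+ "import json\n",
+ "\n",
+ "with open(TRAIN_MANIFEST, 'r') as f:\n",
+ "    print(json.loads(f.readline()))"
+ ]
+ },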
{
"cell_type": "markdown",
@@ -214,15 +229,15 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "i2hD4LkoJvrx"
},
+ "outputs": [],
"source": [
"if not os.path.exists(\"scripts/process_asr_text_tokenizer.py\"):\n",
" !wget -P scripts/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tokenizers/process_asr_text_tokenizer.py"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "markdown",
@@ -237,9 +252,11 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "i6Jzpt6UJvuI"
},
+ "outputs": [],
"source": [
"VOCAB_SIZE = 32 # can be any value above 29\n",
"TOKENIZER_TYPE = \"spe\" # can be wpe or spe\n",
@@ -259,15 +276,15 @@
" --no_lower_case \\\n",
" --log \\\n",
" --vocab_size=$VOCAB_SIZE"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "JHDZswN6LIBJ"
},
+ "outputs": [],
"source": [
"# Tokenizer path\n",
"if TOKENIZER_TYPE == 'spe':\n",
@@ -276,9 +293,7 @@
"else:\n",
" TOKENIZER = os.path.join(\"tokenizers\", f\"tokenizer_wpe_v{VOCAB_SIZE}\")\n",
" TOKENIZER_TYPE_CFG = \"wpe\""
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "markdown",
@@ -310,16 +325,16 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "6a_vedo0Lyo8"
},
+ "outputs": [],
"source": [
"from omegaconf import OmegaConf, open_dict\n",
"\n",
- "config = OmegaConf.load(\"configs/contextnet_rnnt.yaml\")"
- ],
- "execution_count": null,
- "outputs": []
+ "config = OmegaConf.load(\"../../examples/asr/conf/contextnet_rnnt/contextnet_rnnt.yaml\")"
+ ]
},
{
"cell_type": "markdown",
@@ -336,15 +351,15 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "B9nY5JQaIhKz"
},
+ "outputs": [],
"source": [
"config.model.encoder.jasper = config.model.encoder.jasper[:5]\n",
"config.model.encoder.jasper[-1].filters = '${model.model_defaults.enc_hidden}'"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "markdown",
@@ -359,15 +374,15 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "mJbqMEkjMTfM"
},
+ "outputs": [],
"source": [
"# print out the train and validation configs to know what needs to be changed\n",
"print(OmegaConf.to_yaml(config.model.train_ds))"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "markdown",
@@ -382,16 +397,16 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "5QNYNItcMeOC"
},
+ "outputs": [],
"source": [
"config.model.train_ds.manifest_filepath = TRAIN_MANIFEST\n",
"config.model.validation_ds.manifest_filepath = TEST_MANIFEST\n",
"config.model.test_ds.manifest_filepath = TEST_MANIFEST"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "markdown",
@@ -406,26 +421,26 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "CNlRNlVCNAjr"
},
+ "outputs": [],
"source": [
"print(OmegaConf.to_yaml(config.model.tokenizer))"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "MzmCNJZ7NAng"
},
+ "outputs": [],
"source": [
"config.model.tokenizer.dir = TOKENIZER\n",
"config.model.tokenizer.type = TOKENIZER_TYPE_CFG"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "markdown",
@@ -439,27 +454,27 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "cUkqv_bQraQe"
},
+ "outputs": [],
"source": [
"print(OmegaConf.to_yaml(config.model.optim))"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "mEyK1fqEiyxo"
},
+ "outputs": [],
"source": [
"# Finally, let's remove logging of samples and the warmup since the dataset is small (similar to CTC models)\n",
"config.model.log_prediction = False\n",
"config.model.optim.sched.warmup_steps = None"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "markdown",
@@ -474,26 +489,26 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "i4udFKMwDbRm"
},
+ "outputs": [],
"source": [
"print(OmegaConf.to_yaml(config.model.spec_augment))"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "OLUbRvoDDedd"
},
+ "outputs": [],
"source": [
"config.model.spec_augment.freq_masks = 0\n",
"config.model.spec_augment.time_masks = 0"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "markdown",
@@ -645,40 +660,40 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "Unv8-GvOWhad"
},
+ "outputs": [],
"source": [
"print(OmegaConf.to_yaml(config.model.joint))"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "VsunP99saF5c"
},
+ "outputs": [],
"source": [
"# Two lines to enable the fused batch step\n",
"config.model.joint.fuse_loss_wer = True\n",
"config.model.joint.fused_batch_size = 16 # this can be any value (preferably less than model.*_ds.batch_size)"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "yT1vJH9OkS0u"
},
+ "outputs": [],
"source": [
"# We will also reduce the hidden dimension of the joint and the prediction networks to preserve some memory\n",
"config.model.model_defaults.pred_hidden = 64\n",
"config.model.model_defaults.joint_hidden = 64"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "markdown",
@@ -693,15 +708,15 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "MywZQ9ADZpDW"
},
+ "outputs": [],
"source": [
"# Use just 128 filters across the model to speed up training and reduce parameter count\n",
"config.model.model_defaults.filters = 128"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "markdown",
@@ -727,9 +742,11 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "Fmf0iSY-a6LC"
},
+ "outputs": [],
"source": [
"import torch\n",
"from pytorch_lightning import Trainer\n",
@@ -745,44 +762,42 @@
"trainer = Trainer(devices=1, accelerator=accelerator, max_epochs=EPOCHS,\n",
" enable_checkpointing=False, logger=False,\n",
" log_every_n_steps=5, check_val_every_n_epoch=10)"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "jqVt4TEncEqv"
},
+ "outputs": [],
"source": [
"# Import the Transducer Model\n",
"import nemo.collections.asr as nemo_asr"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "RheLsmA1cRz0"
},
+ "outputs": [],
"source": [
"# Build the model\n",
"model = nemo_asr.models.EncDecRNNTBPEModel(cfg=config.model, trainer=trainer)"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "9eqYnTwqnRtI"
},
+ "outputs": [],
"source": [
"model.summarize();"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "markdown",
@@ -814,15 +829,15 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "OAjMrbK-dtIv"
},
+ "outputs": [],
"source": [
"# Load a small CTC model\n",
"# ctc_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(\"stt_en_citrinet_256\", map_location='cpu')"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "markdown",
@@ -837,18 +852,18 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "Ugz8Y8eieLMK"
},
+ "outputs": [],
"source": [
"# <<< NOTE: This is only for demonstration ! >>>\n",
"# Below cell will fail because the two model's have incompatible kernel sizes in their Conv layers.\n",
"\n",
"# <<< NOTE: Below cell is only shown to illustrate the method >>>\n",
"# model.encoder.load_state_dict(ctc_model.encoder.state_dict(), strict=True)"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
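+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# A hedged sketch of partial initialization (not part of the original tutorial):\n",
+ "# copy only encoder parameters whose names and shapes match, skipping the rest.\n",
+ "# Commented out, like the cells above, since ctc_model is not loaded by default.\n",
+ "\n",
+ "# ctc_sd = ctc_model.encoder.state_dict()\n",
+ "# own_sd = model.encoder.state_dict()\n",
+ "# own_sd.update({k: v for k, v in ctc_sd.items() if k in own_sd and own_sd[k].shape == v.shape})\n",
+ "# model.encoder.load_state_dict(own_sd, strict=True)"
+ ]
+ },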
{
"cell_type": "markdown",
@@ -863,9 +878,11 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "OS_U6uO-fnW9"
},
+ "outputs": [],
"source": [
"# Prepare NeMo's Experiment manager to handle checkpoint saving and logging for us\n",
"from nemo.utils import exp_manager\n",
@@ -888,15 +905,15 @@
"exp_config = OmegaConf.structured(exp_config)\n",
"\n",
"logdir = exp_manager.exp_manager(trainer, exp_config)"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "SDleCL0Zf7lU"
},
+ "outputs": [],
"source": [
"try:\n",
" from google import colab\n",
@@ -910,15 +927,15 @@
" %tensorboard --logdir /content/experiments/Transducer-Model/\n",
"else:\n",
" print(\"To use TensorBoard, please use this notebook in a Google Colab environment.\")"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "hzqAsK23uYHG"
},
+ "outputs": [],
"source": [
"# Release resources prior to training\n",
"import gc\n",
@@ -926,24 +943,22 @@
"\n",
"if accelerator == 'gpu':\n",
" torch.cuda.empty_cache()"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"colab": {
"background_save": true
},
"id": "A4neHTnSgaDb"
},
+ "outputs": [],
"source": [
"# Train the model\n",
"trainer.fit(model)"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "markdown",
@@ -958,17 +973,17 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"colab": {
"background_save": true
},
"id": "iEjuXo4BNyOi"
},
+ "outputs": [],
"source": [
"trainer.test(model)"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "markdown",
@@ -998,62 +1013,62 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"colab": {
"background_save": true
},
"id": "RJtrJKt0gY0i"
},
+ "outputs": [],
"source": [
"import copy\n",
"\n",
"decoding_config = copy.deepcopy(config.model.decoding)\n",
"print(OmegaConf.to_yaml(decoding_config))"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"colab": {
"background_save": true
},
"id": "MNL5L1KthC1E"
},
+ "outputs": [],
"source": [
"# Update the config for the decoding strategy\n",
"decoding_config.strategy = \"alsd\" # Options are `greedy`, `greedy_batch`, `beam`, `tsd` and `alsd`\n",
"decoding_config.beam.beam_size = 4 # Increase beam size for better scores, but it will take much longer for transcription !"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"colab": {
"background_save": true
},
"id": "xQgQRDnlhC7M"
},
+ "outputs": [],
"source": [
"# Finally update the model's decoding strategy !\n",
"model.change_decoding_strategy(decoding_config)"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "h7jrCMJvh8vE"
},
+ "outputs": [],
"source": [
"trainer.test(model)"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
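+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# A minimal sketch (assumption: you want to compare several decoding strategies):\n",
+ "# loop over strategies, apply each one, and re-evaluate on the test set.\n",
+ "\n",
+ "# for strategy in ['greedy_batch', 'alsd']:\n",
+ "#     decoding_config.strategy = strategy\n",
+ "#     model.change_decoding_strategy(decoding_config)\n",
+ "#     trainer.test(model)"
+ ]
+ },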
{
"cell_type": "markdown",
@@ -1096,12 +1111,14 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"colab": {
"background_save": true
},
"id": "lgpsXSEeQAp4"
},
+ "outputs": [],
"source": [
"decoding_config.strategy = \"greedy_batch\"\n",
"\n",
@@ -1112,9 +1129,7 @@
" decoding_config.fused_batch_size = -1 # temporarily stop fused batch during inference.\n",
"\n",
"model.change_decoding_strategy(decoding_config)"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "markdown",
@@ -1129,30 +1144,32 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"colab": {
"background_save": true
},
"id": "vrhldu5WPkvg"
},
+ "outputs": [],
"source": [
"test_dl = model.test_dataloader()\n",
"test_dl = iter(test_dl)\n",
"batch = next(test_dl)\n",
"\n",
"device = torch.device('cuda' if accelerator == 'gpu' else 'cpu')"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"colab": {
"background_save": true
},
"id": "fAagUT_DPQhF"
},
+ "outputs": [],
"source": [
"def rnnt_alignments(model, batch):\n",
" model = model.to(device)\n",
@@ -1170,24 +1187,22 @@
" # 1) best hypothesis \n",
" # 2) Sorted list of hypothesis (if using beam search); None otherwise\n",
" return current_hypotheses"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"colab": {
"background_save": true
},
"id": "OuSrv8lZPhwY"
},
+ "outputs": [],
"source": [
"# Get a batch of hypotheses, as well as a batch of all obtained hypotheses (if beam search is used)\n",
"hypotheses, all_hypotheses = rnnt_alignments(model, batch)"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "markdown",
@@ -1202,44 +1217,44 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"colab": {
"background_save": true
},
"id": "arE7af_DPhyy"
},
+ "outputs": [],
"source": [
"# Select the sample ID from within the batch\n",
"SAMPLE_ID = 0"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "S6YcE27WPh1G"
},
+ "outputs": [],
"source": [
"# Obtain the hypothesis for this sample, as well as some ground truth information about this sample\n",
"hypothesis = hypotheses[SAMPLE_ID]\n",
"original_sample_len = batch[1][SAMPLE_ID]\n",
"ground_truth = batch[2][SAMPLE_ID]"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "oEg_73h9Qe8t"
},
+ "outputs": [],
"source": [
"# The Hypothesis object contains a lot of useful information regarding the decoding step.\n",
"print(hypothesis)"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "markdown",
@@ -1254,9 +1269,11 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "ALuefA4XPh5O"
},
+ "outputs": [],
"source": [
"decoded_text = hypothesis.text\n",
"decoded_hypothesis = model.decoding.decode_ids_to_tokens(hypothesis.y_sequence.cpu().numpy().tolist())\n",
@@ -1265,9 +1282,7 @@
"print(\"Decoded ground truth :\", decoded_ground_truth)\n",
"print(\"Decoded hypothesis :\", decoded_text)\n",
"print(\"Decoded hyp tokens :\", decoded_hypothesis)\n"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "markdown",
@@ -1282,18 +1297,18 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "vl1BF52iSjEq"
},
+ "outputs": [],
"source": [
"alignments = hypothesis.alignments\n",
"\n",
"# These two values should normally always match\n",
"print(\"Length of alignments (T): \", len(alignments))\n",
"print(\"Length of padded acoustic model after striding : \", int(hypothesis.length))"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "markdown",
@@ -1312,9 +1327,11 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "Xt5nDL55SdRL"
},
+ "outputs": [],
"source": [
"# Compute the alignment grid\n",
"for ti in range(len(alignments)):\n",
@@ -1326,9 +1343,7 @@
" t_u.append(decoded_token)\n",
" \n",
" print(f\"Tokens at timestep {ti} = {t_u}\")"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "markdown",
@@ -1377,5 +1392,25 @@
"You will find that following many of the steps from CTC models, and simply modifying the config to include the Transducer components, we can train character or sub-word based transducer models with the same flexibility as we can train CTC models !"
]
}
- ]
-}
\ No newline at end of file
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "collapsed_sections": [
+ "V5sMoFHmVvhg"
+ ],
+ "name": "ASR-with-Transducers.ipynb",
+ "provenance": [],
+ "toc_visible": true
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}