diff --git a/tutorials/asr/ASR_with_Transducers.ipynb b/tutorials/asr/ASR_with_Transducers.ipynb index fbaa2ff9a725..e1339cbbba4a 100644 --- a/tutorials/asr/ASR_with_Transducers.ipynb +++ b/tutorials/asr/ASR_with_Transducers.ipynb @@ -1,30 +1,12 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "accelerator": "GPU", - "colab": { - "name": "ASR-with-Transducers.ipynb", - "provenance": [], - "collapsed_sections": [ - "V5sMoFHmVvhg" - ], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, "cells": [ { "cell_type": "code", + "execution_count": null, "metadata": { "id": "SUOXg71A3w78" }, + "outputs": [], "source": [ "\"\"\"\n", "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", @@ -52,15 +34,15 @@ "## Grab the config we'll use in this example\n", "!mkdir configs\n", "!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/conf/contextnet_rnnt/contextnet_rnnt.yaml\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "uj-UnhKk47oW" }, + "outputs": [], "source": [ "# In a conda environment, you would use the following command\n", "# Update Numba to > 0.53\n", @@ -71,9 +53,7 @@ "# For pip based environments,\n", "# Update Numba to > 0.53\n", "!pip install --upgrade numba" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -109,9 +89,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "pu4n4GkjAo9i" }, + "outputs": [], "source": [ "import os\n", "\n", @@ -120,9 +102,7 @@ "\n", "if not os.path.exists(\"scripts/process_an4_data.py\"):\n", " !wget -P scripts/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/dataset_processing/process_an4_data.py" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -137,9 +117,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "yV1rJcH6ApDo" }, + "outputs": [], "source": [ "import wget\n", "import tarfile \n", @@ -174,32 +156,65 @@ " cmd = [\"sox\", sph_path, wav_path]\n", " subprocess.run(cmd)\n", "\n", - "print(\"Finished conversion.\\n******\")\n", - "\n", - "if os.path.exists(f\"{data_dir}/an4\"):\n", - " print(\"Preparing AN4 dataset ...\")\n", - "\n", - " an4_path = f\"{data_dir}/\"\n", - " !python scripts/process_an4_data.py \\\n", - " --data_root=$an4_path\n", - "\n", - "print(\"AN4 prepared !\")" - ], - "execution_count": null, - "outputs": [] + "print(\"Finished conversion.\\n******\")" ] }, { "cell_type": "code", - "metadata": { - "id": "J4cw98QfI9-p" - }, - "source": [ - "# Manifest filepaths\n", - "TRAIN_MANIFEST = os.path.join(data_dir, \"an4\", \"train_manifest.json\")\n", - "TEST_MANIFEST = os.path.join(data_dir, \"an4\", \"test_manifest.json\")" - ], "execution_count": null, - "outputs": [] + "metadata": {}, + "outputs": [], + "source": [ + "# --- Building Manifest Files --- #\n", + "import json\n", + "import librosa\n", + "\n", + "# Function to build a manifest\n", + "def build_manifest(transcripts_path, manifest_path, wav_path):\n", + " with open(transcripts_path, 'r') as fin:\n", + " with open(manifest_path, 'w') as fout:\n", + " for line in fin:\n", + " # Lines look like this:\n", + " # <s> transcript </s> (fileID)\n", + " transcript = line[: line.find('(')-1].lower()\n", + " transcript = transcript.replace('<s>', '').replace('</s>', '')\n", + " transcript = 
transcript.strip()\n", + "\n", + " file_id = line[line.find('(')+1 : -2] # e.g. \"cen4-fash-b\"\n", + " audio_path = os.path.join(\n", + " data_dir, wav_path,\n", + " file_id[file_id.find('-')+1 : file_id.rfind('-')],\n", + " file_id + '.wav')\n", + "\n", + " duration = librosa.core.get_duration(filename=audio_path)\n", + "\n", + " # Write the metadata to the manifest\n", + " metadata = {\n", + " \"audio_filepath\": audio_path,\n", + " \"duration\": duration,\n", + " \"text\": transcript\n", + " }\n", + " json.dump(metadata, fout)\n", + " fout.write('\\n')\n", + "\n", + "# Building Manifests\n", + "print(\"******\")\n", + "train_transcripts = os.path.join(data_dir, 'an4/etc/an4_train.transcription')\n", + "train_manifest = os.path.join(data_dir, 'an4/train_manifest.json')\n", + "if not os.path.isfile(train_manifest):\n", + " build_manifest(train_transcripts, train_manifest, 'an4/wav/an4_clstk')\n", + " print(\"Training manifest created.\")\n", + "\n", + "test_transcripts = os.path.join(data_dir, 'an4/etc/an4_test.transcription')\n", + "test_manifest = os.path.join(data_dir, 'an4/test_manifest.json')\n", + "if not os.path.isfile(test_manifest):\n", + " build_manifest(test_transcripts, test_manifest, 'an4/wav/an4test_clstk')\n", + " print(\"Test manifest created.\")\n", + "print(\"***Done***\")\n", + "# Manifest filepaths\n", + "TRAIN_MANIFEST = train_manifest\n", + "TEST_MANIFEST = test_manifest" + ] }, { "cell_type": "markdown", @@ -214,15 +229,15 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "i2hD4LkoJvrx" }, + "outputs": [], "source": [ "if not os.path.exists(\"scripts/process_asr_text_tokenizer.py\"):\n", " !wget -P scripts/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tokenizers/process_asr_text_tokenizer.py" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -237,9 +252,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "i6Jzpt6UJvuI" }, + "outputs": [], "source": [ "VOCAB_SIZE = 32 # can be any value above 29\n", "TOKENIZER_TYPE = \"spe\" # can be wpe or spe\n", @@ -259,15 +276,15 @@ " --no_lower_case \\\n", " --log \\\n", " --vocab_size=$VOCAB_SIZE" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "JHDZswN6LIBJ" }, + "outputs": [], "source": [ "# Tokenizer path\n", "if TOKENIZER_TYPE == 'spe':\n", @@ -276,9 +293,7 @@ "else:\n", " TOKENIZER = os.path.join(\"tokenizers\", f\"tokenizer_wpe_v{VOCAB_SIZE}\")\n", " TOKENIZER_TYPE_CFG = \"wpe\"" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -310,16 +325,16 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "6a_vedo0Lyo8" }, + "outputs": [], "source": [ "from omegaconf import OmegaConf, open_dict\n", "\n", - "config = OmegaConf.load(\"configs/contextnet_rnnt.yaml\")" - ], - "execution_count": null, - "outputs": [] + "config = OmegaConf.load(\"configs/contextnet_rnnt.yaml\")" + ] }, { "cell_type": "markdown", @@ -336,15 +351,15 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "B9nY5JQaIhKz" }, + "outputs": [], "source": [ "config.model.encoder.jasper = config.model.encoder.jasper[:5]\n", "config.model.encoder.jasper[-1].filters = '${model.model_defaults.enc_hidden}'" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -359,15 +374,15 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { 
"id": "mJbqMEkjMTfM" }, + "outputs": [], "source": [ "# print out the train and validation configs to know what needs to be changed\n", "print(OmegaConf.to_yaml(config.model.train_ds))" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -382,16 +397,16 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "5QNYNItcMeOC" }, + "outputs": [], "source": [ "config.model.train_ds.manifest_filepath = TRAIN_MANIFEST\n", "config.model.validation_ds.manifest_filepath = TEST_MANIFEST\n", "config.model.test_ds.manifest_filepath = TEST_MANIFEST" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -406,26 +421,26 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "CNlRNlVCNAjr" }, + "outputs": [], "source": [ "print(OmegaConf.to_yaml(config.model.tokenizer))" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "MzmCNJZ7NAng" }, + "outputs": [], "source": [ "config.model.tokenizer.dir = TOKENIZER\n", "config.model.tokenizer.type = TOKENIZER_TYPE_CFG" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -439,27 +454,27 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "cUkqv_bQraQe" }, + "outputs": [], "source": [ "print(OmegaConf.to_yaml(config.model.optim))" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "mEyK1fqEiyxo" }, + "outputs": [], "source": [ "# Finally, let's remove logging of samples and the warmup since the dataset is small (similar to CTC models)\n", "config.model.log_prediction = False\n", "config.model.optim.sched.warmup_steps = None" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -474,26 +489,26 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "i4udFKMwDbRm" }, + "outputs": [], "source": [ "print(OmegaConf.to_yaml(config.model.spec_augment))" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "OLUbRvoDDedd" }, + "outputs": [], "source": [ "config.model.spec_augment.freq_masks = 0\n", "config.model.spec_augment.time_masks = 0" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -645,40 +660,40 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "Unv8-GvOWhad" }, + "outputs": [], "source": [ "print(OmegaConf.to_yaml(config.model.joint))" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "VsunP99saF5c" }, + "outputs": [], "source": [ "# Two lines to enable the fused batch step\n", "config.model.joint.fuse_loss_wer = True\n", "config.model.joint.fused_batch_size = 16 # this can be any value (preferably less than model.*_ds.batch_size)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "yT1vJH9OkS0u" }, + "outputs": [], "source": [ "# We will also reduce the hidden dimension of the joint and the prediction networks to preserve some memory\n", "config.model.model_defaults.pred_hidden = 64\n", "config.model.model_defaults.joint_hidden = 64" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -693,15 +708,15 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "MywZQ9ADZpDW" }, + 
"outputs": [], "source": [ "# Use just 128 filters across the model to speed up training and reduce parameter count\n", "config.model.model_defaults.filters = 128" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -727,9 +742,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "Fmf0iSY-a6LC" }, + "outputs": [], "source": [ "import torch\n", "from pytorch_lightning import Trainer\n", @@ -745,44 +762,42 @@ "trainer = Trainer(devices=1, accelerator=accelerator, max_epochs=EPOCHS,\n", " enable_checkpointing=False, logger=False,\n", " log_every_n_steps=5, check_val_every_n_epoch=10)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "jqVt4TEncEqv" }, + "outputs": [], "source": [ "# Import the Transducer Model\n", "import nemo.collections.asr as nemo_asr" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "RheLsmA1cRz0" }, + "outputs": [], "source": [ "# Build the model\n", "model = nemo_asr.models.EncDecRNNTBPEModel(cfg=config.model, trainer=trainer)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "9eqYnTwqnRtI" }, + "outputs": [], "source": [ "model.summarize();" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -814,15 +829,15 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "OAjMrbK-dtIv" }, + "outputs": [], "source": [ "# Load a small CTC model\n", "# ctc_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(\"stt_en_citrinet_256\", map_location='cpu')" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -837,18 +852,18 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "Ugz8Y8eieLMK" }, + "outputs": [], "source": [ "# <<< NOTE: This is only for demonstration ! 
>>>\n", "# Below cell will fail because the two models have incompatible kernel sizes in their Conv layers.\n", "\n", "# <<< NOTE: Below cell is only shown to illustrate the method >>>\n", "# model.encoder.load_state_dict(ctc_model.encoder.state_dict(), strict=True)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -863,9 +878,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "OS_U6uO-fnW9" }, + "outputs": [], "source": [ "# Prepare NeMo's Experiment manager to handle checkpoint saving and logging for us\n", "from nemo.utils import exp_manager\n", @@ -888,15 +905,15 @@ "exp_config = OmegaConf.structured(exp_config)\n", "\n", "logdir = exp_manager.exp_manager(trainer, exp_config)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "SDleCL0Zf7lU" }, + "outputs": [], "source": [ "try:\n", " from google import colab\n", @@ -910,15 +927,15 @@ " %tensorboard --logdir /content/experiments/Transducer-Model/\n", "else:\n", " print(\"To use TensorBoard, please use this notebook in a Google Colab environment.\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "hzqAsK23uYHG" }, + "outputs": [], "source": [ "# Release resources prior to training\n", "import gc\n", @@ -926,24 +943,22 @@ "\n", "if accelerator == 'gpu':\n", " torch.cuda.empty_cache()" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "background_save": true }, "id": "A4neHTnSgaDb" }, + "outputs": [], "source": [ "# Train the model\n", "trainer.fit(model)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -958,17 +973,17 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "background_save": true }, "id": "iEjuXo4BNyOi" }, + "outputs": [], "source": [ "trainer.test(model)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -998,62 +1013,62 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "background_save": true }, "id": "RJtrJKt0gY0i" }, + "outputs": [], "source": [ "import copy\n", "\n", "decoding_config = copy.deepcopy(config.model.decoding)\n", "print(OmegaConf.to_yaml(decoding_config))" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "background_save": true }, "id": "MNL5L1KthC1E" }, + "outputs": [], "source": [ "# Update the config for the decoding strategy\n", "decoding_config.strategy = \"alsd\" # Options are `greedy`, `greedy_batch`, `beam`, `tsd` and `alsd`\n", "decoding_config.beam.beam_size = 4 # Increase beam size for better scores, but it will take much longer for transcription !"
- ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "background_save": true }, "id": "xQgQRDnlhC7M" }, + "outputs": [], "source": [ "# Finally update the model's decoding strategy !\n", "model.change_decoding_strategy(decoding_config)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "h7jrCMJvh8vE" }, + "outputs": [], "source": [ "trainer.test(model)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1096,12 +1111,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "background_save": true }, "id": "lgpsXSEeQAp4" }, + "outputs": [], "source": [ "decoding_config.strategy = \"greedy_batch\"\n", "\n", @@ -1112,9 +1129,7 @@ " decoding_config.fused_batch_size = -1 # temporarily stop fused batch during inference.\n", "\n", "model.change_decoding_strategy(decoding_config)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1129,30 +1144,32 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "background_save": true }, "id": "vrhldu5WPkvg" }, + "outputs": [], "source": [ "test_dl = model.test_dataloader()\n", "test_dl = iter(test_dl)\n", "batch = next(test_dl)\n", "\n", "device = torch.device('cuda' if accelerator == 'gpu' else 'cpu')" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "background_save": true }, "id": "fAagUT_DPQhF" }, + "outputs": [], "source": [ "def rnnt_alignments(model, batch):\n", " model = model.to(device)\n", @@ -1170,24 +1187,22 @@ " # 1) best hypothesis \n", " # 2) Sorted list of hypotheses (if using beam search); None otherwise\n", " return current_hypotheses" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "background_save": true }, "id": "OuSrv8lZPhwY" }, + "outputs": [], "source": [ "# Get a batch of hypotheses, as well as a batch of all obtained hypotheses (if beam search is used)\n", "hypotheses, all_hypotheses = rnnt_alignments(model, batch)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1202,44 +1217,44 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "colab": { "background_save": true }, "id": "arE7af_DPhyy" }, + "outputs": [], "source": [ "# Select the sample ID from within the batch\n", "SAMPLE_ID = 0" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "S6YcE27WPh1G" }, + "outputs": [], "source": [ "# Obtain the hypothesis for this sample, as well as some ground truth information about this sample\n", "hypothesis = hypotheses[SAMPLE_ID]\n", "original_sample_len = batch[1][SAMPLE_ID]\n", "ground_truth = batch[2][SAMPLE_ID]" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "oEg_73h9Qe8t" }, + "outputs": [], "source": [ "# The Hypothesis object contains a lot of useful information regarding the decoding step.\n", "print(hypothesis)" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1254,9 +1269,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "ALuefA4XPh5O" }, + "outputs": [], "source": [ "decoded_text = hypothesis.text\n", "decoded_hypothesis = 
model.decoding.decode_ids_to_tokens(hypothesis.y_sequence.cpu().numpy().tolist())\n", @@ -1265,9 +1282,7 @@ "print(\"Decoded ground truth :\", decoded_ground_truth)\n", "print(\"Decoded hypothesis :\", decoded_text)\n", "print(\"Decoded hyp tokens :\", decoded_hypothesis)\n" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1282,18 +1297,18 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "vl1BF52iSjEq" }, + "outputs": [], "source": [ "alignments = hypothesis.alignments\n", "\n", "# These two values should normally always match\n", "print(\"Length of alignments (T): \", len(alignments))\n", "print(\"Length of padded acoustic model after striding : \", int(hypothesis.length))" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1312,9 +1327,11 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "id": "Xt5nDL55SdRL" }, + "outputs": [], "source": [ "# Compute the alignment grid\n", "for ti in range(len(alignments)):\n", @@ -1326,9 +1343,7 @@ " t_u.append(decoded_token)\n", " \n", " print(f\"Tokens at timestep {ti} = {t_u}\")" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -1377,5 +1392,25 @@ "You will find that following many of the steps from CTC models, and simply modifying the config to include the Transducer components, we can train character or sub-word based transducer models with the same flexibility as we can train CTC models !" ] } - ] -} \ No newline at end of file + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [ + "V5sMoFHmVvhg" + ], + "name": "ASR-with-Transducers.ipynb", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}
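A quick sanity check for the manifest-building cell in the patch above: each manifest line is a standalone JSON object with `audio_filepath`, `duration`, and `text` keys, so a short validation pass can catch path or formatting mistakes before training starts. This is a minimal sketch rather than part of the tutorial; the helper name `validate_manifest` is illustrative, and it assumes the `train_manifest` / `test_manifest` variables defined in that cell.

import json
import os

def validate_manifest(manifest_path):
    # Illustrative helper: verify every line parses as JSON and points at a real WAV file.
    num_entries, total_duration = 0, 0.0
    with open(manifest_path, 'r') as f:
        for line in f:
            entry = json.loads(line)
            assert os.path.isfile(entry["audio_filepath"]), f"Missing audio: {entry['audio_filepath']}"
            assert entry["duration"] > 0 and isinstance(entry["text"], str)
            num_entries += 1
            total_duration += entry["duration"]
    print(f"{manifest_path}: {num_entries} entries, {total_duration / 3600.0:.2f} hours")

validate_manifest(train_manifest)
validate_manifest(test_manifest)

Because every line is an independent JSON object (JSON Lines), manifests can be read and checked line by line without loading the whole file at once.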